/*****************************************************************************

Copyright (c) 1994, 2021, Oracle and/or its affiliates.
Copyright (c) 2008, Google Inc.
Copyright (c) 2012, Facebook Inc.

Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
briefly in the InnoDB documentation. The contributions by Google are
incorporated with their permission, and subject to the conditions contained in
the file COPYING.Google.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2.0,
as published by the Free Software Foundation.

This program is also distributed with certain software (including
but not limited to OpenSSL) that is licensed under separate terms,
as designated in a particular file or component or in included license
documentation. The authors of MySQL hereby grant you an additional
permission to link the program and your derivative works with the
separately licensed software that they have included with MySQL.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License, version 2.0, for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA

*****************************************************************************/

/**************************************************//**
@file btr/btr0cur.cc
The index tree cursor

All changes that row operations make to a B-tree or the records
there must go through this module! Undo log records are written here
for every modify or insert of a clustered index record.

NOTE!!!
To make sure we do not run out of disk space during a pessimistic
insert or update, we have to reserve 2 x the height of the index tree
many pages in the tablespace before we start the operation, because
if leaf splitting has been started, it is difficult to undo, except
by crashing the database and doing a roll-forward.

Created 10/16/1994 Heikki Tuuri
*******************************************************/

#include "btr0cur.h"

#ifdef UNIV_NONINL
#include "btr0cur.ic"
#endif

#include "row0upd.h"
#ifndef UNIV_HOTBACKUP
#include "mtr0log.h"
#include "page0page.h"
#include "page0zip.h"
#include "rem0rec.h"
#include "rem0cmp.h"
#include "buf0lru.h"
#include "btr0btr.h"
#include "btr0sea.h"
#include "row0log.h"
#include "row0purge.h"
#include "row0upd.h"
#include "trx0rec.h"
#include "trx0roll.h"
#include "que0que.h"
#include "row0row.h"
#include "srv0srv.h"
#include "ibuf0ibuf.h"
#include "lock0lock.h"
#include "zlib.h"
#include "srv0start.h"

/** Buffered B-tree operation types, introduced as part of delete buffering. */
enum btr_op_t {
	BTR_NO_OP = 0,			/*!< Not buffered */
	BTR_INSERT_OP,			/*!< Insert, do not ignore UNIQUE */
	BTR_INSERT_IGNORE_UNIQUE_OP,	/*!< Insert, ignoring UNIQUE */
	BTR_DELETE_OP,			/*!< Purge a delete-marked record */
	BTR_DELMARK_OP			/*!< Mark a record for deletion */
};

/** Modification types for the B-tree operation. */
enum btr_intention_t {
	BTR_INTENTION_DELETE,
	BTR_INTENTION_BOTH,
	BTR_INTENTION_INSERT
};
#if BTR_INTENTION_DELETE > BTR_INTENTION_BOTH
#error "BTR_INTENTION_DELETE > BTR_INTENTION_BOTH"
#endif
#if BTR_INTENTION_BOTH > BTR_INTENTION_INSERT
#error "BTR_INTENTION_BOTH > BTR_INTENTION_INSERT"
#endif
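
/* The compile-time checks above pin down the enum order, because the code
below depends on it: btr_cur_will_modify_tree() treats
(lock_intention <= BTR_INTENTION_BOTH) as "a delete may happen" and
(lock_intention >= BTR_INTENTION_BOTH) as "an insert may happen". */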

/** For the index->lock scalability improvement, the only clear performance
regression observed was caused by the history list growing huge. That is
because the previous exclusive use of index->lock also had the side effect
of reserving free blocks and read IO bandwidth for purge with priority. To
keep the history list from growing much larger than with the previous
implementation, pessimistic tree operations done by purge are prioritized,
as before, when the list appears to be growing huge.

Experimentally, the history list length starts to clearly affect performance
throughput from about 100000. */
#define BTR_CUR_FINE_HISTORY_LENGTH	100000

/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
ulint	btr_cur_n_non_sea	= 0;
/** Number of successful adaptive hash index lookups in
btr_cur_search_to_nth_level(). */
ulint	btr_cur_n_sea		= 0;
/** Old value of btr_cur_n_non_sea. Copied by
srv_refresh_innodb_monitor_stats(). Referenced by
srv_printf_innodb_monitor(). */
ulint	btr_cur_n_non_sea_old	= 0;
/** Old value of btr_cur_n_sea. Copied by
srv_refresh_innodb_monitor_stats(). Referenced by
srv_printf_innodb_monitor(). */
ulint	btr_cur_n_sea_old	= 0;

#ifdef UNIV_DEBUG
/* Flag to limit optimistic insert records */
uint	btr_cur_limit_optimistic_insert_debug = 0;
#endif /* UNIV_DEBUG */

/** In the optimistic insert, if the insert does not fit, but this much space
can be released by page reorganize, then it is reorganized */
#define BTR_CUR_PAGE_REORGANIZE_LIMIT	(UNIV_PAGE_SIZE / 32)
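
/* For illustration: with the default 16 KiB UNIV_PAGE_SIZE this limit works
out to 16384 / 32 = 512 bytes, i.e. a reorganize is only attempted when it
could free at least that much space. The figure scales with the configured
page size. */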

/** The structure of a BLOB part header */
/* @{ */
/*--------------------------------------*/
#define BTR_BLOB_HDR_PART_LEN		0	/*!< BLOB part len on this
						page */
#define BTR_BLOB_HDR_NEXT_PAGE_NO	4	/*!< next BLOB part page no,
						FIL_NULL if none */
/*--------------------------------------*/
#define BTR_BLOB_HDR_SIZE		8	/*!< Size of a BLOB
						part header, in bytes */

/** Estimates table-level stats from a sampled value.
@param value sampled stats
@param index index being sampled
@param sample number of sampled rows
@param ext_size external stored data size
@param not_empty table not empty
@return estimated table-wide stats from the sampled value */
#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty) \
	(((value) * static_cast<int64_t>(index->stat_n_leaf_pages) \
	  + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))
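
/* A rough worked example (illustrative numbers only): with value = 250
gathered from sample = 10 leaf pages of an index whose
stat_n_leaf_pages = 1000, and ext_size = 0, not_empty = 1, the macro yields
(250 * 1000 + 10 - 1 + 0 + 1) / 10 = 25001: the sampled value scaled up by
the leaf page ratio, with the additive terms rounding the estimate up. */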

/* @} */
#endif /* !UNIV_HOTBACKUP */

#ifndef UNIV_HOTBACKUP
/*******************************************************************//**
Marks all extern fields in a record as owned by the record. This function
should be called if the delete mark of a record is removed: a record that is
not delete-marked always owns all its extern fields. */
static
void
btr_cur_unmark_extern_fields(
/*=========================*/
	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
				part will be updated, or NULL */
	rec_t*		rec,	/*!< in/out: record in a clustered index */
	dict_index_t*	index,	/*!< in: index of the page */
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	mtr_t*		mtr);	/*!< in: mtr, or NULL if not logged */
/*******************************************************************//**
Adds path information to the cursor for the current page, for which
the binary search has been performed. */
static
void
btr_cur_add_path_info(
/*==================*/
	btr_cur_t*	cursor,		/*!< in: cursor positioned on a page */
	ulint		height,		/*!< in: height of the page in tree;
					0 means leaf node */
	ulint		root_height);	/*!< in: root node height in tree */
/***********************************************************//**
Frees the externally stored fields for a record, if the field is mentioned
in the update vector. */
static
void
btr_rec_free_updated_extern_fields(
/*===============================*/
	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
				X-latched */
	rec_t*		rec,	/*!< in: record */
	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
				part will be updated, or NULL */
	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
	const upd_t*	update,	/*!< in: update vector */
	bool		rollback,/*!< in: performing rollback? */
	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the tree */
/***********************************************************//**
Frees the externally stored fields for a record. */
static
void
btr_rec_free_externally_stored_fields(
/*==================================*/
	dict_index_t*	index,	/*!< in: index of the data, the index
				tree MUST be X-latched */
	rec_t*		rec,	/*!< in: record */
	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
				part will be updated, or NULL */
	bool		rollback,/*!< in: performing rollback? */
	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the index
				tree */
#endif /* !UNIV_HOTBACKUP */

#ifndef UNIV_HOTBACKUP
/*==================== B-TREE SEARCH =========================*/

#if MTR_MEMO_PAGE_S_FIX != RW_S_LATCH
#error "MTR_MEMO_PAGE_S_FIX != RW_S_LATCH"
#endif
#if MTR_MEMO_PAGE_X_FIX != RW_X_LATCH
#error "MTR_MEMO_PAGE_X_FIX != RW_X_LATCH"
#endif
#if MTR_MEMO_PAGE_SX_FIX != RW_SX_LATCH
#error "MTR_MEMO_PAGE_SX_FIX != RW_SX_LATCH"
#endif
/** Latches the leaf page or pages requested.
@param[in]	block		leaf page where the search converged
@param[in]	page_id		page id of the leaf
@param[in]	page_size	page size of the leaf
@param[in]	latch_mode	BTR_SEARCH_LEAF, ...
@param[in]	cursor		cursor
@param[in]	mtr		mini-transaction
@return blocks and savepoints which were actually latched. */
btr_latch_leaves_t
btr_cur_latch_leaves(
	buf_block_t*		block,
	const page_id_t&	page_id,
	const page_size_t&	page_size,
	ulint			latch_mode,
	btr_cur_t*		cursor,
	mtr_t*			mtr)
{
	ulint		mode;
	ulint		left_page_no;
	ulint		right_page_no;
	buf_block_t*	get_block;
	page_t*		page = buf_block_get_frame(block);
	bool		spatial;
	btr_latch_leaves_t	latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}};

	spatial = dict_index_is_spatial(cursor->index) && cursor->rtr_info;
	ut_ad(buf_page_in_file(&block->page));

	switch (latch_mode) {
	case BTR_SEARCH_LEAF:
	case BTR_MODIFY_LEAF:
	case BTR_SEARCH_TREE:
		if (spatial) {
			cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS]
				= mtr_set_savepoint(mtr);
		}

		mode = latch_mode == BTR_MODIFY_LEAF ? RW_X_LATCH : RW_S_LATCH;
		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
		get_block = btr_block_get(page_id, page_size, mode,
					  cursor->index, mtr);

		SRV_CORRUPT_TABLE_CHECK(get_block, return latch_leaves;);

		latch_leaves.blocks[1] = get_block;
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
#endif /* UNIV_BTR_DEBUG */
		if (spatial) {
			cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS]
				= get_block;
		}

		return(latch_leaves);
	case BTR_MODIFY_TREE:
		/* The tree latch is exclusive against other operations
		that call btr_page_set_prev() */
		ut_ad(mtr_memo_contains_flagged(mtr,
			dict_index_get_lock(cursor->index),
			MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)
		      || dict_table_is_intrinsic(cursor->index->table));
		/* x-latch also siblings from left to right */
		left_page_no = btr_page_get_prev(page, mtr);

		if (left_page_no != FIL_NULL) {

			if (spatial) {
				cursor->rtr_info->tree_savepoints[
					RTR_MAX_LEVELS] = mtr_set_savepoint(mtr);
			}

			latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
			get_block = btr_block_get(
				page_id_t(page_id.space(), left_page_no),
				page_size, RW_X_LATCH, cursor->index, mtr);

			SRV_CORRUPT_TABLE_CHECK(get_block, return latch_leaves;);

			latch_leaves.blocks[0] = get_block;

			if (spatial) {
				cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS]
					= get_block;
			}
		}

		if (spatial) {
			cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS + 1]
				= mtr_set_savepoint(mtr);
		}

		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
		get_block = btr_block_get(
			page_id, page_size, RW_X_LATCH, cursor->index, mtr);

		SRV_CORRUPT_TABLE_CHECK(get_block, return latch_leaves;);

		latch_leaves.blocks[1] = get_block;

#ifdef UNIV_BTR_DEBUG
		/* Sanity check only after both the blocks are latched. */
		if (latch_leaves.blocks[0] != NULL) {
			ut_a(page_is_comp(latch_leaves.blocks[0]->frame)
			     == page_is_comp(page));
			ut_a(btr_page_get_next(
				latch_leaves.blocks[0]->frame, mtr)
			     == page_get_page_no(page));
		}
		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
#endif /* UNIV_BTR_DEBUG */

		if (spatial) {
			cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS + 1]
				= get_block;
		}

		right_page_no = btr_page_get_next(page, mtr);

		if (right_page_no != FIL_NULL) {
			if (spatial) {
				cursor->rtr_info->tree_savepoints[
					RTR_MAX_LEVELS + 2] = mtr_set_savepoint(
								mtr);
			}
			latch_leaves.savepoints[2] = mtr_set_savepoint(mtr);
			get_block = btr_block_get(
				page_id_t(page_id.space(), right_page_no),
				page_size, RW_X_LATCH, cursor->index, mtr);

			SRV_CORRUPT_TABLE_CHECK(get_block, return latch_leaves;);

			latch_leaves.blocks[2] = get_block;
#ifdef UNIV_BTR_DEBUG
			ut_a(page_is_comp(get_block->frame)
			     == page_is_comp(page));
			ut_a(btr_page_get_prev(get_block->frame, mtr)
			     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */
			if (spatial) {
				cursor->rtr_info->tree_blocks[
					RTR_MAX_LEVELS + 2] = get_block;
			}
		}

		return(latch_leaves);

	case BTR_SEARCH_PREV:
	case BTR_MODIFY_PREV:
		mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH;
		/* latch also the left sibling */
		rw_lock_s_lock(&block->lock);
		left_page_no = btr_page_get_prev(page, mtr);
		rw_lock_s_unlock(&block->lock);

		if (left_page_no != FIL_NULL) {
			latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
			get_block = btr_block_get(
				page_id_t(page_id.space(), left_page_no),
				page_size, mode, cursor->index, mtr);
			latch_leaves.blocks[0] = get_block;
			cursor->left_block = get_block;

			SRV_CORRUPT_TABLE_CHECK(get_block, return latch_leaves;);
		}

		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
		get_block = btr_block_get(page_id, page_size, mode,
					  cursor->index, mtr);

		SRV_CORRUPT_TABLE_CHECK(get_block, return latch_leaves;);

		latch_leaves.blocks[1] = get_block;
#ifdef UNIV_BTR_DEBUG
		/* Sanity check only after both the blocks are latched. */
		if (latch_leaves.blocks[0] != NULL) {
			ut_a(page_is_comp(latch_leaves.blocks[0]->frame)
			     == page_is_comp(page));
			ut_a(btr_page_get_next(latch_leaves.blocks[0]->frame, mtr)
			     == page_get_page_no(page));
		}
		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
#endif /* UNIV_BTR_DEBUG */
		return(latch_leaves);
	case BTR_CONT_MODIFY_TREE:
		ut_ad(dict_index_is_spatial(cursor->index));
		return(latch_leaves);
	}

	ut_error;
	return(latch_leaves);
}
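
/* Note on the BTR_MODIFY_TREE case above: the left sibling, the page
itself, and the right sibling are x-latched in left-to-right order. Keeping
this order consistent across callers is what prevents latch-order deadlocks
between concurrent tree structure modifications. */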

/** Optimistically latches the leaf page or pages requested.
@param[in]	block		guessed buffer block
@param[in]	modify_clock	modify clock value
@param[in,out]	latch_mode	BTR_SEARCH_LEAF, ...
@param[in,out]	cursor		cursor
@param[in]	file		file name
@param[in]	line		line where called
@param[in]	mtr		mini-transaction
@return true if success */
bool
btr_cur_optimistic_latch_leaves(
	buf_block_t*	block,
	ib_uint64_t	modify_clock,
	ulint*		latch_mode,
	btr_cur_t*	cursor,
	const char*	file,
	ulint		line,
	mtr_t*		mtr)
{
	ulint		mode;
	ulint		left_page_no;

	ut_ad(block->page.buf_fix_count > 0);
	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);

	switch (*latch_mode) {
	case BTR_SEARCH_LEAF:
	case BTR_MODIFY_LEAF:
		return(buf_page_optimistic_get(*latch_mode, block,
					modify_clock, file, line, mtr));
	case BTR_SEARCH_PREV:
	case BTR_MODIFY_PREV:
		mode = *latch_mode == BTR_SEARCH_PREV
			? RW_S_LATCH : RW_X_LATCH;

		rw_lock_s_lock(&block->lock);
		if (block->modify_clock != modify_clock) {
			rw_lock_s_unlock(&block->lock);

			return(false);
		}
		left_page_no = btr_page_get_prev(
			buf_block_get_frame(block), mtr);
		rw_lock_s_unlock(&block->lock);

		if (left_page_no != FIL_NULL) {
			const page_id_t	page_id(
				dict_index_get_space(cursor->index),
				left_page_no);

			cursor->left_block = btr_block_get(
				page_id,
				dict_table_page_size(cursor->index->table),
				mode, cursor->index, mtr);
		} else {
			cursor->left_block = NULL;
		}

		if (buf_page_optimistic_get(mode, block, modify_clock,
					    file, line, mtr)) {
			if (btr_page_get_prev(buf_block_get_frame(block), mtr)
			    == left_page_no) {
				/* We've entered this function with the block
				already buffer-fixed, and
				buf_page_optimistic_get() buffer-fixes it
				again. The caller should unfix the block once
				(to undo their buffer-fixing). */
				ut_ad(2 <= block->page.buf_fix_count);
				*latch_mode = mode;
				return(true);
			} else {
				/* release the block, which also decrements
				the buf_fix_count once, undoing the increment
				of the successful buf_page_optimistic_get() */
				btr_leaf_page_release(block, mode, mtr);
			}
		}

		/* If we are still here, then buf_page_optimistic_get() did
		not buffer-fix the page, but it should still be buffer-fixed
		as it was before the call. */
		ut_ad(0 < block->page.buf_fix_count);
		/* release the left block */
		if (cursor->left_block != NULL) {
			btr_leaf_page_release(cursor->left_block,
					      mode, mtr);
		}

		return(false);

	default:
		ut_error;
		return(false);
	}
}

/**
Gets the intention in btr_intention_t from the latch_mode, and clears the
intention flags from the latch_mode.
@param latch_mode	in/out: pointer to latch_mode
@return intention for latching the tree */
static
btr_intention_t
btr_cur_get_and_clear_intention(
	ulint	*latch_mode)
{
	btr_intention_t	intention;

	switch (*latch_mode & (BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE)) {
	case BTR_LATCH_FOR_INSERT:
		intention = BTR_INTENTION_INSERT;
		break;
	case BTR_LATCH_FOR_DELETE:
		intention = BTR_INTENTION_DELETE;
		break;
	default:
		/* both or unknown */
		intention = BTR_INTENTION_BOTH;
	}
	*latch_mode &= ~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE);

	return(intention);
}
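
/* A minimal usage sketch (illustrative only, not taken from a caller in
this file): a delete-intending caller might pass
BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE, which this function decodes and
strips:

	ulint		latch_mode = BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE;
	btr_intention_t	intention
		= btr_cur_get_and_clear_intention(&latch_mode);
	ut_ad(intention == BTR_INTENTION_DELETE);
	ut_ad(latch_mode == BTR_MODIFY_TREE);
*/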

/**
Gets the desired latch type for the root leaf (the root page is also a leaf
when the tree has only one level) at the given latch mode.
@param latch_mode	in: BTR_SEARCH_LEAF, ...
@return latch type */
static
rw_lock_type_t
btr_cur_latch_for_root_leaf(
	ulint	latch_mode)
{
	switch (latch_mode) {
	case BTR_SEARCH_LEAF:
	case BTR_SEARCH_TREE:
	case BTR_SEARCH_PREV:
		return(RW_S_LATCH);
	case BTR_MODIFY_LEAF:
	case BTR_MODIFY_TREE:
	case BTR_MODIFY_PREV:
		return(RW_X_LATCH);
	case BTR_CONT_MODIFY_TREE:
	case BTR_CONT_SEARCH_TREE:
		/* A root page should already be latched, and does not
		need to be latched here.
		fall through (RW_NO_LATCH) */
	case BTR_NO_LATCHES:
		return(RW_NO_LATCH);
	}

	ut_error;
	return(RW_NO_LATCH); /* avoid compiler warnings */
}

/** Detects whether modifying the given record might require modifying
the tree structure.
@param[in]	index		index
@param[in]	page		page
@param[in]	lock_intention	lock intention for the tree operation
@param[in]	rec		record (current node_ptr)
@param[in]	rec_size	size of the record or max size of node_ptr
@param[in]	page_size	page size
@param[in]	mtr		mtr
@return true if tree modification is needed */
static
bool
btr_cur_will_modify_tree(
	dict_index_t*	index,
	const page_t*	page,
	btr_intention_t	lock_intention,
	const rec_t*	rec,
	ulint		rec_size,
	const page_size_t&	page_size,
	mtr_t*		mtr)
{
	ut_ad(!page_is_leaf(page));
	ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
					MTR_MEMO_X_LOCK
					| MTR_MEMO_SX_LOCK)
	      || dict_table_is_intrinsic(index->table));

	/* Pessimistic delete of the first record causes a delete & insert
	of a node_ptr at the upper level. A subsequent page shrink is also
	possible, causing another node_ptr delete at the upper level. So we
	should pay attention not only to the first and last records, but
	also to the 2nd record: if the "delete & insert" happen on
	different pages, the 2nd record becomes the first record, and a
	following compress might delete the record and cause an upper-level
	node_ptr modification. */

	if (lock_intention <= BTR_INTENTION_BOTH) {
		ulint	margin;

		if (lock_intention == BTR_INTENTION_BOTH) {
			ulint	level = btr_page_get_level(page, mtr);

			/* This value is the worst-case expectation of how
			many node_ptr records may be deleted from this page.
			It is used to predict whether the cursor position
			could become the leftmost record on this page. */
			ulint	max_nodes_deleted = 0;

			/* Tree-modifying operations coming from below this
			level can logically cause at most 2 ^ (level - 1)
			record deletions here, even in the most unlikely
			case. */
			if (level > 7) {
				/* TODO: adjust this practical limit. */
				max_nodes_deleted = 64;
			} else if (level > 0) {
				max_nodes_deleted = (ulint)1 << (level - 1);
			}
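
			/* For instance (illustrative only): a node_ptr
			page at level 3 gets
			max_nodes_deleted = 1 << 2 = 4, while any level
			above 7 is clamped to 64. */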

			/* Check what a delete may cause
			(BTR_INTENTION_BOTH or BTR_INTENTION_DELETE). */
			if (page_get_n_recs(page) <= max_nodes_deleted * 2
			    || page_rec_is_first(rec, page)) {
				/* The cursor record can become the leftmost
				record on this page. */
				return(true);
			}

			if (fil_page_get_prev(page) != FIL_NULL
			    && page_rec_distance_is_at_most(
					page_get_infimum_rec(page), rec,
					max_nodes_deleted)) {
				return(true);
			}

			if (fil_page_get_next(page) != FIL_NULL
			    && page_rec_distance_is_at_most(
					rec, page_get_supremum_rec(page),
					max_nodes_deleted)) {
				return(true);
			}

			/* A delete at the leftmost record of a page causes
			a delete & insert at its parent page. After that,
			the delete might cause btr_compress() and a record
			delete at its parent page. Thus we should consider
			the maximum number of deletes. */

			margin = rec_size * max_nodes_deleted;
		} else {
			ut_ad(lock_intention == BTR_INTENTION_DELETE);

			margin = rec_size;
		}
		/* Safe because we already have the SX latch of the index
		tree */
		if (page_get_data_size(page)
			< margin + BTR_CUR_PAGE_COMPRESS_LIMIT(index)
		    || (fil_page_get_next(page) == FIL_NULL
			&& fil_page_get_prev(page) == FIL_NULL)) {
			return(true);
		}
	}

	if (lock_intention >= BTR_INTENTION_BOTH) {
		/* Check what an insert may cause (BTR_INTENTION_BOTH
		or BTR_INTENTION_INSERT). */

		/* Once btr_cur_limit_optimistic_insert_debug is in use,
		we should check it here in advance, since the maximum
		allowed number of records in a page is limited. */
		LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page),
					      return(true));

		/* needs 2 records' space for the case where a single split
		and insert cannot fit.
		page_get_max_insert_size_after_reorganize() includes space
		for the page directory already */
		ulint	max_size
			= page_get_max_insert_size_after_reorganize(page, 2);

		if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + rec_size
		    || max_size < rec_size * 2) {
			return(true);
		}
		/* TODO: optimize this condition for a compressed page.
		It is based on the worst compression rate. Currently we
		look only at the uncompressed page, but we could also
		consult page_zip_available() on the compressed page, if it
		is already in the buffer pool. */
		/* needs 2 records' space also for the worst compression
		rate. */
		if (page_size.is_compressed()
		    && page_zip_empty_size(index->n_fields,
					   page_size.physical())
		       < rec_size * 2 + page_get_data_size(page)
			 + page_dir_calc_reserved_space(
				page_get_n_recs(page) + 2) + 1) {
			return(true);
		}
	}

	return(false);
}

/** Detects whether modifying the given record might require a modification
opposite to the intention.
@param[in]	page		page
@param[in]	lock_intention	lock intention for the tree operation
@param[in]	rec		record (current node_ptr)
@return true if tree modification is needed */
static
bool
btr_cur_need_opposite_intention(
	const page_t*	page,
	btr_intention_t	lock_intention,
	const rec_t*	rec)
{
	switch (lock_intention) {
	case BTR_INTENTION_DELETE:
		return((mach_read_from_4(page + FIL_PAGE_PREV) != FIL_NULL
			&& page_rec_is_first(rec, page))
		       || (mach_read_from_4(page + FIL_PAGE_NEXT) != FIL_NULL
			   && page_rec_is_last(rec, page)));
	case BTR_INTENTION_INSERT:
		return(mach_read_from_4(page + FIL_PAGE_NEXT) != FIL_NULL
		       && page_rec_is_last(rec, page));
	case BTR_INTENTION_BOTH:
		return(false);
	}

	ut_error;
	return(false);
}
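
/* An illustrative case: with BTR_INTENTION_DELETE positioned on the first
record of a page that has a left sibling, removing the record changes the
node pointer in the parent, which is performed as a delete & insert there;
hence the operation also needs insert latitude, the opposite of the stated
intention. */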

/********************************************************************//**
Searches an index tree and positions a tree cursor on a given level.
NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
to node pointer page number fields on the upper levels of the tree!
Note that if mode is PAGE_CUR_LE, which is used in inserts, then
cursor->up_match and cursor->low_match both will have sensible values.
If mode is PAGE_CUR_GE, then up_match will have a sensible value.

If mode is PAGE_CUR_LE, the cursor is left at the place where an insert of
the search tuple should be performed in the B-tree. InnoDB does an insert
immediately after the cursor. Thus, the cursor may end up on a user record,
or on a page infimum record. */
dberr_t
btr_cur_search_to_nth_level(
/*========================*/
	dict_index_t*	index,	/*!< in: index */
	ulint		level,	/*!< in: the tree level of search */
	const dtuple_t*	tuple,	/*!< in: data tuple; NOTE: n_fields_cmp in
				tuple must be set so that it cannot get
				compared to the node ptr page number field! */
	page_cur_mode_t	mode,	/*!< in: PAGE_CUR_L, ...;
				Inserts should always be made using
				PAGE_CUR_LE to search the position! */
	ulint		latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with
				at most one of BTR_INSERT, BTR_DELETE_MARK,
				BTR_DELETE, or BTR_ESTIMATE;
				cursor->left_block is used to store a pointer
				to the left neighbor page, in the cases
				BTR_SEARCH_PREV and BTR_MODIFY_PREV;
				NOTE that if has_search_latch
				is != 0, we may not have a latch set
				on the cursor page, we assume
				the caller uses its search latch
				to protect the record! */
	btr_cur_t*	cursor, /*!< in/out: tree cursor; the cursor page is
				s- or x-latched, but see also above! */
	ulint		has_search_latch,
				/*!< in: info on the latch mode the
				caller currently has on search system:
				RW_S_LATCH, or 0 */
	const char*	file,	/*!< in: file name */
	ulint		line,	/*!< in: line where called */
	mtr_t*		mtr)	/*!< in: mtr */
{
	page_t*		page = NULL; /* remove warning */
	buf_block_t*	block;
	ulint		height;
	ulint		up_match;
	ulint		up_bytes;
	ulint		low_match;
	ulint		low_bytes;
	ulint		savepoint;
	ulint		rw_latch;
	page_cur_mode_t	page_mode;
	page_cur_mode_t	search_mode = PAGE_CUR_UNSUPP;
	ulint		buf_mode;
	ulint		estimate;
	ulint		node_ptr_max_size = UNIV_PAGE_SIZE / 2;
	page_cur_t*	page_cursor;
	btr_op_t	btr_op;
	ulint		root_height = 0; /* remove warning */
	dberr_t		err = DB_SUCCESS;

	ulint		upper_rw_latch, root_leaf_rw_latch;
	btr_intention_t	lock_intention;
	bool		modify_external;
	buf_block_t*	tree_blocks[BTR_MAX_LEVELS];
	ulint		tree_savepoints[BTR_MAX_LEVELS];
	ulint		n_blocks = 0;
	ulint		n_releases = 0;
	bool		detected_same_key_root = false;

	bool		retrying_for_search_prev = false;
	ulint		leftmost_from_level = 0;
	buf_block_t**	prev_tree_blocks = NULL;
	ulint*		prev_tree_savepoints = NULL;
	ulint		prev_n_blocks = 0;
	ulint		prev_n_releases = 0;
	bool		need_path = true;
	bool		rtree_parent_modified = false;
	bool		mbr_adj = false;
	bool		found = false;

	DBUG_ENTER("btr_cur_search_to_nth_level");

	btr_search_t*	info;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	ulint		offsets2_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets2	= offsets2_;
	rec_offs_init(offsets_);
	rec_offs_init(offsets2_);
	/* Currently, PAGE_CUR_LE is the only search mode used for searches
	ending at the upper levels */

	ut_ad(level == 0 || mode == PAGE_CUR_LE
	      || RTREE_SEARCH_MODE(mode));
	ut_ad(dict_index_check_search_tuple(index, tuple));
	ut_ad(!dict_index_is_ibuf(index) || ibuf_inside(mtr));
	ut_ad(dtuple_check_typed(tuple));
	ut_ad(!(index->type & DICT_FTS));
	ut_ad(index->page != FIL_NULL);

	UNIV_MEM_INVALID(&cursor->up_match, sizeof cursor->up_match);
	UNIV_MEM_INVALID(&cursor->up_bytes, sizeof cursor->up_bytes);
	UNIV_MEM_INVALID(&cursor->low_match, sizeof cursor->low_match);
	UNIV_MEM_INVALID(&cursor->low_bytes, sizeof cursor->low_bytes);
#ifdef UNIV_DEBUG
	cursor->up_match = ULINT_UNDEFINED;
	cursor->low_match = ULINT_UNDEFINED;
#endif /* UNIV_DEBUG */

	ibool	s_latch_by_caller;

	s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;

	ut_ad(!s_latch_by_caller
	      || srv_read_only_mode
	      || mtr_memo_contains_flagged(mtr,
					   dict_index_get_lock(index),
					   MTR_MEMO_S_LOCK
					   | MTR_MEMO_SX_LOCK));

	/* These flags are mutually exclusive, they are lumped together
	with the latch mode for historical reasons. It's possible for
	none of the flags to be set. */
	switch (UNIV_EXPECT(latch_mode
			    & (BTR_INSERT | BTR_DELETE | BTR_DELETE_MARK),
			    0)) {
	case 0:
		btr_op = BTR_NO_OP;
		break;
	case BTR_INSERT:
		btr_op = (latch_mode & BTR_IGNORE_SEC_UNIQUE)
			? BTR_INSERT_IGNORE_UNIQUE_OP
			: BTR_INSERT_OP;
		break;
	case BTR_DELETE:
		btr_op = BTR_DELETE_OP;
		ut_a(cursor->purge_node);
		break;
	case BTR_DELETE_MARK:
		btr_op = BTR_DELMARK_OP;
		break;
	default:
		/* only one of BTR_INSERT, BTR_DELETE, BTR_DELETE_MARK
		should be specified at a time */
		ut_error;
	}

	/* Operations on the insert buffer tree cannot be buffered. */
	ut_ad(btr_op == BTR_NO_OP || !dict_index_is_ibuf(index));
	/* Operations on the clustered index cannot be buffered. */
	ut_ad(btr_op == BTR_NO_OP || !dict_index_is_clust(index));
	/* Operations on temporary table indexes cannot be buffered. */
	ut_ad(btr_op == BTR_NO_OP || !dict_table_is_temporary(index->table));
	/* Operations on a spatial index cannot be buffered. */
	ut_ad(btr_op == BTR_NO_OP || !dict_index_is_spatial(index));

	estimate = latch_mode & BTR_ESTIMATE;

	lock_intention = btr_cur_get_and_clear_intention(&latch_mode);

	modify_external = latch_mode & BTR_MODIFY_EXTERNAL;

	/* Turn the flags unrelated to the latch mode off. */
	latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);

	ut_ad(!modify_external || latch_mode == BTR_MODIFY_LEAF);

	ut_ad(!s_latch_by_caller
	      || latch_mode == BTR_SEARCH_LEAF
	      || latch_mode == BTR_SEARCH_TREE
	      || latch_mode == BTR_MODIFY_LEAF);

	cursor->flag = BTR_CUR_BINARY;
	cursor->index = index;

	info = btr_search_get_info(index);

# ifdef UNIV_SEARCH_PERF_STAT
	info->n_searches++;
# endif
	/* Use of the AHI is disabled for intrinsic tables, as these tables
	re-use the index id, and AHI validation is based on the index id. */
	if (rw_lock_get_writer(btr_get_search_latch(index))
	    == RW_LOCK_NOT_LOCKED
	    && latch_mode <= BTR_MODIFY_LEAF
	    && info->last_hash_succ
	    && !index->disable_ahi
	    && !estimate
# ifdef PAGE_CUR_LE_OR_EXTENDS
	    && mode != PAGE_CUR_LE_OR_EXTENDS
# endif /* PAGE_CUR_LE_OR_EXTENDS */
	    && !dict_index_is_spatial(index)
	    /* If !has_search_latch, we do a dirty read of
	    btr_search_enabled below, and btr_search_guess_on_hash()
	    will have to check it again. */
	    && UNIV_LIKELY(btr_search_enabled)
	    && !modify_external
	    && btr_search_guess_on_hash(index, info, tuple, mode,
					latch_mode, cursor,
					has_search_latch, mtr)) {

		/* Search using the hash index succeeded */

		ut_ad(cursor->up_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_GE);
		ut_ad(cursor->up_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_LE);
		ut_ad(cursor->low_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_LE);
		btr_cur_n_sea++;

		DBUG_RETURN(err);
	}
	btr_cur_n_non_sea++;

	/* If the hash search did not succeed, do binary search down the
	tree */

	if (has_search_latch) {
		/* Release possible search latch to obey latching order */
		rw_lock_s_unlock(btr_get_search_latch(index));
	}

	/* Store the position of the tree latch we push to mtr so that we
	know how to release it when we have latched leaf node(s) */

	savepoint = mtr_set_savepoint(mtr);

	switch (latch_mode) {
	case BTR_MODIFY_TREE:
		/* Most delete-intended operations are purging.
		Free blocks and read IO bandwidth should be prioritized
		for them, when the history list is growing huge. */
		if (lock_intention == BTR_INTENTION_DELETE
		    && trx_sys->rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
		    && buf_get_n_pending_read_ios()) {
			mtr_x_lock(dict_index_get_lock(index), mtr);
		} else if (dict_index_is_spatial(index)
			   && lock_intention <= BTR_INTENTION_BOTH) {
			/* X-latch the tree if there is a possibility of a
			pessimistic delete on a spatial index, as we could
			latch upward in the tree */

			mtr_x_lock(dict_index_get_lock(index), mtr);
		} else {
			mtr_sx_lock(dict_index_get_lock(index), mtr);
		}
		upper_rw_latch = RW_X_LATCH;
		break;
	case BTR_CONT_MODIFY_TREE:
	case BTR_CONT_SEARCH_TREE:
		/* Do nothing */
		ut_ad(srv_read_only_mode
		      || mtr_memo_contains_flagged(mtr,
						   dict_index_get_lock(index),
						   MTR_MEMO_X_LOCK
						   | MTR_MEMO_SX_LOCK));
		if (dict_index_is_spatial(index)
		    && latch_mode == BTR_CONT_MODIFY_TREE) {
			/* If we are about to locate the parent page for a
			split and/or merge operation on an R-Tree index,
			X-latch the parent */
			upper_rw_latch = RW_X_LATCH;
		} else {
			upper_rw_latch = RW_NO_LATCH;
		}
		break;
	default:
		if (!srv_read_only_mode) {
			if (s_latch_by_caller) {
				ut_ad(rw_lock_own(dict_index_get_lock(index),
						  RW_LOCK_S));
			} else if (!modify_external) {
				/* BTR_SEARCH_TREE is intended to be used with
				BTR_ALREADY_S_LATCHED */
				ut_ad(latch_mode != BTR_SEARCH_TREE);

				mtr_s_lock(dict_index_get_lock(index), mtr);
			} else {
				/* BTR_MODIFY_EXTERNAL needs to be excluded */
				mtr_sx_lock(dict_index_get_lock(index), mtr);
			}
			upper_rw_latch = RW_S_LATCH;
		} else {
			upper_rw_latch = RW_NO_LATCH;
		}
	}
	root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);

	page_cursor = btr_cur_get_page_cur(cursor);

	const ulint		space = dict_index_get_space(index);
	const page_size_t	page_size(dict_table_page_size(index->table));

	/* Start with the root page. */
	page_id_t		page_id(space, dict_index_get_page(index));

	if (root_leaf_rw_latch == RW_X_LATCH) {
		node_ptr_max_size = dict_index_node_ptr_max_size(index);
	}

	up_match = 0;
	up_bytes = 0;
	low_match = 0;
	low_bytes = 0;

	height = ULINT_UNDEFINED;

	/* We use these modified search modes on non-leaf levels of the
	B-tree. These let us end up in the right B-tree leaf. In that leaf
	we use the original search mode. */

	switch (mode) {
	case PAGE_CUR_GE:
		page_mode = PAGE_CUR_L;
		break;
	case PAGE_CUR_G:
		page_mode = PAGE_CUR_LE;
		break;
	default:
#ifdef PAGE_CUR_LE_OR_EXTENDS
		ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
		      || RTREE_SEARCH_MODE(mode)
		      || mode == PAGE_CUR_LE_OR_EXTENDS);
#else /* PAGE_CUR_LE_OR_EXTENDS */
		ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
		      || RTREE_SEARCH_MODE(mode));
#endif /* PAGE_CUR_LE_OR_EXTENDS */
		page_mode = mode;
		break;
	}
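
	/* For example (illustrative): a PAGE_CUR_GE search descends the
	non-leaf levels with PAGE_CUR_L, positioning on the last node
	pointer strictly less than the search tuple, so the descent cannot
	skip a leaf record that is >= the tuple; the original PAGE_CUR_GE
	is then applied on the leaf page. */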

	/* Loop and search until we arrive at the desired level */
	btr_latch_leaves_t	latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}};

search_loop:
	buf_mode = BUF_GET;
	rw_latch = RW_NO_LATCH;
	rtree_parent_modified = false;

	if (height != 0) {
		/* We are about to fetch the root or a non-leaf page. */
		if ((latch_mode != BTR_MODIFY_TREE
		     || height == level)
		    && !retrying_for_search_prev) {
			/* If we do not have an SX or X latch on the index,
			each page should be latched before reading. */
			if (modify_external
			    && height == ULINT_UNDEFINED
			    && upper_rw_latch == RW_S_LATCH) {
				/* needs an sx-latch of the root page
				for the fseg operation */
				rw_latch = RW_SX_LATCH;
			} else {
				rw_latch = upper_rw_latch;
			}
		}
	} else if (latch_mode <= BTR_MODIFY_LEAF) {
		rw_latch = latch_mode;

		if (btr_op != BTR_NO_OP
		    && ibuf_should_try(index, btr_op != BTR_INSERT_OP)) {

			/* Try to buffer the operation if the leaf
			page is not in the buffer pool. */

			buf_mode = btr_op == BTR_DELETE_OP
				? BUF_GET_IF_IN_POOL_OR_WATCH
				: BUF_GET_IF_IN_POOL;
		}
	}

retry_page_get:
	ut_ad(n_blocks < BTR_MAX_LEVELS);
	tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
	block = buf_page_get_gen(
		page_id, page_size, rw_latch,
		(height == ULINT_UNDEFINED ? info->root_guess : NULL),
		buf_mode, file, line, mtr, false, &err);

	tree_blocks[n_blocks] = block;

	if (err != DB_SUCCESS) {
		ut_ad(block == NULL);
		if (err == DB_DECRYPTION_FAILED) {
			ib::warn() << "Table is encrypted but encryption"
				" service or used key_id is not available."
				" Can't continue reading table.";

			page_cursor->block = 0;
			page_cursor->rec = 0;
			index->table->set_file_unreadable();
			if (estimate) {

				cursor->path_arr->nth_rec =
					ULINT_UNDEFINED;
			}
		}

		goto func_exit;
	}

	if (block == NULL) {
		SRV_CORRUPT_TABLE_CHECK(buf_mode == BUF_GET_IF_IN_POOL ||
					buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH,
		{
			page_cursor->block = 0;
			page_cursor->rec = 0;
			if (estimate) {

				cursor->path_arr->nth_rec =
					ULINT_UNDEFINED;
			}

			goto func_exit;
		});

		/* This must be a search to perform an insert, delete
		mark, or delete; try using the insert/delete buffer */

		ut_ad(height == 0);
		ut_ad(cursor->thr);

		switch (btr_op) {
		case BTR_INSERT_OP:
		case BTR_INSERT_IGNORE_UNIQUE_OP:
			ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
			ut_ad(!dict_index_is_spatial(index));

			if (ibuf_insert(IBUF_OP_INSERT, tuple, index,
					page_id, page_size, cursor->thr)) {

				cursor->flag = BTR_CUR_INSERT_TO_IBUF;

				goto func_exit;
			}
			break;

		case BTR_DELMARK_OP:
			ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
			ut_ad(!dict_index_is_spatial(index));

			if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple,
					index, page_id, page_size,
					cursor->thr)) {

				cursor->flag = BTR_CUR_DEL_MARK_IBUF;

				goto func_exit;
			}

			break;

		case BTR_DELETE_OP:
			ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH);
			ut_ad(!dict_index_is_spatial(index));

			if (!row_purge_poss_sec(cursor->purge_node,
						index, tuple)) {

				/* The record cannot be purged yet. */
				cursor->flag = BTR_CUR_DELETE_REF;
			} else if (ibuf_insert(IBUF_OP_DELETE, tuple,
					       index, page_id, page_size,
					       cursor->thr)) {

				/* The purge was buffered. */
				cursor->flag = BTR_CUR_DELETE_IBUF;
			} else {
				/* The purge could not be buffered. */
				buf_pool_watch_unset(page_id);
				break;
			}

			buf_pool_watch_unset(page_id);
			goto func_exit;

		default:
			ut_error;
		}

		/* Insert to the insert/delete buffer did not succeed, we
		must read the page from disk. */

		buf_mode = BUF_GET;

		goto retry_page_get;
	}

	if (retrying_for_search_prev && height != 0) {
		/* also latch the left sibling */
		ulint		left_page_no;
		buf_block_t*	get_block;

		ut_ad(rw_latch == RW_NO_LATCH);

		rw_latch = upper_rw_latch;

		rw_lock_s_lock(&block->lock);
		left_page_no = btr_page_get_prev(
			buf_block_get_frame(block), mtr);
		rw_lock_s_unlock(&block->lock);

		if (left_page_no != FIL_NULL) {
			ut_ad(prev_n_blocks < leftmost_from_level);

			prev_tree_savepoints[prev_n_blocks]
				= mtr_set_savepoint(mtr);
			get_block = buf_page_get_gen(
				page_id_t(page_id.space(), left_page_no),
				page_size, rw_latch, NULL, buf_mode,
				file, line, mtr, false, &err);
			prev_tree_blocks[prev_n_blocks] = get_block;
			prev_n_blocks++;

			if (err != DB_SUCCESS) {
				if (err == DB_DECRYPTION_FAILED) {
					ib::warn() << "Table is encrypted but"
						" encryption service or used"
						" key_id is not available."
						" Can't continue reading table.";
					if (estimate) {

						page_cursor->block = 0;
						page_cursor->rec = 0;
						cursor->path_arr->nth_rec =
							ULINT_UNDEFINED;
					}
					index->table->set_file_unreadable();
				}
				goto func_exit;
			}

			/* BTR_MODIFY_TREE does not update prev/next_page_no
			without a lock on the parent page. So there is no
			need to retry here, because we hold the parent
			page's lock. */
		}

		/* release the RW_NO_LATCH page and latch it again with
		rw_latch */
		mtr_release_block_at_savepoint(
			mtr, tree_savepoints[n_blocks],
			tree_blocks[n_blocks]);

		tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
		block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
					 buf_mode, file, line, mtr, false, &err);
		tree_blocks[n_blocks] = block;

		if (err != DB_SUCCESS) {
			if (err == DB_DECRYPTION_FAILED) {
				ib::warn() << "Table is encrypted but"
					" encryption service or used key_id"
					" is not available."
					" Can't continue reading table.";
				if (estimate) {
					page_cursor->block = 0;
					page_cursor->rec = 0;

					cursor->path_arr->nth_rec =
						ULINT_UNDEFINED;
				}
				index->table->set_file_unreadable();
			}

			goto func_exit;
		}
	}

	page = buf_block_get_frame(block);

	SRV_CORRUPT_TABLE_CHECK(page,
	{
		page_cursor->block = 0;
		page_cursor->rec = 0;

		if (estimate) {

			cursor->path_arr->nth_rec = ULINT_UNDEFINED;
		}

		goto func_exit;
	});

	if (height == ULINT_UNDEFINED
	    && page_is_leaf(page)
	    && rw_latch != RW_NO_LATCH
	    && rw_latch != root_leaf_rw_latch) {
		/* We should retry to get the page, because the root page
		is latched with a different latch mode than a leaf page
		requires. */
		ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
		ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_SX_LATCH);
		ut_ad(rw_latch == RW_S_LATCH || modify_external);

		ut_ad(n_blocks == 0);
		mtr_release_block_at_savepoint(
			mtr, tree_savepoints[n_blocks],
			tree_blocks[n_blocks]);

		upper_rw_latch = root_leaf_rw_latch;
		goto search_loop;
	}

	if (rw_latch != RW_NO_LATCH) {
#ifdef UNIV_ZIP_DEBUG
		const page_zip_des_t*	page_zip
			= buf_block_get_page_zip(block);
		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */

		buf_block_dbg_add_level(
			block, dict_index_is_ibuf(index)
			? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE);
	}

	ut_ad(fil_page_index_page_check(page));
	ut_ad(index->id == btr_page_get_index_id(page));

	if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) {
		/* We are in the root node */

		height = btr_page_get_level(page, mtr);
		root_height = height;
		cursor->tree_height = root_height + 1;

		if (dict_index_is_spatial(index)) {
			ut_ad(cursor->rtr_info);

			node_seq_t	seq_no = rtr_get_current_ssn_id(index);

			/* If the SSN in memory is not initialized, fetch
			it from the root page */
			if (seq_no < 1) {
				node_seq_t	root_seq_no;

				root_seq_no = page_get_ssn_id(page);

				mutex_enter(&(index->rtr_ssn.mutex));
				index->rtr_ssn.seq_no = root_seq_no + 1;
				mutex_exit(&(index->rtr_ssn.mutex));
			}

			/* Save the MBR */
			cursor->rtr_info->thr = cursor->thr;
			rtr_get_mbr_from_tuple(tuple, &cursor->rtr_info->mbr);
		}

		info->root_guess = block;
	}

	if (height == 0) {
		if (rw_latch == RW_NO_LATCH) {

			latch_leaves = btr_cur_latch_leaves(
				block, page_id, page_size, latch_mode,
				cursor, mtr);
		}

		switch (latch_mode) {
		case BTR_MODIFY_TREE:
		case BTR_CONT_MODIFY_TREE:
		case BTR_CONT_SEARCH_TREE:
			break;
		default:
			if (!s_latch_by_caller
			    && !srv_read_only_mode
			    && !modify_external) {
				/* Release the tree s-latch */
				/* NOTE: BTR_MODIFY_EXTERNAL
				needs to keep the tree sx-latch */
				mtr_release_s_latch_at_savepoint(
					mtr, savepoint,
					dict_index_get_lock(index));
			}

			/* release upper blocks */
			if (retrying_for_search_prev) {
				for (;
				     prev_n_releases < prev_n_blocks;
				     prev_n_releases++) {
					mtr_release_block_at_savepoint(
						mtr,
						prev_tree_savepoints[
							prev_n_releases],
						prev_tree_blocks[
							prev_n_releases]);
				}
			}

			for (; n_releases < n_blocks; n_releases++) {
				if (n_releases == 0 && modify_external) {
					/* keep the latch of the root page */
					ut_ad(mtr_memo_contains_flagged(
						mtr, tree_blocks[n_releases],
						MTR_MEMO_PAGE_SX_FIX
						| MTR_MEMO_PAGE_X_FIX));
					continue;
				}

				mtr_release_block_at_savepoint(
					mtr, tree_savepoints[n_releases],
					tree_blocks[n_releases]);
			}
		}

		page_mode = mode;
	}

	if (dict_index_is_spatial(index)) {
		/* Remember the page search mode */
		search_mode = page_mode;

		/* Some adjustment on the search mode, when the page search
		mode is PAGE_CUR_RTREE_LOCATE or PAGE_CUR_RTREE_INSERT, as
		we are searching with MBRs. When it is not the target level,
		we should search all sub-trees that "CONTAIN" the search
		range/MBR. When it is at the target level, the search
		becomes PAGE_CUR_LE */
		if (page_mode == PAGE_CUR_RTREE_LOCATE
		    && level == height) {
			if (level == 0) {
				page_mode = PAGE_CUR_LE;
			} else {
				page_mode = PAGE_CUR_RTREE_GET_FATHER;
			}
		}

		if (page_mode == PAGE_CUR_RTREE_INSERT) {
			page_mode = (level == height)
					? PAGE_CUR_LE
					: PAGE_CUR_RTREE_INSERT;

			ut_ad(!page_is_leaf(page) || page_mode == PAGE_CUR_LE);
		}

		/* "need_path" indicates whether we need to track the
		parent pages; if the comparison is not spatial, there is
		no need to track them */
		if (page_mode < PAGE_CUR_CONTAIN) {
			need_path = false;
		}

		up_match = 0;
		low_match = 0;

		if (latch_mode == BTR_MODIFY_TREE
		    || latch_mode == BTR_CONT_MODIFY_TREE
		    || latch_mode == BTR_CONT_SEARCH_TREE) {
			/* The tree is latched; no need for a page latch to
			protect the "path" */
			cursor->rtr_info->need_page_lock = false;
		}
	}

	if (dict_index_is_spatial(index) && page_mode >= PAGE_CUR_CONTAIN) {
		ut_ad(need_path);
		found = rtr_cur_search_with_match(
			block, index, tuple, page_mode, page_cursor,
			cursor->rtr_info);

		/* Need to use BTR_MODIFY_TREE to do the MBR adjustment */
		if (search_mode == PAGE_CUR_RTREE_INSERT
		    && cursor->rtr_info->mbr_adj) {
			if (latch_mode & BTR_MODIFY_LEAF) {
				/* The parent MBR needs to be updated;
				should retry with BTR_MODIFY_TREE */
				goto func_exit;
			} else if (latch_mode & BTR_MODIFY_TREE) {
				rtree_parent_modified = true;
				cursor->rtr_info->mbr_adj = false;
				mbr_adj = true;
			} else {
				ut_ad(0);
			}
		}

		if (found && page_mode == PAGE_CUR_RTREE_GET_FATHER) {
			cursor->low_match =
				DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1;
		}
	} else if (height == 0 && btr_search_enabled
		   && !dict_index_is_spatial(index)) {
		/* The adaptive hash index is only used when searching
		for leaf pages (height==0), but not in r-trees.
		We only need the byte prefix comparison for the purpose
		of updating the adaptive hash index. */
		page_cur_search_with_match_bytes(
			block, index, tuple, page_mode, &up_match, &up_bytes,
			&low_match, &low_bytes, page_cursor);
	} else {
		/* Search for complete index fields. */
		up_bytes = low_bytes = 0;
		page_cur_search_with_match(
			block, index, tuple, page_mode, &up_match,
			&low_match, page_cursor,
			need_path ? cursor->rtr_info : NULL);
	}

	if (estimate) {
		btr_cur_add_path_info(cursor, height, root_height);
	}

	/* If this is the desired level, leave the loop */

	ut_ad(height == btr_page_get_level(page_cur_get_page(page_cursor),
					   mtr));

	/* Add a predicate lock if the isolation level is serializable,
	and only in the search case */
	if (dict_index_is_spatial(index)
	    && cursor->rtr_info->need_prdt_lock
	    && mode != PAGE_CUR_RTREE_INSERT
	    && mode != PAGE_CUR_RTREE_LOCATE
	    && mode >= PAGE_CUR_CONTAIN) {
		trx_t*		trx = thr_get_trx(cursor->thr);
		lock_prdt_t	prdt;

		lock_mutex_enter();
		lock_init_prdt_from_mbr(
			&prdt, &cursor->rtr_info->mbr, mode,
			trx->lock.lock_heap);
		lock_mutex_exit();

		if (rw_latch == RW_NO_LATCH && height != 0) {
			rw_lock_s_lock(&(block->lock));
		}

		lock_prdt_lock(block, &prdt, index, LOCK_S,
			       LOCK_PREDICATE, cursor->thr, mtr);

		if (rw_latch == RW_NO_LATCH && height != 0) {
			rw_lock_s_unlock(&(block->lock));
		}
	}

	if (level != height) {

		const rec_t*	node_ptr;
		ut_ad(height > 0);

		height--;

		node_ptr = page_cur_get_rec(page_cursor);

		offsets = rec_get_offsets(
			node_ptr, index, offsets, ULINT_UNDEFINED, &heap);

		/* If the rec is the first or last record on the page and
		the intention is pessimistic delete, it might cause a
		node_ptr insert at the upper level. We should change the
		intention and retry. */
		if (latch_mode == BTR_MODIFY_TREE
		    && btr_cur_need_opposite_intention(
			page, lock_intention, node_ptr)) {

need_opposite_intention:
			ut_ad(upper_rw_latch == RW_X_LATCH);

			if (n_releases > 0) {
				/* release the root block */
				mtr_release_block_at_savepoint(
					mtr, tree_savepoints[0],
					tree_blocks[0]);
			}

			/* release all blocks */
			for (; n_releases <= n_blocks; n_releases++) {
				mtr_release_block_at_savepoint(
					mtr, tree_savepoints[n_releases],
					tree_blocks[n_releases]);
			}

			lock_intention = BTR_INTENTION_BOTH;

			page_id.reset(space, dict_index_get_page(index));
			up_match = 0;
			low_match = 0;
			height = ULINT_UNDEFINED;

			n_blocks = 0;
			n_releases = 0;

			goto search_loop;
		}

		if (dict_index_is_spatial(index)) {
			if (page_rec_is_supremum(node_ptr)) {
				cursor->low_match = 0;
				cursor->up_match = 0;
				goto func_exit;
			}

			/* If we are doing insertion or record locating,
			remember the tree nodes we visited */
			if (page_mode == PAGE_CUR_RTREE_INSERT
			    || (search_mode == PAGE_CUR_RTREE_LOCATE
				&& (latch_mode != BTR_MODIFY_LEAF))) {
				bool	add_latch = false;

				if (latch_mode == BTR_MODIFY_TREE
				    && rw_latch == RW_NO_LATCH) {
					ut_ad(mtr_memo_contains_flagged(
						mtr, dict_index_get_lock(index),
						MTR_MEMO_X_LOCK
						| MTR_MEMO_SX_LOCK));
					rw_lock_s_lock(&block->lock);
					add_latch = true;
				}

				/* Store the parent cursor location */
#ifdef UNIV_DEBUG
				ulint	num_stored = rtr_store_parent_path(
					block, cursor, latch_mode,
					height + 1, mtr);
#else
				rtr_store_parent_path(
					block, cursor, latch_mode,
					height + 1, mtr);
#endif

				if (page_mode == PAGE_CUR_RTREE_INSERT) {
					btr_pcur_t*	r_cursor =
						rtr_get_parent_cursor(
							cursor, height + 1,
							true);
					/* If it is an insertion, there
					should be only one parent for
					each level traversed */
#ifdef UNIV_DEBUG
					ut_ad(num_stored == 1);
#endif

					node_ptr = btr_pcur_get_rec(r_cursor);

				}

				if (add_latch) {
					rw_lock_s_unlock(&block->lock);
				}

				ut_ad(!page_rec_is_supremum(node_ptr));
			}

			ut_ad(page_mode == search_mode
			      || (page_mode == PAGE_CUR_WITHIN
				  && search_mode == PAGE_CUR_RTREE_LOCATE));

			page_mode = search_mode;
		}

		/* If the cursor is at the first or the last record of the
		page, or at a record with the same key value as the first
		or last record, another page might be chosen under
		BTR_CONT_MODIFY_TREE. So the parent page should not be
		released, to avoid a deadlock from blocking another search
		with the same key value. */
		if (!detected_same_key_root
		    && lock_intention == BTR_INTENTION_BOTH
		    && !dict_index_is_unique(index)
		    && latch_mode == BTR_MODIFY_TREE
		    && (up_match >= rec_offs_n_fields(offsets) - 1
			|| low_match >= rec_offs_n_fields(offsets) - 1)) {
			const rec_t*	first_rec
				= page_rec_get_next_const(
					page_get_infimum_rec(
						page));
			ulint		matched_fields;

			ut_ad(upper_rw_latch == RW_X_LATCH);

			if (node_ptr == first_rec
			    || page_rec_is_last(node_ptr, page)) {
				detected_same_key_root = true;
			} else {
				matched_fields = 0;

				offsets2 = rec_get_offsets(
					first_rec, index, offsets2,
					ULINT_UNDEFINED, &heap);
				cmp_rec_rec_with_match(node_ptr, first_rec,
					offsets, offsets2, index,
					page_is_spatial_non_leaf(first_rec, index),
					false, &matched_fields);

				if (matched_fields
				    >= rec_offs_n_fields(offsets) - 1) {
					detected_same_key_root = true;
				} else {
					const rec_t*	last_rec;

					last_rec = page_rec_get_prev_const(
						page_get_supremum_rec(
							page));

					matched_fields = 0;

					offsets2 = rec_get_offsets(
						last_rec, index, offsets2,
						ULINT_UNDEFINED, &heap);
					cmp_rec_rec_with_match(
						node_ptr, last_rec,
						offsets, offsets2, index,
						page_is_spatial_non_leaf(last_rec, index),
						false, &matched_fields);
					if (matched_fields
					    >= rec_offs_n_fields(offsets) - 1) {
						detected_same_key_root = true;
					}
				}
			}
		}
1763
1764 /* If the page might cause modify_tree,
1765 we should not release the parent page's lock. */
1766 if (!detected_same_key_root
1767 && latch_mode == BTR_MODIFY_TREE
1768 && !btr_cur_will_modify_tree(
1769 index, page, lock_intention, node_ptr,
1770 node_ptr_max_size, page_size, mtr)
1771 && !rtree_parent_modified) {
1772 ut_ad(upper_rw_latch == RW_X_LATCH);
1773 ut_ad(n_releases <= n_blocks);
1774
1775 /* we can release upper blocks */
1776 for (; n_releases < n_blocks; n_releases++) {
1777 if (n_releases == 0) {
1778 /* we must not release the root page,
1779 so that we stay pinned to the same block. */
1780 continue;
1781 }
1782
1783 /* release unused blocks to unpin */
1784 mtr_release_block_at_savepoint(
1785 mtr, tree_savepoints[n_releases],
1786 tree_blocks[n_releases]);
1787 }
1788 }
1789
1790 if (height == level
1791 && latch_mode == BTR_MODIFY_TREE) {
1792 ut_ad(upper_rw_latch == RW_X_LATCH);
1793 /* we should sx-latch the root page, if it was released
1794 already. It contains the seg_header. */
1795 if (n_releases > 0) {
1796 mtr_block_sx_latch_at_savepoint(
1797 mtr, tree_savepoints[0],
1798 tree_blocks[0]);
1799 }
1800
1801 /* x-latch the branch blocks not released yet. */
1802 for (ulint i = n_releases; i <= n_blocks; i++) {
1803 mtr_block_x_latch_at_savepoint(
1804 mtr, tree_savepoints[i],
1805 tree_blocks[i]);
1806 }
1807 }
1808
1809 /* We should consider the prev_page of the parent page, if
1810 node_ptr is the leftmost record of the page, because BTR_SEARCH_PREV
1811 and BTR_MODIFY_PREV latch the prev_page of the leaf page. */
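/* The retry below (retrying_for_search_prev) re-descends from
leftmost_from_level so that prev_page can also be latched on every
level where node_ptr was the leftmost record; prev_tree_blocks and
prev_tree_savepoints are allocated for that retry and freed at
func_exit. */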
1812 if ((latch_mode == BTR_SEARCH_PREV
1813 || latch_mode == BTR_MODIFY_PREV)
1814 && !retrying_for_search_prev) {
1815 /* the block should be latched, for a consistent
1816 btr_page_get_prev() */
1817 ut_ad(mtr_memo_contains_flagged(mtr, block,
1818 MTR_MEMO_PAGE_S_FIX
1819 | MTR_MEMO_PAGE_X_FIX));
1820
1821 if (btr_page_get_prev(page, mtr) != FIL_NULL
1822 && page_rec_is_first(node_ptr, page)) {
1823
1824 if (leftmost_from_level == 0) {
1825 leftmost_from_level = height + 1;
1826 }
1827 } else {
1828 leftmost_from_level = 0;
1829 }
1830
1831 if (height == 0 && leftmost_from_level > 0) {
1832 /* retry, so that prev_page is also latched,
1833 from level == leftmost_from_level downward. */
1834 retrying_for_search_prev = true;
1835
1836 prev_tree_blocks = static_cast<buf_block_t**>(
1837 ut_malloc_nokey(sizeof(buf_block_t*)
1838 * leftmost_from_level));
1839
1840 prev_tree_savepoints = static_cast<ulint*>(
1841 ut_malloc_nokey(sizeof(ulint)
1842 * leftmost_from_level));
1843
1844 /* back to the level (leftmost_from_level+1) */
1845 ulint idx = n_blocks
1846 - (leftmost_from_level - 1);
1847
1848 page_id.reset(
1849 space,
1850 tree_blocks[idx]->page.id.page_no());
1851
1852 for (ulint i = n_blocks
1853 - (leftmost_from_level - 1);
1854 i <= n_blocks; i++) {
1855 mtr_release_block_at_savepoint(
1856 mtr, tree_savepoints[i],
1857 tree_blocks[i]);
1858 }
1859
1860 n_blocks -= (leftmost_from_level - 1);
1861 height = leftmost_from_level;
1862 ut_ad(n_releases == 0);
1863
1864 /* recompute up_match, low_match */
1865 up_match = 0;
1866 low_match = 0;
1867 rtr_info_t* rtr_info = need_path
1868 ? cursor->rtr_info : NULL;
1869
1870 for (ulint i = 0; i < n_blocks; i++) {
1871 page_cur_search_with_match(
1872 tree_blocks[i], index, tuple,
1873 page_mode, &up_match,
1874 &low_match, page_cursor,
1875 rtr_info);
1876 }
1877
1878 goto search_loop;
1879 }
1880 }
1881
1882 /* Go to the child node */
1883 page_id.reset(
1884 space,
1885 btr_node_ptr_get_child_page_no(node_ptr, offsets));
1886
1887 n_blocks++;
1888
1889 if (UNIV_UNLIKELY(height == 0 && dict_index_is_ibuf(index))) {
1890 /* We're doing a search on an ibuf tree and we're one
1891 level above the leaf page. */
1892
1893 ut_ad(level == 0);
1894
1895 buf_mode = BUF_GET;
1896 rw_latch = RW_NO_LATCH;
1897 goto retry_page_get;
1898 }
1899
1900 if (dict_index_is_spatial(index)
1901 && page_mode >= PAGE_CUR_CONTAIN
1902 && page_mode != PAGE_CUR_RTREE_INSERT) {
1903 ut_ad(need_path);
1904 rtr_node_path_t* path =
1905 cursor->rtr_info->path;
1906
1907 if (!path->empty() && found) {
1908 #ifdef UNIV_DEBUG
1909 node_visit_t last_visit = path->back();
1910
1911 ut_ad(last_visit.page_no == page_id.page_no());
1912 #endif /* UNIV_DEBUG */
1913
1914 path->pop_back();
1915
1916 #ifdef UNIV_DEBUG
1917 if (page_mode == PAGE_CUR_RTREE_LOCATE
1918 && (latch_mode != BTR_MODIFY_LEAF)) {
1919 btr_pcur_t* cur
1920 = cursor->rtr_info->parent_path->back(
1921 ).cursor;
1922 rec_t* my_node_ptr
1923 = btr_pcur_get_rec(cur);
1924
1925 offsets = rec_get_offsets(
1926 my_node_ptr, index, offsets,
1927 ULINT_UNDEFINED, &heap);
1928
1929 ulint my_page_no
1930 = btr_node_ptr_get_child_page_no(
1931 my_node_ptr, offsets);
1932
1933 ut_ad(page_id.page_no() == my_page_no);
1934
1935 }
1936 #endif
1937 }
1938 }
1939
1940 goto search_loop;
1941 } else if (!dict_index_is_spatial(index)
1942 && latch_mode == BTR_MODIFY_TREE
1943 && lock_intention == BTR_INTENTION_INSERT
1944 && mach_read_from_4(page + FIL_PAGE_NEXT) != FIL_NULL
1945 && page_rec_is_last(page_cur_get_rec(page_cursor), page)) {
1946
1947 /* btr_insert_into_right_sibling() might cause
1948 the node_ptr at the upper level to be deleted */
1949
1950 if (height == 0) {
1951 /* release the leaf pages if latched */
1952 for (uint i = 0; i < 3; i++) {
1953 if (latch_leaves.blocks[i] != NULL) {
1954 mtr_release_block_at_savepoint(
1955 mtr, latch_leaves.savepoints[i],
1956 latch_leaves.blocks[i]);
1957 latch_leaves.blocks[i] = NULL;
1958 }
1959 }
1960 }
1961
1962 goto need_opposite_intention;
1963 }
1964
1965 if (level != 0) {
1966 if (upper_rw_latch == RW_NO_LATCH) {
1967 /* latch the page */
1968 buf_block_t* child_block;
1969
1970 if (latch_mode == BTR_CONT_MODIFY_TREE) {
1971 child_block = btr_block_get(
1972 page_id, page_size, RW_X_LATCH,
1973 index, mtr);
1974 } else {
1975 ut_ad(latch_mode == BTR_CONT_SEARCH_TREE);
1976 child_block = btr_block_get(
1977 page_id, page_size, RW_SX_LATCH,
1978 index, mtr);
1979 }
1980
1981 btr_assert_not_corrupted(child_block, index);
1982 } else {
1983 ut_ad(mtr_memo_contains(mtr, block, upper_rw_latch));
1984 btr_assert_not_corrupted(block, index);
1985
1986 if (s_latch_by_caller) {
1987 ut_ad(latch_mode == BTR_SEARCH_TREE);
1988 /* operations that modify the tree
1989 must sx-latch the index, so they are excluded. */
1990 ut_ad(mtr_memo_contains(
1991 mtr, dict_index_get_lock(index),
1992 MTR_MEMO_SX_LOCK));
1993 /* because we hold the sx-latch of the index,
1994 we can release the upper blocks. */
1995 for (; n_releases < n_blocks; n_releases++) {
1996 mtr_release_block_at_savepoint(
1997 mtr,
1998 tree_savepoints[n_releases],
1999 tree_blocks[n_releases]);
2000 }
2001 }
2002 }
2003
2004 if (page_mode <= PAGE_CUR_LE) {
2005 cursor->low_match = low_match;
2006 cursor->up_match = up_match;
2007 }
2008 } else {
2009 cursor->low_match = low_match;
2010 cursor->low_bytes = low_bytes;
2011 cursor->up_match = up_match;
2012 cursor->up_bytes = up_bytes;
2013
2014 /* We do a dirty read of btr_search_enabled here. We
2015 will properly check btr_search_enabled again in
2016 btr_search_build_page_hash_index() before building a
2017 page hash index, while holding the search latch. */
2018 if (btr_search_enabled && !index->disable_ahi) {
2019 btr_search_info_update(index, cursor);
2020 }
2021 ut_ad(cursor->up_match != ULINT_UNDEFINED
2022 || mode != PAGE_CUR_GE);
2023 ut_ad(cursor->up_match != ULINT_UNDEFINED
2024 || mode != PAGE_CUR_LE);
2025 ut_ad(cursor->low_match != ULINT_UNDEFINED
2026 || mode != PAGE_CUR_LE);
2027 }
2028
2029 /* For spatial index, remember what blocks are still latched */
2030 if (dict_index_is_spatial(index)
2031 && (latch_mode == BTR_MODIFY_TREE
2032 || latch_mode == BTR_MODIFY_LEAF)) {
2033 for (ulint i = 0; i < n_releases; i++) {
2034 cursor->rtr_info->tree_blocks[i] = NULL;
2035 cursor->rtr_info->tree_savepoints[i] = 0;
2036 }
2037
2038 for (ulint i = n_releases; i <= n_blocks; i++) {
2039 cursor->rtr_info->tree_blocks[i] = tree_blocks[i];
2040 cursor->rtr_info->tree_savepoints[i] = tree_savepoints[i];
2041 }
2042 }
2043
2044 func_exit:
2045
2046 if (UNIV_LIKELY_NULL(heap)) {
2047 mem_heap_free(heap);
2048 }
2049
2050 if (retrying_for_search_prev) {
2051 ut_free(prev_tree_blocks);
2052 ut_free(prev_tree_savepoints);
2053 }
2054
2055 if (has_search_latch) {
2056
2057 rw_lock_s_lock(btr_get_search_latch(index));
2058 }
2059
2060 if (mbr_adj) {
2061 /* remember that we will need to adjust parent MBR */
2062 cursor->rtr_info->mbr_adj = true;
2063 }
2064
2065 DBUG_RETURN(err);
2066 }
2067
2068 /** Searches an index tree and positions a tree cursor on a given level.
2069 This function avoids latching the traversal path and so should be
2070 used only in cases where latching is not needed.
2071
2072 @param[in,out] index index
2073 @param[in] level the tree level of search
2074 @param[in] tuple data tuple; NOTE: n_fields_cmp in tuple must be set
2075 so that it cannot get compared to the node ptr page number field
2076 @param[in] mode PAGE_CUR_L, ....
2077 Insert should always be made using PAGE_CUR_LE
2078 to search the position.
2079 @param[in,out] cursor tree cursor; points to record of interest.
2080 @param[in] file file name
2081 @param[in] line line where called from
2082 @param[in,out] mtr mtr
2083 @param[in] mark_dirty
2084 if true then mark the block as dirty */
2085 void
2086 btr_cur_search_to_nth_level_with_no_latch(
2087 dict_index_t* index,
2088 ulint level,
2089 const dtuple_t* tuple,
2090 page_cur_mode_t mode,
2091 btr_cur_t* cursor,
2092 const char* file,
2093 ulint line,
2094 mtr_t* mtr,
2095 bool mark_dirty)
2096 {
2097 page_t* page = NULL; /* remove warning */
2098 buf_block_t* block;
2099 ulint height;
2100 ulint up_match;
2101 ulint low_match;
2102 ulint rw_latch;
2103 page_cur_mode_t page_mode;
2104 ulint buf_mode;
2105 page_cur_t* page_cursor;
2106 ulint root_height = 0; /* remove warning */
2107 ulint n_blocks = 0;
2108
2109 mem_heap_t* heap = NULL;
2110 ulint offsets_[REC_OFFS_NORMAL_SIZE];
2111 ulint* offsets = offsets_;
2112 rec_offs_init(offsets_);
2113
2114 DBUG_ENTER("btr_cur_search_to_nth_level_with_no_latch");
2115
2116 ut_ad(dict_table_is_intrinsic(index->table));
2117 ut_ad(level == 0 || mode == PAGE_CUR_LE);
2118 ut_ad(dict_index_check_search_tuple(index, tuple));
2119 ut_ad(dtuple_check_typed(tuple));
2120 ut_ad(index->page != FIL_NULL);
2121
2122 UNIV_MEM_INVALID(&cursor->up_match, sizeof cursor->up_match);
2123 UNIV_MEM_INVALID(&cursor->low_match, sizeof cursor->low_match);
2124 #ifdef UNIV_DEBUG
2125 cursor->up_match = ULINT_UNDEFINED;
2126 cursor->low_match = ULINT_UNDEFINED;
2127 #endif /* UNIV_DEBUG */
2128
2129 cursor->flag = BTR_CUR_BINARY;
2130 cursor->index = index;
2131
2132 page_cursor = btr_cur_get_page_cur(cursor);
2133
2134 const ulint space = dict_index_get_space(index);
2135 const page_size_t page_size(dict_table_page_size(index->table));
2136 /* Start with the root page. */
2137 page_id_t page_id(space, dict_index_get_page(index));
2138
2139 up_match = 0;
2140 low_match = 0;
2141
2142 height = ULINT_UNDEFINED;
2143
2144 /* We use these modified search modes on non-leaf levels of the
2145 B-tree. These let us end up in the right B-tree leaf. In that leaf
2146 we use the original search mode. */
2147
2148 switch (mode) {
2149 case PAGE_CUR_GE:
2150 page_mode = PAGE_CUR_L;
2151 break;
2152 case PAGE_CUR_G:
2153 page_mode = PAGE_CUR_LE;
2154 break;
2155 default:
2156 page_mode = mode;
2157 break;
2158 }
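/* A sketch of the reasoning behind the mapping above, with made-up
node pointer keys 10 and 20 on a root page: a PAGE_CUR_GE search for
20 uses PAGE_CUR_L here, positioning on the pointer 10 (the greatest
key < 20) and descending into its child, because records comparing
equal to 20 may already begin in that child (node pointers are
compared on a key prefix; see the NOTE on n_fields_cmp above). On the
leaf level we switch back to PAGE_CUR_GE to find the first qualifying
record. */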
2159
2160 /* Loop and search until we arrive at the desired level */
2161 bool at_desired_level = false;
2162 while (!at_desired_level) {
2163 buf_mode = BUF_GET;
2164 rw_latch = RW_NO_LATCH;
2165
2166 ut_ad(n_blocks < BTR_MAX_LEVELS);
2167
2168 block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
2169 buf_mode, file, line, mtr, mark_dirty);
2170
2171 page = buf_block_get_frame(block);
2172
2173 if (height == ULINT_UNDEFINED) {
2174 /* We are in the root node */
2175
2176 height = btr_page_get_level(page, mtr);
2177 root_height = height;
2178 cursor->tree_height = root_height + 1;
2179 }
2180
2181 if (height == 0) {
2182 /* On leaf level. Switch back to the original search mode. */
2183 page_mode = mode;
2184 }
2185
2186 page_cur_search_with_match(
2187 block, index, tuple, page_mode, &up_match,
2188 &low_match, page_cursor, NULL);
2189
2190 ut_ad(height == btr_page_get_level(
2191 page_cur_get_page(page_cursor), mtr));
2192
2193 if (level != height) {
2194
2195 const rec_t* node_ptr;
2196 ut_ad(height > 0);
2197
2198 height--;
2199
2200 node_ptr = page_cur_get_rec(page_cursor);
2201
2202 offsets = rec_get_offsets(
2203 node_ptr, index, offsets,
2204 ULINT_UNDEFINED, &heap);
2205
2206 /* Go to the child node */
2207 page_id.reset(space, btr_node_ptr_get_child_page_no(
2208 node_ptr, offsets));
2209
2210 n_blocks++;
2211 } else {
2212 /* If this is the desired level, leave the loop */
2213 at_desired_level = true;
2214 }
2215 }
2216
2217 cursor->low_match = low_match;
2218 cursor->up_match = up_match;
2219
2220 if (heap != NULL) {
2221 mem_heap_free(heap);
2222 }
2223
2224 DBUG_VOID_RETURN;
2225 }
2226
2227 /*****************************************************************//**
2228 Opens a cursor at either end of an index. */
2229 dberr_t
2230 btr_cur_open_at_index_side_func(
2231 /*============================*/
2232 bool from_left, /*!< in: true if open to the low end,
2233 false if to the high end */
2234 dict_index_t* index, /*!< in: index */
2235 ulint latch_mode, /*!< in: latch mode */
2236 btr_cur_t* cursor, /*!< in/out: cursor */
2237 ulint level, /*!< in: level to search for
2238 (0=leaf). */
2239 const char* file, /*!< in: file name */
2240 ulint line, /*!< in: line where called */
2241 mtr_t* mtr) /*!< in/out: mini-transaction */
2242 {
2243 page_cur_t* page_cursor;
2244 ulint node_ptr_max_size = UNIV_PAGE_SIZE / 2;
2245 ulint height;
2246 ulint root_height = 0; /* remove warning */
2247 rec_t* node_ptr;
2248 ulint estimate;
2249 ulint savepoint;
2250 ulint upper_rw_latch, root_leaf_rw_latch;
2251 btr_intention_t lock_intention;
2252 buf_block_t* tree_blocks[BTR_MAX_LEVELS];
2253 ulint tree_savepoints[BTR_MAX_LEVELS];
2254 ulint n_blocks = 0;
2255 ulint n_releases = 0;
2256 mem_heap_t* heap = NULL;
2257 ulint offsets_[REC_OFFS_NORMAL_SIZE];
2258 ulint* offsets = offsets_;
2259 dberr_t err = DB_SUCCESS;
2260 rec_offs_init(offsets_);
2261
2262 estimate = latch_mode & BTR_ESTIMATE;
2263 latch_mode &= ~BTR_ESTIMATE;
2264
2265 ut_ad(level != ULINT_UNDEFINED);
2266
2267 bool s_latch_by_caller;
2268
2269 s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;
2270 latch_mode &= ~BTR_ALREADY_S_LATCHED;
2271
2272 lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
2273
2274 ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL));
2275
2276 /* This function doesn't need to lock the left page of the leaf page */
2277 if (latch_mode == BTR_SEARCH_PREV) {
2278 latch_mode = BTR_SEARCH_LEAF;
2279 } else if (latch_mode == BTR_MODIFY_PREV) {
2280 latch_mode = BTR_MODIFY_LEAF;
2281 }
2282
2283 /* Store the position of the tree latch we push to mtr so that we
2284 know how to release it when we have latched the leaf node */
2285
2286 savepoint = mtr_set_savepoint(mtr);
2287
2288 switch (latch_mode) {
2289 case BTR_CONT_MODIFY_TREE:
2290 case BTR_CONT_SEARCH_TREE:
2291 upper_rw_latch = RW_NO_LATCH;
2292 break;
2293 case BTR_MODIFY_TREE:
2294 /* Most delete-intended operations are purges.
2295 Free blocks and read IO bandwidth should be prioritized
2296 for them when the history list is growing huge. */
2297 if (lock_intention == BTR_INTENTION_DELETE
2298 && trx_sys->rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
2299 && buf_get_n_pending_read_ios()) {
2300 mtr_x_lock(dict_index_get_lock(index), mtr);
2301 } else {
2302 mtr_sx_lock(dict_index_get_lock(index), mtr);
2303 }
2304 upper_rw_latch = RW_X_LATCH;
2305 break;
2306 default:
2307 ut_ad(!s_latch_by_caller
2308 || mtr_memo_contains_flagged(mtr,
2309 dict_index_get_lock(index),
2310 MTR_MEMO_SX_LOCK
2311 | MTR_MEMO_S_LOCK));
2312 if (!srv_read_only_mode) {
2313 if (!s_latch_by_caller) {
2314 /* BTR_SEARCH_TREE is intended to be used with
2315 BTR_ALREADY_S_LATCHED */
2316 ut_ad(latch_mode != BTR_SEARCH_TREE);
2317
2318 mtr_s_lock(dict_index_get_lock(index), mtr);
2319 }
2320 upper_rw_latch = RW_S_LATCH;
2321 } else {
2322 upper_rw_latch = RW_NO_LATCH;
2323 }
2324 }
2325 root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);
2326
2327 page_cursor = btr_cur_get_page_cur(cursor);
2328 cursor->index = index;
2329
2330 page_id_t page_id(dict_index_get_space(index),
2331 dict_index_get_page(index));
2332 const page_size_t& page_size = dict_table_page_size(index->table);
2333
2334 if (root_leaf_rw_latch == RW_X_LATCH) {
2335 node_ptr_max_size = dict_index_node_ptr_max_size(index);
2336 }
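/* Only the btr_cur_will_modify_tree() check below (reached on the
BTR_MODIFY_TREE path) consults node_ptr_max_size, so the precise
dict_index_node_ptr_max_size() bound is computed just for the latch
modes that x-latch the root leaf; otherwise the rough
UNIV_PAGE_SIZE / 2 default above is sufficient. */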
2337
2338 height = ULINT_UNDEFINED;
2339
2340 for (;;) {
2341 buf_block_t* block;
2342 page_t* page;
2343 ulint rw_latch;
2344
2345 ut_ad(n_blocks < BTR_MAX_LEVELS);
2346
2347 if (height != 0
2348 && (latch_mode != BTR_MODIFY_TREE
2349 || height == level)) {
2350 rw_latch = upper_rw_latch;
2351 } else {
2352 rw_latch = RW_NO_LATCH;
2353 }
2354
2355 tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
2356 block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
2357 BUF_GET, file, line, mtr, false, &err);
2358 tree_blocks[n_blocks] = block;
2359
2360 if (err != DB_SUCCESS) {
2361 if (err == DB_DECRYPTION_FAILED) {
2362 ib::warn() << "Table is encrypted but encryption service or"
2363 " used key_id is not available."
2364 " Can't continue reading table.";
2365 page_cursor->block = 0;
2366 page_cursor->rec = 0;
2367 if (estimate) {
2368
2369 cursor->path_arr->nth_rec = ULINT_UNDEFINED;
2370 }
2371
2372 index->table->set_file_unreadable();
2373 }
2374 goto exit_loop;
2375 }
2376
2377 page = buf_block_get_frame(block);
2378
2379 SRV_CORRUPT_TABLE_CHECK(page,
2380 {
2381 page_cursor->block = 0;
2382 page_cursor->rec = 0;
2383
2384 if (estimate) {
2385
2386 cursor->path_arr->nth_rec = ULINT_UNDEFINED;
2387 }
2388 /* Can't use break with the macro */
2389 goto exit_loop;
2390 });
2391
2392 if (height == ULINT_UNDEFINED
2393 && btr_page_get_level(page, mtr) == 0
2394 && rw_latch != RW_NO_LATCH
2395 && rw_latch != root_leaf_rw_latch) {
2396 /* We should retry getting the page, because the root page
2397 was latched with a different latch mode than a leaf page requires. */
2398 ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
2399 ut_ad(rw_latch == RW_S_LATCH);
2400
2401 ut_ad(n_blocks == 0);
2402 mtr_release_block_at_savepoint(
2403 mtr, tree_savepoints[n_blocks],
2404 tree_blocks[n_blocks]);
2405
2406 upper_rw_latch = root_leaf_rw_latch;
2407 continue;
2408 }
2409
2410 ut_ad(fil_page_index_page_check(page));
2411 ut_ad(index->id == btr_page_get_index_id(page));
2412
2413 if (height == ULINT_UNDEFINED) {
2414 /* We are in the root node */
2415
2416 height = btr_page_get_level(page, mtr);
2417 root_height = height;
2418 ut_a(height >= level);
2419 } else {
2420 /* TODO: flag the index corrupted if this fails */
2421 ut_ad(height == btr_page_get_level(page, mtr));
2422 }
2423
2424 if (height == level) {
2425 if (srv_read_only_mode) {
2426 btr_cur_latch_leaves(
2427 block, page_id, page_size,
2428 latch_mode, cursor, mtr);
2429 } else if (height == 0) {
2430 if (rw_latch == RW_NO_LATCH) {
2431 btr_cur_latch_leaves(
2432 block, page_id, page_size,
2433 latch_mode, cursor, mtr);
2434 }
2435 /* In versions <= 3.23.52 we had
2436 forgotten to release the tree latch
2437 here. If in an index scan we had to
2438 scan far to find a record visible to
2439 the current transaction, that could
2440 starve others waiting for the tree
2441 latch. */
2442
2443 switch (latch_mode) {
2444 case BTR_MODIFY_TREE:
2445 case BTR_CONT_MODIFY_TREE:
2446 case BTR_CONT_SEARCH_TREE:
2447 break;
2448 default:
2449 if (!s_latch_by_caller) {
2450 /* Release the tree s-latch */
2451 mtr_release_s_latch_at_savepoint(
2452 mtr, savepoint,
2453 dict_index_get_lock(
2454 index));
2455 }
2456
2457 /* release upper blocks */
2458 for (; n_releases < n_blocks;
2459 n_releases++) {
2460 mtr_release_block_at_savepoint(
2461 mtr,
2462 tree_savepoints[
2463 n_releases],
2464 tree_blocks[
2465 n_releases]);
2466 }
2467 }
2468 } else { /* height != 0 */
2469 /* We already have the block latched. */
2470 ut_ad(latch_mode == BTR_SEARCH_TREE);
2471 ut_ad(s_latch_by_caller);
2472 ut_ad(upper_rw_latch == RW_S_LATCH);
2473
2474 ut_ad(mtr_memo_contains(mtr, block,
2475 upper_rw_latch));
2476
2477 if (s_latch_by_caller) {
2478 /* operations that modify the tree
2479 must sx-latch the index, so they are excluded. */
2480 ut_ad(mtr_memo_contains(
2481 mtr,
2482 dict_index_get_lock(index),
2483 MTR_MEMO_SX_LOCK));
2484 /* because we hold the sx-latch of the index,
2485 we can release the upper blocks. */
2486 for (; n_releases < n_blocks;
2487 n_releases++) {
2488 mtr_release_block_at_savepoint(
2489 mtr,
2490 tree_savepoints[
2491 n_releases],
2492 tree_blocks[
2493 n_releases]);
2494 }
2495 }
2496 }
2497 }
2498
2499 if (from_left) {
2500 page_cur_set_before_first(block, page_cursor);
2501 } else {
2502 page_cur_set_after_last(block, page_cursor);
2503 }
2504
2505 if (height == level) {
2506 if (estimate) {
2507 btr_cur_add_path_info(cursor, height,
2508 root_height);
2509 }
2510
2511 break;
2512 }
2513
2514 ut_ad(height > 0);
2515
2516 if (from_left) {
2517 page_cur_move_to_next(page_cursor);
2518 } else {
2519 page_cur_move_to_prev(page_cursor);
2520 }
2521
2522 if (estimate) {
2523 btr_cur_add_path_info(cursor, height, root_height);
2524 }
2525
2526 height--;
2527
2528 node_ptr = page_cur_get_rec(page_cursor);
2529 offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
2530 ULINT_UNDEFINED, &heap);
2531
2532 /* If the rec is the first or last in the page, a
2533 pessimistic delete intention might cause a node_ptr
2534 insert at the upper level. We should change the
2535 intention and retry. */
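/* Illustration only: deleting the last user record of a page can
trigger a merge with a sibling, which removes one node pointer at the
level above and may insert another; BTR_INTENTION_BOTH makes the
retried descent keep the upper-level x-latches such an insert would
need. */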
2536 if (latch_mode == BTR_MODIFY_TREE
2537 && btr_cur_need_opposite_intention(
2538 page, lock_intention, node_ptr)) {
2539
2540 ut_ad(upper_rw_latch == RW_X_LATCH);
2541 /* release all blocks */
2542 for (; n_releases <= n_blocks; n_releases++) {
2543 mtr_release_block_at_savepoint(
2544 mtr, tree_savepoints[n_releases],
2545 tree_blocks[n_releases]);
2546 }
2547
2548 lock_intention = BTR_INTENTION_BOTH;
2549
2550 page_id.set_page_no(dict_index_get_page(index));
2551
2552 height = ULINT_UNDEFINED;
2553
2554 n_blocks = 0;
2555 n_releases = 0;
2556
2557 continue;
2558 }
2559
2560 if (latch_mode == BTR_MODIFY_TREE
2561 && !btr_cur_will_modify_tree(
2562 cursor->index, page, lock_intention, node_ptr,
2563 node_ptr_max_size, page_size, mtr)) {
2564 ut_ad(upper_rw_latch == RW_X_LATCH);
2565 ut_ad(n_releases <= n_blocks);
2566
2567 /* we can release upper blocks */
2568 for (; n_releases < n_blocks; n_releases++) {
2569 if (n_releases == 0) {
2570 /* we must not release the root page,
2571 so that we stay pinned to the same block. */
2572 continue;
2573 }
2574
2575 /* release unused blocks to unpin */
2576 mtr_release_block_at_savepoint(
2577 mtr, tree_savepoints[n_releases],
2578 tree_blocks[n_releases]);
2579 }
2580 }
2581
2582 if (height == level
2583 && latch_mode == BTR_MODIFY_TREE) {
2584 ut_ad(upper_rw_latch == RW_X_LATCH);
2585 /* we should sx-latch the root page, if it was released
2586 already. It contains the seg_header. */
2587 if (n_releases > 0) {
2588 mtr_block_sx_latch_at_savepoint(
2589 mtr, tree_savepoints[0],
2590 tree_blocks[0]);
2591 }
2592
2593 /* x-latch the branch blocks not released yet. */
2594 for (ulint i = n_releases; i <= n_blocks; i++) {
2595 mtr_block_x_latch_at_savepoint(
2596 mtr, tree_savepoints[i],
2597 tree_blocks[i]);
2598 }
2599 }
2600
2601 /* Go to the child node */
2602 page_id.set_page_no(
2603 btr_node_ptr_get_child_page_no(node_ptr, offsets));
2604
2605 n_blocks++;
2606 }
2607
2608 exit_loop:
2609 if (heap) {
2610 mem_heap_free(heap);
2611 }
2612
2613 return err;
2614 }
2615
2616 /** Opens a cursor at either end of an index.
2617 Avoids taking latches on buffer blocks; just pins them (by incrementing
2618 fix_count) to keep them in the buffer pool. This mode is used for
2619 intrinsic tables, as they are not shared, so there is no need for latching.
2620 @param[in] from_left true if open to low end, false if open
2621 to high end.
2622 @param[in] index index
2623 @param[in,out] cursor cursor
2624 @param[in] file file name
2625 @param[in] line line where called
2626 @param[in,out] mtr mini-transaction
2627 */
2628 void
2629 btr_cur_open_at_index_side_with_no_latch_func(
2630 bool from_left,
2631 dict_index_t* index,
2632 btr_cur_t* cursor,
2633 ulint level,
2634 const char* file,
2635 ulint line,
2636 mtr_t* mtr)
2637 {
2638 page_cur_t* page_cursor;
2639 ulint height;
2640 rec_t* node_ptr;
2641 ulint n_blocks = 0;
2642 mem_heap_t* heap = NULL;
2643 ulint offsets_[REC_OFFS_NORMAL_SIZE];
2644 ulint* offsets = offsets_;
2645 rec_offs_init(offsets_);
2646
2647 ut_ad(level != ULINT_UNDEFINED);
2648
2649 page_cursor = btr_cur_get_page_cur(cursor);
2650 cursor->index = index;
2651 page_id_t page_id(dict_index_get_space(index),
2652 dict_index_get_page(index));
2653 const page_size_t& page_size = dict_table_page_size(index->table);
2654
2655 height = ULINT_UNDEFINED;
2656
2657 for (;;) {
2658 buf_block_t* block;
2659 page_t* page;
2660 ulint rw_latch = RW_NO_LATCH;
2661
2662 ut_ad(n_blocks < BTR_MAX_LEVELS);
2663
2664 block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
2665 BUF_GET, file, line, mtr);
2666
2667 page = buf_block_get_frame(block);
2668
2669 ut_ad(fil_page_index_page_check(page));
2670 ut_ad(index->id == btr_page_get_index_id(page));
2671
2672 if (height == ULINT_UNDEFINED) {
2673 /* We are in the root node */
2674
2675 height = btr_page_get_level(page, mtr);
2676 ut_a(height >= level);
2677 } else {
2678 /* TODO: flag the index corrupted if this fails */
2679 ut_ad(height == btr_page_get_level(page, mtr));
2680 }
2681
2682 if (from_left) {
2683 page_cur_set_before_first(block, page_cursor);
2684 } else {
2685 page_cur_set_after_last(block, page_cursor);
2686 }
2687
2688 if (height == level) {
2689 break;
2690 }
2691
2692 ut_ad(height > 0);
2693
2694 if (from_left) {
2695 page_cur_move_to_next(page_cursor);
2696 } else {
2697 page_cur_move_to_prev(page_cursor);
2698 }
2699
2700 height--;
2701
2702 node_ptr = page_cur_get_rec(page_cursor);
2703 offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
2704 ULINT_UNDEFINED, &heap);
2705
2706 /* Go to the child node */
2707 page_id.set_page_no(
2708 btr_node_ptr_get_child_page_no(node_ptr, offsets));
2709
2710 n_blocks++;
2711 }
2712
2713 if (heap != NULL) {
2714 mem_heap_free(heap);
2715 }
2716 }
2717
2718 /**********************************************************************//**
2719 Positions a cursor at a randomly chosen position within a B-tree.
2720 @return true if the index is available and we have put the cursor, false
2721 if the index is unavailable */
2722 bool
2723 btr_cur_open_at_rnd_pos_func(
2724 /*=========================*/
2725 dict_index_t* index, /*!< in: index */
2726 ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
2727 btr_cur_t* cursor, /*!< in/out: B-tree cursor */
2728 const char* file, /*!< in: file name */
2729 ulint line, /*!< in: line where called */
2730 mtr_t* mtr) /*!< in: mtr */
2731 {
2732 page_cur_t* page_cursor;
2733 ulint node_ptr_max_size = UNIV_PAGE_SIZE / 2;
2734 ulint height;
2735 rec_t* node_ptr;
2736 ulint savepoint;
2737 ulint upper_rw_latch, root_leaf_rw_latch;
2738 btr_intention_t lock_intention;
2739 buf_block_t* tree_blocks[BTR_MAX_LEVELS];
2740 ulint tree_savepoints[BTR_MAX_LEVELS];
2741 ulint n_blocks = 0;
2742 ulint n_releases = 0;
2743 mem_heap_t* heap = NULL;
2744 ulint offsets_[REC_OFFS_NORMAL_SIZE];
2745 ulint* offsets = offsets_;
2746 rec_offs_init(offsets_);
2747
2748 ut_ad(!dict_index_is_spatial(index));
2749
2750 lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
2751
2752 ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL));
2753
2754 savepoint = mtr_set_savepoint(mtr);
2755
2756 switch (latch_mode) {
2757 case BTR_MODIFY_TREE:
2758 /* Most delete-intended operations are purges.
2759 Free blocks and read IO bandwidth should be prioritized
2760 for them when the history list is growing huge. */
2761 if (lock_intention == BTR_INTENTION_DELETE
2762 && trx_sys->rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
2763 && buf_get_n_pending_read_ios()) {
2764 mtr_x_lock(dict_index_get_lock(index), mtr);
2765 } else {
2766 mtr_sx_lock(dict_index_get_lock(index), mtr);
2767 }
2768 upper_rw_latch = RW_X_LATCH;
2769 break;
2770 case BTR_SEARCH_PREV:
2771 case BTR_MODIFY_PREV:
2772 /* This function doesn't support taking the
2773 left uncle page latch that a left leaf
2774 page latch would require. */
2775 case BTR_SEARCH_TREE:
2776 case BTR_CONT_MODIFY_TREE:
2777 case BTR_CONT_SEARCH_TREE:
2778 ut_ad(0);
2779 /* fall through */
2780 default:
2781 if (!srv_read_only_mode) {
2782 mtr_s_lock(dict_index_get_lock(index), mtr);
2783 upper_rw_latch = RW_S_LATCH;
2784 } else {
2785 upper_rw_latch = RW_NO_LATCH;
2786 }
2787 }
2788
2789 DBUG_EXECUTE_IF("test_index_is_unavailable",
2790 return(false););
2791
2792 if (index->page == FIL_NULL) {
2793 /* Since we didn't hold the index lock until just now, the
2794 index could have been modified by others; for example, if this
2795 is a statistics updater for a referenced table, the index could
2796 have been marked unavailable by 'DROP TABLE' in the meantime,
2797 since we hold no lock for the statistics updater */
2798 return(false);
2799 }
2800
2801 root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);
2802
2803 page_cursor = btr_cur_get_page_cur(cursor);
2804 cursor->index = index;
2805
2806 page_id_t page_id(dict_index_get_space(index),
2807 dict_index_get_page(index));
2808 const page_size_t& page_size = dict_table_page_size(index->table);
2809 dberr_t err = DB_SUCCESS;
2810
2811 if (root_leaf_rw_latch == RW_X_LATCH) {
2812 node_ptr_max_size = dict_index_node_ptr_max_size(index);
2813 }
2814
2815 height = ULINT_UNDEFINED;
2816
2817 for (;;) {
2818 buf_block_t* block;
2819 page_t* page;
2820 ulint rw_latch;
2821
2822 ut_ad(n_blocks < BTR_MAX_LEVELS);
2823
2824 if (height != 0
2825 && latch_mode != BTR_MODIFY_TREE) {
2826 rw_latch = upper_rw_latch;
2827 } else {
2828 rw_latch = RW_NO_LATCH;
2829 }
2830
2831 tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
2832 block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
2833 BUF_GET, file, line, mtr, false, &err);
2834 tree_blocks[n_blocks] = block;
2835
2836 ut_ad((block != NULL) == (err == DB_SUCCESS));
2837
2838 if (err != DB_SUCCESS) {
2839 if (err == DB_DECRYPTION_FAILED) {
2840 ib::warn() << "Table is encrypted but encryption service or"
2841 " used key_id is not available."
2842 " Can't continue reading table.";
2843 page_cursor->block = 0;
2844 page_cursor->rec = 0;
2845 index->table->set_file_unreadable();
2846 }
2847
2848 goto exit_loop;
2849 }
2850
2851
2852 page = buf_block_get_frame(block);
2853
2854 SRV_CORRUPT_TABLE_CHECK(page,
2855 {
2856 page_cursor->block = 0;
2857 page_cursor->rec = 0;
2858
2859 goto exit_loop;
2860 });
2861
2862 if (height == ULINT_UNDEFINED
2863 && btr_page_get_level(page, mtr) == 0
2864 && rw_latch != RW_NO_LATCH
2865 && rw_latch != root_leaf_rw_latch) {
2866 /* We should retry getting the page, because the root page
2867 was latched with a different latch mode than a leaf page requires. */
2868 ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
2869 ut_ad(rw_latch == RW_S_LATCH);
2870
2871 ut_ad(n_blocks == 0);
2872 mtr_release_block_at_savepoint(
2873 mtr, tree_savepoints[n_blocks],
2874 tree_blocks[n_blocks]);
2875
2876 upper_rw_latch = root_leaf_rw_latch;
2877 continue;
2878 }
2879
2880 ut_ad(fil_page_index_page_check(page));
2881 ut_ad(index->id == btr_page_get_index_id(page));
2882
2883 if (height == ULINT_UNDEFINED) {
2884 /* We are in the root node */
2885
2886 height = btr_page_get_level(page, mtr);
2887 }
2888
2889 if (height == 0) {
2890 if (rw_latch == RW_NO_LATCH
2891 || srv_read_only_mode) {
2892 btr_cur_latch_leaves(
2893 block, page_id, page_size,
2894 latch_mode, cursor, mtr);
2895 }
2896
2897 /* btr_cur_open_at_index_side_func() and
2898 btr_cur_search_to_nth_level() release the
2899 tree s-latch here. */
2900 switch (latch_mode) {
2901 case BTR_MODIFY_TREE:
2902 case BTR_CONT_MODIFY_TREE:
2903 case BTR_CONT_SEARCH_TREE:
2904 break;
2905 default:
2906 /* Release the tree s-latch */
2907 if (!srv_read_only_mode) {
2908 mtr_release_s_latch_at_savepoint(
2909 mtr, savepoint,
2910 dict_index_get_lock(index));
2911 }
2912
2913 /* release upper blocks */
2914 for (; n_releases < n_blocks; n_releases++) {
2915 mtr_release_block_at_savepoint(
2916 mtr,
2917 tree_savepoints[n_releases],
2918 tree_blocks[n_releases]);
2919 }
2920 }
2921 }
2922
2923 page_cur_open_on_rnd_user_rec(block, page_cursor);
2924
2925 if (height == 0) {
2926
2927 break;
2928 }
2929
2930 ut_ad(height > 0);
2931
2932 height--;
2933
2934 node_ptr = page_cur_get_rec(page_cursor);
2935 offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
2936 ULINT_UNDEFINED, &heap);
2937
2938 /* If the rec is the first or last in the page, a
2939 pessimistic delete intention might cause a node_ptr
2940 insert at the upper level. We should change the
2941 intention and retry. */
2942 if (latch_mode == BTR_MODIFY_TREE
2943 && btr_cur_need_opposite_intention(
2944 page, lock_intention, node_ptr)) {
2945
2946 ut_ad(upper_rw_latch == RW_X_LATCH);
2947 /* release all blocks */
2948 for (; n_releases <= n_blocks; n_releases++) {
2949 mtr_release_block_at_savepoint(
2950 mtr, tree_savepoints[n_releases],
2951 tree_blocks[n_releases]);
2952 }
2953
2954 lock_intention = BTR_INTENTION_BOTH;
2955
2956 page_id.set_page_no(dict_index_get_page(index));
2957
2958 height = ULINT_UNDEFINED;
2959
2960 n_blocks = 0;
2961 n_releases = 0;
2962
2963 continue;
2964 }
2965
2966 if (latch_mode == BTR_MODIFY_TREE
2967 && !btr_cur_will_modify_tree(
2968 cursor->index, page, lock_intention, node_ptr,
2969 node_ptr_max_size, page_size, mtr)) {
2970 ut_ad(upper_rw_latch == RW_X_LATCH);
2971 ut_ad(n_releases <= n_blocks);
2972
2973 /* we can release upper blocks */
2974 for (; n_releases < n_blocks; n_releases++) {
2975 if (n_releases == 0) {
2976 /* we must not release the root page,
2977 so that we stay pinned to the same block. */
2978 continue;
2979 }
2980
2981 /* release unused blocks to unpin */
2982 mtr_release_block_at_savepoint(
2983 mtr, tree_savepoints[n_releases],
2984 tree_blocks[n_releases]);
2985 }
2986 }
2987
2988 if (height == 0
2989 && latch_mode == BTR_MODIFY_TREE) {
2990 ut_ad(upper_rw_latch == RW_X_LATCH);
2991 /* we should sx-latch the root page, if it was released
2992 already. It contains the seg_header. */
2993 if (n_releases > 0) {
2994 mtr_block_sx_latch_at_savepoint(
2995 mtr, tree_savepoints[0],
2996 tree_blocks[0]);
2997 }
2998
2999 /* x-latch the branch blocks not released yet. */
3000 for (ulint i = n_releases; i <= n_blocks; i++) {
3001 mtr_block_x_latch_at_savepoint(
3002 mtr, tree_savepoints[i],
3003 tree_blocks[i]);
3004 }
3005 }
3006
3007 /* Go to the child node */
3008 page_id.set_page_no(
3009 btr_node_ptr_get_child_page_no(node_ptr, offsets));
3010
3011 n_blocks++;
3012 }
3013
3014 exit_loop:
3015 if (UNIV_LIKELY_NULL(heap)) {
3016 mem_heap_free(heap);
3017 }
3018
3019 return(true);
3020 }
3021
3022 /*==================== B-TREE INSERT =========================*/
3023
3024 /*************************************************************//**
3025 Inserts a record if there is enough space, or if enough space can
3026 be freed by reorganizing. Differs from btr_cur_optimistic_insert because
3027 no heuristic is applied as to whether it pays to use CPU time for
3028 reorganizing the page or not.
3029
3030 IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
3031 if this is a compressed leaf page in a secondary index.
3032 This has to be done either within the same mini-transaction,
3033 or by invoking ibuf_reset_free_bits() before mtr_commit().
3034
3035 @return pointer to inserted record if succeed, else NULL */
3036 static MY_ATTRIBUTE((nonnull, warn_unused_result))
3037 rec_t*
3038 btr_cur_insert_if_possible(
3039 /*=======================*/
3040 btr_cur_t* cursor, /*!< in: cursor on page after which to insert;
3041 cursor stays valid */
3042 const dtuple_t* tuple, /*!< in: tuple to insert; the size info need not
3043 have been stored to tuple */
3044 ulint** offsets,/*!< out: offsets on *rec */
3045 mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
3046 ulint n_ext, /*!< in: number of externally stored columns */
3047 mtr_t* mtr) /*!< in/out: mini-transaction */
3048 {
3049 page_cur_t* page_cursor;
3050 rec_t* rec;
3051
3052 ut_ad(dtuple_check_typed(tuple));
3053
3054 ut_ad(mtr_is_block_fix(
3055 mtr, btr_cur_get_block(cursor),
3056 MTR_MEMO_PAGE_X_FIX, cursor->index->table));
3057 page_cursor = btr_cur_get_page_cur(cursor);
3058
3059 /* Now, try the insert */
3060 rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
3061 offsets, heap, n_ext, mtr);
3062
3063 /* If the record did not fit, reorganize.
3064 For compressed pages, page_cur_tuple_insert()
3065 attempted this already. */
3066 if (!rec && !page_cur_get_page_zip(page_cursor)
3067 && btr_page_reorganize(page_cursor, cursor->index, mtr)) {
3068 rec = page_cur_tuple_insert(
3069 page_cursor, tuple, cursor->index,
3070 offsets, heap, n_ext, mtr);
3071 }
3072
3073 ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
3074 return(rec);
3075 }
3076
3077 /*************************************************************//**
3078 For an insert, checks the locks and does the undo logging if desired.
3079 @return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
3080 UNIV_INLINE MY_ATTRIBUTE((warn_unused_result, nonnull(2,3,5,6)))
3081 dberr_t
3082 btr_cur_ins_lock_and_undo(
3083 /*======================*/
3084 ulint flags, /*!< in: undo logging and locking flags: if
3085 not zero, the parameters index and thr
3086 should be specified */
3087 btr_cur_t* cursor, /*!< in: cursor on page after which to insert */
3088 dtuple_t* entry, /*!< in/out: entry to insert */
3089 que_thr_t* thr, /*!< in: query thread or NULL */
3090 mtr_t* mtr, /*!< in/out: mini-transaction */
3091 ibool* inherit)/*!< out: TRUE if the inserted new record maybe
3092 should inherit LOCK_GAP type locks from the
3093 successor record */
3094 {
3095 dict_index_t* index;
3096 dberr_t err = DB_SUCCESS;
3097 rec_t* rec;
3098 roll_ptr_t roll_ptr;
3099
3100 /* Check if we have to wait for a lock: enqueue an explicit lock
3101 request if yes */
3102
3103 rec = btr_cur_get_rec(cursor);
3104 index = cursor->index;
3105
3106 ut_ad(!dict_index_is_online_ddl(index)
3107 || dict_index_is_clust(index)
3108 || (flags & BTR_CREATE_FLAG));
3109 ut_ad(mtr->is_named_space(index->space));
3110
3111 /* Check if there is a predicate or GAP lock preventing the insertion */
3112 if (!(flags & BTR_NO_LOCKING_FLAG)) {
3113 if (dict_index_is_spatial(index)) {
3114 lock_prdt_t prdt;
3115 rtr_mbr_t mbr;
3116
3117 rtr_get_mbr_from_tuple(entry, &mbr);
3118
3119 /* Use an on-stack MBR variable to test if a lock is
3120 needed. If so, the predicate (MBR) will be allocated
3121 from the lock heap in lock_prdt_insert_check_and_lock() */
3122 lock_init_prdt_from_mbr(
3123 &prdt, &mbr, 0, NULL);
3124
3125 err = lock_prdt_insert_check_and_lock(
3126 flags, rec, btr_cur_get_block(cursor),
3127 index, thr, mtr, &prdt);
3128 *inherit = false;
3129 } else {
3130 err = lock_rec_insert_check_and_lock(
3131 flags, rec, btr_cur_get_block(cursor),
3132 index, thr, mtr, inherit);
3133 }
3134 }
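/* Note: in the spatial branch above, a predicate (MBR) lock is taken
instead of a next-key gap lock, and *inherit is cleared because there
is no gap lock for the new record to inherit. */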
3135
3136 if (err != DB_SUCCESS
3137 || !dict_index_is_clust(index) || dict_index_is_ibuf(index)) {
3138
3139 return(err);
3140 }
3141
3142 err = trx_undo_report_row_operation(flags, TRX_UNDO_INSERT_OP,
3143 thr, index, entry,
3144 NULL, 0, NULL, NULL,
3145 &roll_ptr);
3146 if (err != DB_SUCCESS) {
3147
3148 return(err);
3149 }
3150
3151 /* Now we can fill in the roll ptr field in entry
3152 (except if table is intrinsic) */
3153
3154 if (!(flags & BTR_KEEP_SYS_FLAG)
3155 && !dict_table_is_intrinsic(index->table)) {
3156
3157 row_upd_index_entry_sys_field(entry, index,
3158 DATA_ROLL_PTR, roll_ptr);
3159 }
3160
3161 return(DB_SUCCESS);
3162 }
3163
3164 /**
3165 Prefetches the siblings of a leaf page for a pessimistic operation.
3166 @param block leaf page */
3167 static
3168 void
3169 btr_cur_prefetch_siblings(
3170 buf_block_t* block)
3171 {
3172 page_t* page = buf_block_get_frame(block);
3173
3174 ut_ad(page_is_leaf(page));
3175
3176 ulint left_page_no = fil_page_get_prev(page);
3177 ulint right_page_no = fil_page_get_next(page);
3178
3179 if (left_page_no != FIL_NULL) {
3180 buf_read_page_background(
3181 page_id_t(block->page.id.space(), left_page_no),
3182 block->page.size, false);
3183 }
3184 if (right_page_no != FIL_NULL) {
3185 buf_read_page_background(
3186 page_id_t(block->page.id.space(), right_page_no),
3187 block->page.size, false);
3188 }
3189 if (left_page_no != FIL_NULL
3190 || right_page_no != FIL_NULL) {
3191 os_aio_simulated_wake_handler_threads();
3192 }
3193 }
3194
3195 /*************************************************************//**
3196 Tries to perform an insert into a page in an index tree, next to the cursor.
3197 It is assumed that mtr holds an x-latch on the page. The operation does
3198 not succeed if there is too little space on the page. If there is just
3199 one record on the page, the insert will always succeed; this is to
3200 prevent trying to split a page with just one record.
3201 @return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
3202 dberr_t
3203 btr_cur_optimistic_insert(
3204 /*======================*/
3205 ulint flags, /*!< in: undo logging and locking flags: if not
3206 zero, the parameters index and thr should be
3207 specified */
3208 btr_cur_t* cursor, /*!< in: cursor on page after which to insert;
3209 cursor stays valid */
3210 ulint** offsets,/*!< out: offsets on *rec */
3211 mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
3212 dtuple_t* entry, /*!< in/out: entry to insert */
3213 rec_t** rec, /*!< out: pointer to inserted record if
3214 succeed */
3215 big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
3216 be stored externally by the caller, or
3217 NULL */
3218 ulint n_ext, /*!< in: number of externally stored columns */
3219 que_thr_t* thr, /*!< in: query thread or NULL */
3220 mtr_t* mtr) /*!< in/out: mini-transaction;
3221 if this function returns DB_SUCCESS on
3222 a leaf page of a secondary index in a
3223 compressed tablespace, the caller must
3224 mtr_commit(mtr) before latching
3225 any further pages */
3226 {
3227 big_rec_t* big_rec_vec = NULL;
3228 dict_index_t* index;
3229 page_cur_t* page_cursor;
3230 buf_block_t* block;
3231 page_t* page;
3232 rec_t* dummy;
3233 ibool leaf;
3234 ibool reorg;
3235 ibool inherit = TRUE;
3236 ulint rec_size;
3237 dberr_t err;
3238
3239 *big_rec = NULL;
3240
3241 block = btr_cur_get_block(cursor);
3242
3243 SRV_CORRUPT_TABLE_CHECK(block, return(DB_CORRUPTION););
3244
3245 page = buf_block_get_frame(block);
3246 index = cursor->index;
3247
3248 /* Blocks are not latched for insert if the table is intrinsic
3249 and the index is an auto-generated clustered index. */
3250 ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
3251 ut_ad(!dict_index_is_online_ddl(index)
3252 || dict_index_is_clust(index)
3253 || (flags & BTR_CREATE_FLAG));
3254 ut_ad(dtuple_check_typed(entry));
3255
3256 const page_size_t& page_size = block->page.size;
3257
3258 #ifdef UNIV_DEBUG_VALGRIND
3259 if (page_size.is_compressed()) {
3260 UNIV_MEM_ASSERT_RW(page, page_size.logical());
3261 UNIV_MEM_ASSERT_RW(block->page.zip.data, page_size.physical());
3262 }
3263 #endif /* UNIV_DEBUG_VALGRIND */
3264
3265 leaf = page_is_leaf(page);
3266
3267 /* Calculate the record size when entry is converted to a record */
3268 rec_size = rec_get_converted_size(index, entry, n_ext);
3269
3270 if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
3271 dtuple_get_n_fields(entry), page_size)) {
3272
3273 /* The record is so big that we have to store some fields
3274 externally on separate database pages */
3275 big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);
3276
3277 if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
3278
3279 return(DB_TOO_BIG_RECORD);
3280 }
3281
3282 rec_size = rec_get_converted_size(index, entry, n_ext);
3283 }
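/* At this point, if big_rec_vec != NULL, the longest fields of entry
have been replaced by external references (the field data itself will
be written to separate BLOB pages by the caller), and rec_size has
been recomputed for the trimmed record. */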
3284
3285 if (page_size.is_compressed() && page_zip_is_too_big(index, entry)) {
3286 if (big_rec_vec != NULL) {
3287 dtuple_convert_back_big_rec(index, entry, big_rec_vec);
3288 }
3289
3290 return(DB_TOO_BIG_RECORD);
3291 }
3292
3293 LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page),
3294 goto fail);
3295
3296 if (leaf && page_size.is_compressed()
3297 && (page_get_data_size(page) + rec_size
3298 >= dict_index_zip_pad_optimal_page_size(index))) {
3299 /* If compression padding tells us that the insertion would
3300 result in a too tightly packed page, i.e. one likely to
3301 cause a compression failure, then don't do an optimistic
3302 insertion. */
3303 fail:
3304 err = DB_FAIL;
3305
3306 /* prefetch siblings of the leaf for the pessimistic
3307 operation, if the page is leaf. */
3308 if (page_is_leaf(page)) {
3309 btr_cur_prefetch_siblings(block);
3310 }
3311 fail_err:
3312
3313 if (big_rec_vec) {
3314 dtuple_convert_back_big_rec(index, entry, big_rec_vec);
3315 }
3316
3317 return(err);
3318 }
3319
3320 ulint max_size = page_get_max_insert_size_after_reorganize(page, 1);
3321
3322 if (page_has_garbage(page)) {
3323 if ((max_size < rec_size
3324 || max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT)
3325 && page_get_n_recs(page) > 1
3326 && page_get_max_insert_size(page, 1) < rec_size) {
3327
3328 goto fail;
3329 }
3330 } else if (max_size < rec_size) {
3331 goto fail;
3332 }
3333
3334 /* If there have been many consecutive inserts to the
3335 clustered index leaf page of an uncompressed table, check if
3336 we have to split the page to reserve enough free space for
3337 future updates of records. */
3338
3339 if (leaf && !page_size.is_compressed() && dict_index_is_clust(index)
3340 && page_get_n_recs(page) >= 2
3341 && dict_index_get_space_reserve() + rec_size > max_size
3342 && (btr_page_get_split_rec_to_right(cursor, &dummy)
3343 || btr_page_get_split_rec_to_left(cursor, &dummy))) {
3344 goto fail;
3345 }
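/* The check above is a heuristic: on an uncompressed clustered index
leaf page, if this record plus the space reserve (free space kept for
future updates of records) would not fit even after a reorganize, we
prefer to split now rather than pack the page completely full. */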
3346
3347 page_cursor = btr_cur_get_page_cur(cursor);
3348
3349 DBUG_PRINT("ib_cur", ("insert %s (" IB_ID_FMT ") by " TRX_ID_FMT
3350 ": %s",
3351 index->name(), index->id,
3352 thr != NULL
3353 ? trx_get_id_for_print(thr_get_trx(thr))
3354 : 0,
3355 rec_printer(entry).str().c_str()));
3356
3357 DBUG_EXECUTE_IF("do_page_reorganize",
3358 btr_page_reorganize(page_cursor, index, mtr););
3359
3360 /* Now, try the insert */
3361 {
3362 const rec_t* page_cursor_rec = page_cur_get_rec(page_cursor);
3363
3364 if (dict_table_is_intrinsic(index->table)) {
3365
3366 index->rec_cache.rec_size = rec_size;
3367
3368 *rec = page_cur_tuple_direct_insert(
3369 page_cursor, entry, index, n_ext, mtr);
3370 } else {
3371 /* Check locks and write to the undo log,
3372 if specified */
3373 err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
3374 thr, mtr, &inherit);
3375
3376 if (err != DB_SUCCESS) {
3377 goto fail_err;
3378 }
3379
3380 *rec = page_cur_tuple_insert(
3381 page_cursor, entry, index, offsets, heap,
3382 n_ext, mtr);
3383 }
3384
3385 reorg = page_cursor_rec != page_cur_get_rec(page_cursor);
3386 }
3387
3388 if (*rec) {
3389 } else if (page_size.is_compressed()) {
3390 /* Reset the IBUF_BITMAP_FREE bits, because
3391 page_cur_tuple_insert() will have attempted page
3392 reorganize before failing. */
3393 if (leaf
3394 && !dict_index_is_clust(index)
3395 && !dict_table_is_temporary(index->table)) {
3396 ibuf_reset_free_bits(block);
3397 }
3398
3399 goto fail;
3400 } else {
3401
3402 /* For an intrinsic table, we consistently fall back
3403 to the pessimistic path to reorganize. */
3404 if (dict_table_is_intrinsic(index->table)) {
3405 goto fail;
3406 }
3407
3408 ut_ad(!reorg);
3409
3410 /* If the record did not fit, reorganize */
3411 if (!btr_page_reorganize(page_cursor, index, mtr)) {
3412 ut_ad(0);
3413 goto fail;
3414 }
3415
3416 ut_ad(page_get_max_insert_size(page, 1) == max_size);
3417
3418 reorg = TRUE;
3419
3420 *rec = page_cur_tuple_insert(page_cursor, entry, index,
3421 offsets, heap, n_ext, mtr);
3422
3423 if (UNIV_UNLIKELY(!*rec)) {
3424 ib::fatal() << "Cannot insert tuple " << *entry
3425 << " into index " << index->name
3426 << " of table " << index->table->name
3427 << ". Max size: " << max_size;
3428 }
3429 }
3430
3431 if (!index->disable_ahi) {
3432 if (!reorg && leaf && (cursor->flag == BTR_CUR_HASH)) {
3433 btr_search_update_hash_node_on_insert(cursor);
3434 } else {
3435 btr_search_update_hash_on_insert(cursor);
3436 }
3437 }
3438
3439 if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) {
3440
3441 lock_update_insert(block, *rec);
3442 }
3443
3444 if (leaf
3445 && !dict_index_is_clust(index)
3446 && !dict_table_is_temporary(index->table)) {
3447 /* Update the free bits of the B-tree page in the
3448 insert buffer bitmap. */
3449
3450 /* The free bits in the insert buffer bitmap must
3451 never exceed the free space on a page. It is safe to
3452 decrement or reset the bits in the bitmap in a
3453 mini-transaction that is committed before the
3454 mini-transaction that affects the free space. */
3455
3456 /* It is unsafe to increment the bits in a separately
3457 committed mini-transaction, because in crash recovery,
3458 the free bits could momentarily be set too high. */
3459
3460 if (page_size.is_compressed()) {
3461 /* Update the bits in the same mini-transaction. */
3462 ibuf_update_free_bits_zip(block, mtr);
3463 } else {
3464 /* Decrement the bits in a separate
3465 mini-transaction. */
3466 ibuf_update_free_bits_if_full(
3467 block, max_size,
3468 rec_size + PAGE_DIR_SLOT_SIZE);
3469 }
3470 }
3471
3472 *big_rec = big_rec_vec;
3473
3474 return(DB_SUCCESS);
3475 }
3476
3477 /*************************************************************//**
3478 Performs an insert on a page of an index tree. It is assumed that mtr
3479 holds an x-latch on the tree and on the cursor page. If the insert is
3480 made on the leaf level, to avoid deadlocks, mtr must also own x-latches
3481 to brothers of page, if those brothers exist.
3482 @return DB_SUCCESS or error number */
3483 dberr_t
3484 btr_cur_pessimistic_insert(
3485 /*=======================*/
3486 ulint flags, /*!< in: undo logging and locking flags: if not
3487 zero, the parameter thr should be
3488 specified; if no undo logging is specified,
3489 then the caller must have reserved enough
3490 free extents in the file space so that the
3491 insertion will certainly succeed */
3492 btr_cur_t* cursor, /*!< in: cursor after which to insert;
3493 cursor stays valid */
3494 ulint** offsets,/*!< out: offsets on *rec */
3495 mem_heap_t** heap, /*!< in/out: pointer to memory heap
3496 that can be emptied, or NULL */
3497 dtuple_t* entry, /*!< in/out: entry to insert */
3498 rec_t** rec, /*!< out: pointer to inserted record if
3499 succeed */
3500 big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
3501 be stored externally by the caller, or
3502 NULL */
3503 ulint n_ext, /*!< in: number of externally stored columns */
3504 que_thr_t* thr, /*!< in: query thread or NULL */
3505 mtr_t* mtr) /*!< in/out: mini-transaction */
3506 {
3507 dict_index_t* index = cursor->index;
3508 big_rec_t* big_rec_vec = NULL;
3509 dberr_t err;
3510 ibool inherit = FALSE;
3511 bool success;
3512 ulint n_reserved = 0;
3513
3514 ut_ad(dtuple_check_typed(entry));
3515
3516 *big_rec = NULL;
3517
3518 ut_ad(mtr_memo_contains_flagged(
3519 mtr, dict_index_get_lock(btr_cur_get_index(cursor)),
3520 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)
3521 || dict_table_is_intrinsic(cursor->index->table));
3522 ut_ad(mtr_is_block_fix(
3523 mtr, btr_cur_get_block(cursor),
3524 MTR_MEMO_PAGE_X_FIX, cursor->index->table));
3525 ut_ad(!dict_index_is_online_ddl(index)
3526 || dict_index_is_clust(index)
3527 || (flags & BTR_CREATE_FLAG));
3528
3529 cursor->flag = BTR_CUR_BINARY;
3530
3531 /* Check locks and write to undo log, if specified */
3532
3533 err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
3534 thr, mtr, &inherit);
3535
3536 if (err != DB_SUCCESS) {
3537
3538 return(err);
3539 }
3540
3541 if (!(flags & BTR_NO_UNDO_LOG_FLAG)
3542 || dict_table_is_intrinsic(index->table)) {
3543
3544 ut_a(cursor->tree_height != ULINT_UNDEFINED);
3545
3546 /* First reserve enough free space for the file segments
3547 of the index tree, so that the insert will not fail because
3548 of lack of space */
3549
3550 ulint n_extents = cursor->tree_height / 16 + 3;
3551
3552 success = fsp_reserve_free_extents(&n_reserved, index->space,
3553 n_extents, FSP_NORMAL, mtr);
3554 if (!success) {
3555 return(DB_OUT_OF_FILE_SPACE);
3556 }
3557 }
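/* A rough sizing sketch (assuming the default 16KB page size, where
an extent is 64 pages): n_extents = tree_height / 16 + 3 reserves at
least 3 extents = 192 pages, far more than the handful of page
allocations a single split cascade can need, so once the reservation
succeeds the insert cannot fail for lack of space. */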
3558
3559 if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
3560 dict_table_is_comp(index->table),
3561 dtuple_get_n_fields(entry),
3562 dict_table_page_size(index->table))) {
3563 /* The record is so big that we have to store some fields
3564 externally on separate database pages */
3565
3566 if (UNIV_LIKELY_NULL(big_rec_vec)) {
3567 /* This should never happen, but we handle
3568 the situation in a robust manner. */
3569 ut_ad(0);
3570 dtuple_convert_back_big_rec(index, entry, big_rec_vec);
3571 }
3572
3573 big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);
3574
3575 if (big_rec_vec == NULL) {
3576
3577 if (n_reserved > 0) {
3578 fil_space_release_free_extents(index->space,
3579 n_reserved);
3580 }
3581 return(DB_TOO_BIG_RECORD);
3582 }
3583 }
3584
3585 if (dict_index_get_page(index)
3586 == btr_cur_get_block(cursor)->page.id.page_no()) {
3587
3588 /* The page is the root page */
3589 *rec = btr_root_raise_and_insert(
3590 flags, cursor, offsets, heap, entry, n_ext, mtr);
3591 } else {
3592 *rec = btr_page_split_and_insert(
3593 flags, cursor, offsets, heap, entry, n_ext, mtr);
3594 }
3595
3596 ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec
3597 || dict_index_is_spatial(index));
3598
3599 if (!(flags & BTR_NO_LOCKING_FLAG)) {
3600 ut_ad(!dict_table_is_temporary(index->table));
3601 if (dict_index_is_spatial(index)) {
3602 /* Do nothing */
3603 } else {
3604 /* The cursor might have been moved to another page,
3605 so the max trx id field should be updated after
3606 the cursor has been positioned. */
3607 if (!dict_index_is_clust(index)) {
3608 page_update_max_trx_id(
3609 btr_cur_get_block(cursor),
3610 btr_cur_get_page_zip(cursor),
3611 thr_get_trx(thr)->id, mtr);
3612 }
3613 if (!page_rec_is_infimum(btr_cur_get_rec(cursor))
3614 || btr_page_get_prev(
3615 buf_block_get_frame(
3616 btr_cur_get_block(cursor)), mtr)
3617 == FIL_NULL) {
3618 /* after a split-and-insert, we always
3619 need to call lock_update_insert(). */
3620 inherit = TRUE;
3621 }
3622 }
3623 }
3624
3625 if (!index->disable_ahi) {
3626 btr_search_update_hash_on_insert(cursor);
3627 }
3628 if (inherit && !(flags & BTR_NO_LOCKING_FLAG)) {
3629
3630 lock_update_insert(btr_cur_get_block(cursor), *rec);
3631 }
3632
3633 if (n_reserved > 0) {
3634 fil_space_release_free_extents(index->space, n_reserved);
3635 }
3636
3637 *big_rec = big_rec_vec;
3638
3639 return(DB_SUCCESS);
3640 }
3641
3642 /*==================== B-TREE UPDATE =========================*/
3643
3644 /*************************************************************//**
3645 For an update, checks the locks and does the undo logging.
3646 @return DB_SUCCESS, DB_WAIT_LOCK, or error number */
3647 UNIV_INLINE MY_ATTRIBUTE((warn_unused_result))
3648 dberr_t
3649 btr_cur_upd_lock_and_undo(
3650 /*======================*/
3651 ulint flags, /*!< in: undo logging and locking flags */
3652 btr_cur_t* cursor, /*!< in: cursor on record to update */
3653 const ulint* offsets,/*!< in: rec_get_offsets() on cursor */
3654 const upd_t* update, /*!< in: update vector */
3655 ulint cmpl_info,/*!< in: compiler info on secondary index
3656 updates */
3657 que_thr_t* thr, /*!< in: query thread
3658 (can be NULL if BTR_NO_LOCKING_FLAG) */
3659 mtr_t* mtr, /*!< in/out: mini-transaction */
3660 roll_ptr_t* roll_ptr)/*!< out: roll pointer */
3661 {
3662 dict_index_t* index;
3663 const rec_t* rec;
3664 dberr_t err;
3665
3666 ut_ad(thr != NULL || (flags & BTR_NO_LOCKING_FLAG));
3667
3668 rec = btr_cur_get_rec(cursor);
3669 index = cursor->index;
3670
3671 ut_ad(rec_offs_validate(rec, index, offsets));
3672 ut_ad(mtr->is_named_space(index->space));
3673
3674 if (!dict_index_is_clust(index)) {
3675 ut_ad(dict_index_is_online_ddl(index)
3676 == !!(flags & BTR_CREATE_FLAG));
3677
3678 /* We do undo logging only when we update a clustered index
3679 record */
3680 return(lock_sec_rec_modify_check_and_lock(
3681 flags, btr_cur_get_block(cursor), rec,
3682 index, thr, mtr));
3683 }
3684
3685 /* Check if we have to wait for a lock: enqueue an explicit lock
3686 request if yes */
3687
3688 if (!(flags & BTR_NO_LOCKING_FLAG)) {
3689 err = lock_clust_rec_modify_check_and_lock(
3690 flags, btr_cur_get_block(cursor), rec, index,
3691 offsets, thr);
3692 if (err != DB_SUCCESS) {
3693 return(err);
3694 }
3695 }
3696
3697 /* Append the info about the update in the undo log */
3698
3699 return(trx_undo_report_row_operation(
3700 flags, TRX_UNDO_MODIFY_OP, thr,
3701 index, NULL, update,
3702 cmpl_info, rec, offsets, roll_ptr));
3703 }
3704
3705 /***********************************************************//**
3706 Writes a redo log record of updating a record in-place. */
3707 void
3708 btr_cur_update_in_place_log(
3709 /*========================*/
3710 ulint flags, /*!< in: flags */
3711 const rec_t* rec, /*!< in: record */
3712 dict_index_t* index, /*!< in: index of the record */
3713 const upd_t* update, /*!< in: update vector */
3714 trx_id_t trx_id, /*!< in: transaction id */
3715 roll_ptr_t roll_ptr, /*!< in: roll ptr */
3716 mtr_t* mtr) /*!< in: mtr */
3717 {
3718 byte* log_ptr;
3719 const page_t* page = page_align(rec);
3720 ut_ad(flags < 256);
3721 ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
3722
3723 log_ptr = mlog_open_and_write_index(mtr, rec, index, page_is_comp(page)
3724 ? MLOG_COMP_REC_UPDATE_IN_PLACE
3725 : MLOG_REC_UPDATE_IN_PLACE,
3726 1 + DATA_ROLL_PTR_LEN + 14 + 2
3727 + MLOG_BUF_MARGIN);
3728
3729 if (!log_ptr) {
3730 /* Logging in mtr is switched off during crash recovery */
3731 return;
3732 }
3733
3734 /* For secondary indexes, we could skip writing the dummy system fields
3735 to the redo log, but then we would have to change the redo log parsing of
3736 MLOG_REC_UPDATE_IN_PLACE/MLOG_COMP_REC_UPDATE_IN_PLACE, or add a new
3737 redo log record type. For now, just write the dummy sys fields to the
3738 redo log if we are updating a secondary index record.
3739 */
3740 mach_write_to_1(log_ptr, flags);
3741 log_ptr++;
3742
3743 if (dict_index_is_clust(index)) {
3744 log_ptr = row_upd_write_sys_vals_to_log(
3745 index, trx_id, roll_ptr, log_ptr, mtr);
3746 } else {
3747 /* Dummy system fields for a secondary index */
3748 /* TRX_ID Position */
3749 log_ptr += mach_write_compressed(log_ptr, 0);
3750 /* ROLL_PTR */
3751 trx_write_roll_ptr(log_ptr, 0);
3752 log_ptr += DATA_ROLL_PTR_LEN;
3753 /* TRX_ID */
3754 log_ptr += mach_u64_write_compressed(log_ptr, 0);
3755 }
3756
3757 mach_write_to_2(log_ptr, page_offset(rec));
3758 log_ptr += 2;
3759
3760 row_upd_index_write_log(update, log_ptr, mtr);
3761 }
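/* Added sketch (reconstructed from the writes above; the widths of the
compressed integers vary): the body of an MLOG_REC_UPDATE_IN_PLACE or
MLOG_COMP_REC_UPDATE_IN_PLACE record is

	[flags: 1 byte]
	[TRX_ID field position: mach_write_compressed()]
	[roll ptr: DATA_ROLL_PTR_LEN bytes]
	[trx id: mach_u64_write_compressed()]
	[record offset on the page: 2 bytes]
	[update vector: row_upd_index_write_log()]

btr_cur_parse_update_in_place() below consumes the fields in the same
order. */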
3762 #endif /* UNIV_HOTBACKUP */
3763
3764 /***********************************************************//**
3765 Parses a redo log record of updating a record in-place.
3766 @return end of log record or NULL */
3767 byte*
3768 btr_cur_parse_update_in_place(
3769 /*==========================*/
3770 byte* ptr, /*!< in: buffer */
3771 byte* end_ptr,/*!< in: buffer end */
3772 page_t* page, /*!< in/out: page or NULL */
3773 page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
3774 dict_index_t* index) /*!< in: index corresponding to page */
3775 {
3776 ulint flags;
3777 rec_t* rec;
3778 upd_t* update;
3779 ulint pos;
3780 trx_id_t trx_id;
3781 roll_ptr_t roll_ptr;
3782 ulint rec_offset;
3783 mem_heap_t* heap;
3784 ulint* offsets;
3785
3786 if (end_ptr < ptr + 1) {
3787
3788 return(NULL);
3789 }
3790
3791 flags = mach_read_from_1(ptr);
3792 ptr++;
3793
3794 ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
3795
3796 if (ptr == NULL) {
3797
3798 return(NULL);
3799 }
3800
3801 if (end_ptr < ptr + 2) {
3802
3803 return(NULL);
3804 }
3805
3806 rec_offset = mach_read_from_2(ptr);
3807 ptr += 2;
3808
3809 ut_a(rec_offset <= UNIV_PAGE_SIZE);
3810
3811 heap = mem_heap_create(256);
3812
3813 ptr = row_upd_index_parse(ptr, end_ptr, heap, &update);
3814
3815 if (!ptr || !page) {
3816
3817 goto func_exit;
3818 }
3819
3820 ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table));
3821 rec = page + rec_offset;
3822
3823 /* We do not need to reserve the search latch, as the page is only
3824 being recovered, and there cannot be a hash index on it. */
3825
3826 offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
3827
3828 if (!(flags & BTR_KEEP_SYS_FLAG)) {
3829 row_upd_rec_sys_fields_in_recovery(rec, page_zip, offsets,
3830 pos, trx_id, roll_ptr);
3831 }
3832
3833 row_upd_rec_in_place(rec, index, offsets, update, page_zip);
3834
3835 func_exit:
3836 mem_heap_free(heap);
3837
3838 return(ptr);
3839 }
3840
3841 #ifndef UNIV_HOTBACKUP
3842 /*************************************************************//**
3843 See if there is enough space in the page modification log to log
3844 an update-in-place.
3845 
3846 @retval false if out of space; IBUF_BITMAP_FREE will be reset
3847 outside mtr if the page was recompressed
3848 @retval true if there is enough space
3849
3850 IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is
3851 a secondary index leaf page. This has to be done either within the
3852 same mini-transaction, or by invoking ibuf_reset_free_bits() before
3853 mtr_commit(mtr). */
3854 bool
3855 btr_cur_update_alloc_zip_func(
3856 /*==========================*/
3857 page_zip_des_t* page_zip,/*!< in/out: compressed page */
3858 page_cur_t* cursor, /*!< in/out: B-tree page cursor */
3859 dict_index_t* index, /*!< in: the index corresponding to cursor */
3860 #ifdef UNIV_DEBUG
3861 ulint* offsets,/*!< in/out: offsets of the cursor record */
3862 #endif /* UNIV_DEBUG */
3863 ulint length, /*!< in: size needed */
3864 bool create, /*!< in: true=delete-and-insert,
3865 false=update-in-place */
3866 mtr_t* mtr) /*!< in/out: mini-transaction */
3867 {
3868 const page_t* page = page_cur_get_page(cursor);
3869
3870 ut_ad(page_zip == page_cur_get_page_zip(cursor));
3871 ut_ad(page_zip);
3872 ut_ad(!dict_index_is_ibuf(index));
3873 ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
3874
3875 if (page_zip_available(page_zip, dict_index_is_clust(index),
3876 length, create)) {
3877 return(true);
3878 }
3879
3880 if (!page_zip->m_nonempty && !page_has_garbage(page)) {
3881 /* The page has been freshly compressed, so
3882 reorganizing it will not help. */
3883 return(false);
3884 }
3885
3886 if (create && page_is_leaf(page)
3887 && (length + page_get_data_size(page)
3888 >= dict_index_zip_pad_optimal_page_size(index))) {
3889 return(false);
3890 }
3891
3892 if (!btr_page_reorganize(cursor, index, mtr)) {
3893 goto out_of_space;
3894 }
3895
3896 rec_offs_make_valid(page_cur_get_rec(cursor), index, offsets);
3897
3898 /* After recompressing a page, we must make sure that the free
3899 bits in the insert buffer bitmap will not exceed the free
3900 space on the page. Because this function will not attempt
3901 recompression unless page_zip_available() fails above, it is
3902 safe to reset the free bits if page_zip_available() fails
3903 again, below. The free bits can safely be reset in a separate
3904 mini-transaction. If page_zip_available() succeeds below, we
3905 can be sure that the btr_page_reorganize() above did not reduce
3906 the free space available on the page. */
3907
3908 if (page_zip_available(page_zip, dict_index_is_clust(index),
3909 length, create)) {
3910 return(true);
3911 }
3912
3913 out_of_space:
3914 ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
3915
3916 /* Out of space: reset the free bits. */
3917 if (!dict_index_is_clust(index)
3918 && !dict_table_is_temporary(index->table)
3919 && page_is_leaf(page)) {
3920 ibuf_reset_free_bits(page_cur_get_block(cursor));
3921 }
3922
3923 return(false);
3924 }
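/* Added usage sketch (hypothetical caller, condensed from
btr_cur_update_in_place() below): this function is reached through the
btr_cur_update_alloc_zip() wrapper, and the cursor record must be
re-read afterwards because a reorganize may have moved it:

	if (!btr_cur_update_alloc_zip(page_zip, page_cursor, index,
				      offsets, new_size, true, mtr)) {
		return(DB_ZIP_OVERFLOW);
	}
	rec = page_cur_get_rec(page_cursor);
*/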
3925
3926 /*************************************************************//**
3927 Updates a record when the update causes no size changes in its fields.
3928 We assume here that the ordering fields of the record do not change.
3929 @return locking or undo log related error code, or
3930 @retval DB_SUCCESS on success
3931 @retval DB_ZIP_OVERFLOW if there is not enough space left
3932 on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
3933 dberr_t
3934 btr_cur_update_in_place(
3935 /*====================*/
3936 ulint flags, /*!< in: undo logging and locking flags */
3937 btr_cur_t* cursor, /*!< in: cursor on the record to update;
3938 cursor stays valid and positioned on the
3939 same record */
3940 ulint* offsets,/*!< in/out: offsets on cursor->page_cur.rec */
3941 const upd_t* update, /*!< in: update vector */
3942 ulint cmpl_info,/*!< in: compiler info on secondary index
3943 updates */
3944 que_thr_t* thr, /*!< in: query thread */
3945 trx_id_t trx_id, /*!< in: transaction id */
3946 mtr_t* mtr) /*!< in/out: mini-transaction; if this
3947 is a secondary index, the caller must
3948 mtr_commit(mtr) before latching any
3949 further pages */
3950 {
3951 dict_index_t* index;
3952 buf_block_t* block;
3953 page_zip_des_t* page_zip;
3954 dberr_t err;
3955 rec_t* rec;
3956 roll_ptr_t roll_ptr = 0;
3957 ulint was_delete_marked;
3958 ibool is_hashed;
3959
3960 rec = btr_cur_get_rec(cursor);
3961 index = cursor->index;
3962 ut_ad(rec_offs_validate(rec, index, offsets));
3963 ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
3964 ut_ad(trx_id > 0
3965 || (flags & BTR_KEEP_SYS_FLAG)
3966 || dict_table_is_intrinsic(index->table));
3967 /* The insert buffer tree should never be updated in place. */
3968 ut_ad(!dict_index_is_ibuf(index));
3969 ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
3970 || dict_index_is_clust(index));
3971 ut_ad(thr_get_trx(thr)->id == trx_id
3972 || (flags & ~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))
3973 == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
3974 | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
3975 ut_ad(fil_page_index_page_check(btr_cur_get_page(cursor)));
3976 ut_ad(btr_page_get_index_id(btr_cur_get_page(cursor)) == index->id);
3977
3978 DBUG_PRINT("ib_cur", ("update-in-place %s (" IB_ID_FMT
3979 ") by " TRX_ID_FMT ": %s",
3980 index->name(), index->id, trx_id,
3981 rec_printer(rec, offsets).str().c_str()));
3982
3983 block = btr_cur_get_block(cursor);
3984 page_zip = buf_block_get_page_zip(block);
3985
3986 /* Check that enough space is available on the compressed page. */
3987 if (page_zip) {
3988 if (!btr_cur_update_alloc_zip(
3989 page_zip, btr_cur_get_page_cur(cursor),
3990 index, offsets, rec_offs_size(offsets),
3991 false, mtr)) {
3992 return(DB_ZIP_OVERFLOW);
3993 }
3994
3995 rec = btr_cur_get_rec(cursor);
3996 }
3997
3998 /* Do lock checking and undo logging */
3999 err = btr_cur_upd_lock_and_undo(flags, cursor, offsets,
4000 update, cmpl_info,
4001 thr, mtr, &roll_ptr);
4002 if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
4003 /* We may need to update the IBUF_BITMAP_FREE
4004 bits after a reorganize that was done in
4005 btr_cur_update_alloc_zip(). */
4006 goto func_exit;
4007 }
4008
4009 if (!(flags & BTR_KEEP_SYS_FLAG)
4010 && !dict_table_is_intrinsic(index->table)) {
4011 row_upd_rec_sys_fields(rec, NULL, index, offsets,
4012 thr_get_trx(thr), roll_ptr);
4013 }
4014
4015 was_delete_marked = rec_get_deleted_flag(
4016 rec, page_is_comp(buf_block_get_frame(block)));
4017
4018 is_hashed = (block->index != NULL);
4019
4020 if (is_hashed) {
4021 /* TODO: Can we skip this if none of the first
4022 index->search_info->curr_n_fields fields
4023 is being updated? */
4024
4025 /* The function row_upd_changes_ord_field_binary() works only
4026 if the update vector was built for a clustered index; we must
4027 NOT call it if the index is secondary. */
4028
4029 if (!dict_index_is_clust(index)
4030 || row_upd_changes_ord_field_binary(index, update, thr,
4031 NULL, NULL)) {
4032
4033 /* Remove possible hash index pointer to this record */
4034 btr_search_update_hash_on_delete(cursor);
4035 }
4036
4037 rw_lock_x_lock(btr_get_search_latch(index));
4038 }
4039
4040 assert_block_ahi_valid(block);
4041 row_upd_rec_in_place(rec, index, offsets, update, page_zip);
4042
4043 if (is_hashed) {
4044 rw_lock_x_unlock(btr_get_search_latch(index));
4045 }
4046
4047 btr_cur_update_in_place_log(flags, rec, index, update,
4048 trx_id, roll_ptr, mtr);
4049
4050 if (was_delete_marked
4051 && !rec_get_deleted_flag(
4052 rec, page_is_comp(buf_block_get_frame(block)))) {
4053 /* The new updated record owns its possible externally
4054 stored fields */
4055
4056 btr_cur_unmark_extern_fields(page_zip,
4057 rec, index, offsets, mtr);
4058 }
4059
4060 ut_ad(err == DB_SUCCESS);
4061
4062 func_exit:
4063 if (page_zip
4064 && !(flags & BTR_KEEP_IBUF_BITMAP)
4065 && !dict_index_is_clust(index)
4066 && !dict_table_is_temporary(index->table)
4067 && page_is_leaf(buf_block_get_frame(block))) {
4068 /* Update the free bits in the insert buffer. */
4069 ibuf_update_free_bits_zip(block, mtr);
4070 }
4071
4072 return(err);
4073 }
4074
4075 /*************************************************************//**
4076 Tries to update a record on a page in an index tree. It is assumed that mtr
4077 holds an x-latch on the page. The operation does not succeed if there is too
4078 little space on the page or if the update would result in too empty a page,
4079 so that tree compression is recommended. We assume here that the ordering
4080 fields of the record do not change.
4081 @return error code, including
4082 @retval DB_SUCCESS on success
4083 @retval DB_OVERFLOW if the updated record does not fit
4084 @retval DB_UNDERFLOW if the page would become too empty
4085 @retval DB_ZIP_OVERFLOW if there is not enough space left
4086 on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
4087 dberr_t
4088 btr_cur_optimistic_update(
4089 /*======================*/
4090 ulint flags, /*!< in: undo logging and locking flags */
4091 btr_cur_t* cursor, /*!< in: cursor on the record to update;
4092 cursor stays valid and positioned on the
4093 same record */
4094 ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */
4095 mem_heap_t** heap, /*!< in/out: pointer to NULL or memory heap */
4096 const upd_t* update, /*!< in: update vector; this must also
4097 contain trx id and roll ptr fields */
4098 ulint cmpl_info,/*!< in: compiler info on secondary index
4099 updates */
4100 que_thr_t* thr, /*!< in: query thread */
4101 trx_id_t trx_id, /*!< in: transaction id */
4102 mtr_t* mtr) /*!< in/out: mini-transaction; if this
4103 is a secondary index, the caller must
4104 mtr_commit(mtr) before latching any
4105 further pages */
4106 {
4107 dict_index_t* index;
4108 page_cur_t* page_cursor;
4109 dberr_t err;
4110 buf_block_t* block;
4111 page_t* page;
4112 page_zip_des_t* page_zip;
4113 rec_t* rec;
4114 ulint max_size;
4115 ulint new_rec_size;
4116 ulint old_rec_size;
4117 ulint max_ins_size = 0;
4118 dtuple_t* new_entry;
4119 roll_ptr_t roll_ptr;
4120 ulint i;
4121 ulint n_ext;
4122
4123 block = btr_cur_get_block(cursor);
4124 page = buf_block_get_frame(block);
4125 rec = btr_cur_get_rec(cursor);
4126 index = cursor->index;
4127 ut_ad(trx_id > 0
4128 || (flags & BTR_KEEP_SYS_FLAG)
4129 || dict_table_is_intrinsic(index->table));
4130 ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
4131 ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
4132 /* This is intended only for leaf page updates */
4133 ut_ad(page_is_leaf(page));
4134 /* The insert buffer tree should never be updated in place. */
4135 ut_ad(!dict_index_is_ibuf(index));
4136 ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
4137 || dict_index_is_clust(index));
4138 ut_ad(thr_get_trx(thr)->id == trx_id
4139 || (flags & ~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))
4140 == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
4141 | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
4142 ut_ad(fil_page_index_page_check(page));
4143 ut_ad(btr_page_get_index_id(page) == index->id);
4144
4145 *offsets = rec_get_offsets(rec, index, *offsets,
4146 ULINT_UNDEFINED, heap);
4147 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
4148 ut_a(!rec_offs_any_null_extern(rec, *offsets)
4149 || trx_is_recv(thr_get_trx(thr)));
4150 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
4151
4152 if (!row_upd_changes_field_size_or_external(index, *offsets, update)) {
4153
4154 /* The simplest and the most common case: the update does not
4155 change the size of any field and none of the updated fields is
4156 externally stored in rec or update, and there is enough space
4157 on the compressed page to log the update. */
4158
4159 return(btr_cur_update_in_place(
4160 flags, cursor, *offsets, update,
4161 cmpl_info, thr, trx_id, mtr));
4162 }
4163
4164 if (rec_offs_any_extern(*offsets)) {
4165 any_extern:
4166 /* Externally stored fields are handled by the
4167 pessimistic update path */
4168
4169 /* prefetch siblings of the leaf for the pessimistic
4170 operation. */
4171 btr_cur_prefetch_siblings(block);
4172
4173 return(DB_OVERFLOW);
4174 }
4175
4176 for (i = 0; i < upd_get_n_fields(update); i++) {
4177 if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) {
4178
4179 goto any_extern;
4180 }
4181 }
4182
4183 DBUG_PRINT("ib_cur", ("update %s (" IB_ID_FMT ") by " TRX_ID_FMT
4184 ": %s",
4185 index->name(), index->id, trx_id,
4186 rec_printer(rec, *offsets).str().c_str()));
4187
4188 page_cursor = btr_cur_get_page_cur(cursor);
4189
4190 if (!*heap) {
4191 *heap = mem_heap_create(
4192 rec_offs_size(*offsets)
4193 + DTUPLE_EST_ALLOC(rec_offs_n_fields(*offsets)));
4194 }
4195
4196 new_entry = row_rec_to_index_entry(rec, index, *offsets,
4197 &n_ext, *heap);
4198 /* We checked above that there are no externally stored fields. */
4199 ut_a(!n_ext);
4200
4201 /* The page containing the clustered index record
4202 corresponding to new_entry is latched in mtr.
4203 Thus the following call is safe. */
4204 row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
4205 FALSE, *heap);
4206 old_rec_size = rec_offs_size(*offsets);
4207 new_rec_size = rec_get_converted_size(index, new_entry, 0);
4208
4209 page_zip = buf_block_get_page_zip(block);
4210 #ifdef UNIV_ZIP_DEBUG
4211 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4212 #endif /* UNIV_ZIP_DEBUG */
4213
4214 if (page_zip) {
4215 if (!btr_cur_update_alloc_zip(
4216 page_zip, page_cursor, index, *offsets,
4217 new_rec_size, true, mtr)) {
4218 return(DB_ZIP_OVERFLOW);
4219 }
4220
4221 rec = page_cur_get_rec(page_cursor);
4222 }
4223
4224 /* We limit max record size to 16k even for 64k page size. */
4225 if (new_rec_size >= REC_MAX_DATA_SIZE) {
4226 err = DB_OVERFLOW;
4227
4228 goto func_exit;
4229 }
4230
4231 if (UNIV_UNLIKELY(new_rec_size
4232 >= (page_get_free_space_of_empty(page_is_comp(page))
4233 / 2))) {
4234 /* We may need to update the IBUF_BITMAP_FREE
4235 bits after a reorganize that was done in
4236 btr_cur_update_alloc_zip(). */
4237 err = DB_OVERFLOW;
4238 goto func_exit;
4239 }
4240
4241 if (UNIV_UNLIKELY(page_get_data_size(page)
4242 - old_rec_size + new_rec_size
4243 < BTR_CUR_PAGE_COMPRESS_LIMIT(index))) {
4244 /* We may need to update the IBUF_BITMAP_FREE
4245 bits after a reorganize that was done in
4246 btr_cur_update_alloc_zip(). */
4247
4248 /* The page would become too empty */
4249 err = DB_UNDERFLOW;
4250 goto func_exit;
4251 }
4252
4253 /* We do not attempt to reorganize if the page is compressed.
4254 This is because the page may fail to compress after reorganization. */
4255 max_size = page_zip
4256 ? page_get_max_insert_size(page, 1)
4257 : (old_rec_size
4258 + page_get_max_insert_size_after_reorganize(page, 1));
4259
4260 if (!page_zip) {
4261 max_ins_size = page_get_max_insert_size_after_reorganize(
4262 page, 1);
4263 }
4264
4265 if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT)
4266 && (max_size >= new_rec_size))
4267 || (page_get_n_recs(page) <= 1))) {
4268
4269 /* We may need to update the IBUF_BITMAP_FREE
4270 bits after a reorganize that was done in
4271 btr_cur_update_alloc_zip(). */
4272
4273 /* There was not enough space, or it did not pay to
4274 reorganize: for simplicity, we decide what to do assuming a
4275 reorganization is needed, though it might not be necessary */
4276
4277 err = DB_OVERFLOW;
4278 goto func_exit;
4279 }
4280
4281 /* Do lock checking and undo logging */
4282 err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
4283 update, cmpl_info,
4284 thr, mtr, &roll_ptr);
4285 if (err != DB_SUCCESS) {
4286 /* We may need to update the IBUF_BITMAP_FREE
4287 bits after a reorganize that was done in
4288 btr_cur_update_alloc_zip(). */
4289 goto func_exit;
4290 }
4291
4292 /* Ok, we may do the replacement. Store on the page infimum the
4293 explicit locks on rec, before deleting rec (see the comment in
4294 btr_cur_pessimistic_update). */
4295 if (!dict_table_is_locking_disabled(index->table)) {
4296 lock_rec_store_on_page_infimum(block, rec);
4297 }
4298
4299 btr_search_update_hash_on_delete(cursor);
4300
4301 page_cur_delete_rec(page_cursor, index, *offsets, mtr);
4302
4303 page_cur_move_to_prev(page_cursor);
4304
4305 if (!(flags & BTR_KEEP_SYS_FLAG)
4306 && !dict_table_is_intrinsic(index->table)) {
4307 row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
4308 roll_ptr);
4309 row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
4310 trx_id);
4311 }
4312
4313 /* There are no externally stored columns in new_entry */
4314 rec = btr_cur_insert_if_possible(
4315 cursor, new_entry, offsets, heap, 0/*n_ext*/, mtr);
4316 ut_a(rec); /* <- We calculated above the insert would fit */
4317
4318 /* Restore the old explicit lock state on the record */
4319 if (!dict_table_is_locking_disabled(index->table)) {
4320 lock_rec_restore_from_page_infimum(block, rec, block);
4321 }
4322
4323 page_cur_move_to_next(page_cursor);
4324 ut_ad(err == DB_SUCCESS);
4325
4326 func_exit:
4327 if (!(flags & BTR_KEEP_IBUF_BITMAP)
4328 && !dict_index_is_clust(index)
4329 && !dict_table_is_temporary(index->table)) {
4330 /* Update the free bits in the insert buffer. */
4331 if (page_zip) {
4332 ibuf_update_free_bits_zip(block, mtr);
4333 } else {
4334 ibuf_update_free_bits_low(block, max_ins_size, mtr);
4335 }
4336 }
4337
4338 if (err != DB_SUCCESS) {
4339 /* prefetch siblings of the leaf for the pessimistic
4340 operation. */
4341 btr_cur_prefetch_siblings(block);
4342 }
4343
4344 return(err);
4345 }
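/* Added note (a hedged summary of the update call ladder): callers
typically try the cheapest path first and escalate on the error codes
documented above, roughly:

	err = btr_cur_optimistic_update(...);	// may delegate to
						// btr_cur_update_in_place()
	if (err == DB_OVERFLOW || err == DB_UNDERFLOW
	    || err == DB_ZIP_OVERFLOW) {
		// restart with BTR_MODIFY_TREE latches and call
		// btr_cur_pessimistic_update()
	}

Indeed, btr_cur_pessimistic_update() below begins by retrying the
optimistic path under the stronger latches. */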
4346
4347 /*************************************************************//**
4348 If, in a split, a new supremum record was created as the predecessor of the
4349 updated record, the supremum record must inherit exactly the locks on the
4350 updated record. In the split it may have inherited locks from the successor
4351 of the updated record, which is not correct. This function restores the
4352 right locks for the new supremum. */
4353 static
4354 void
4355 btr_cur_pess_upd_restore_supremum(
4356 /*==============================*/
4357 buf_block_t* block, /*!< in: buffer block of rec */
4358 const rec_t* rec, /*!< in: updated record */
4359 mtr_t* mtr) /*!< in: mtr */
4360 {
4361 page_t* page;
4362 buf_block_t* prev_block;
4363
4364 page = buf_block_get_frame(block);
4365
4366 if (page_rec_get_next(page_get_infimum_rec(page)) != rec) {
4367 /* Updated record is not the first user record on its page */
4368
4369 return;
4370 }
4371
4372 const ulint prev_page_no = btr_page_get_prev(page, mtr);
4373
4374 const page_id_t page_id(block->page.id.space(), prev_page_no);
4375
4376 ut_ad(prev_page_no != FIL_NULL);
4377 prev_block = buf_page_get_with_no_latch(page_id, block->page.size, mtr);
4378 #ifdef UNIV_BTR_DEBUG
4379 ut_a(btr_page_get_next(prev_block->frame, mtr)
4380 == page_get_page_no(page));
4381 #endif /* UNIV_BTR_DEBUG */
4382
4383 /* We must already have an x-latch on prev_block! */
4384 ut_ad(mtr_memo_contains(mtr, prev_block, MTR_MEMO_PAGE_X_FIX));
4385
4386 lock_rec_reset_and_inherit_gap_locks(prev_block, block,
4387 PAGE_HEAP_NO_SUPREMUM,
4388 page_rec_get_heap_no(rec));
4389 }
4390
4391 /*************************************************************//**
4392 Performs an update of a record on a page of a tree. It is assumed
4393 that mtr holds an x-latch on the tree and on the cursor page. If the
4394 update is made on the leaf level, to avoid deadlocks, mtr must also
4395 own x-latches to brothers of page, if those brothers exist. We assume
4396 here that the ordering fields of the record do not change.
4397 @return DB_SUCCESS or error code */
4398 dberr_t
4399 btr_cur_pessimistic_update(
4400 /*=======================*/
4401 ulint flags, /*!< in: undo logging, locking, and rollback
4402 flags */
4403 btr_cur_t* cursor, /*!< in/out: cursor on the record to update;
4404 cursor may become invalid if *big_rec == NULL
4405 || !(flags & BTR_KEEP_POS_FLAG) */
4406 ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */
4407 mem_heap_t** offsets_heap,
4408 /*!< in/out: pointer to memory heap
4409 that can be emptied, or NULL */
4410 mem_heap_t* entry_heap,
4411 /*!< in/out: memory heap for allocating
4412 big_rec and the index tuple */
4413 big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
4414 be stored externally by the caller, or NULL */
4415 upd_t* update, /*!< in/out: update vector; this is allowed to
4416 also contain trx id and roll ptr fields.
4417 Non-updated columns that are moved offpage will
4418 be appended to this. */
4419 ulint cmpl_info,/*!< in: compiler info on secondary index
4420 updates */
4421 que_thr_t* thr, /*!< in: query thread */
4422 trx_id_t trx_id, /*!< in: transaction id */
4423 mtr_t* mtr) /*!< in/out: mini-transaction; must be
4424 committed before latching any further pages */
4425 {
4426 big_rec_t* big_rec_vec = NULL;
4427 big_rec_t* dummy_big_rec;
4428 dict_index_t* index;
4429 buf_block_t* block;
4430 page_t* page;
4431 page_zip_des_t* page_zip;
4432 rec_t* rec;
4433 page_cur_t* page_cursor;
4434 dberr_t err;
4435 dberr_t optim_err;
4436 roll_ptr_t roll_ptr;
4437 ibool was_first;
4438 ulint n_reserved = 0;
4439 ulint n_ext;
4440 ulint max_ins_size = 0;
4441
4442 *offsets = NULL;
4443 *big_rec = NULL;
4444
4445 block = btr_cur_get_block(cursor);
4446 page = buf_block_get_frame(block);
4447 page_zip = buf_block_get_page_zip(block);
4448 index = cursor->index;
4449
4450 ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
4451 MTR_MEMO_X_LOCK |
4452 MTR_MEMO_SX_LOCK)
4453 || dict_table_is_intrinsic(index->table));
4454 ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
4455 #ifdef UNIV_ZIP_DEBUG
4456 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4457 #endif /* UNIV_ZIP_DEBUG */
4458 /* The insert buffer tree should never be updated in place. */
4459 ut_ad(!dict_index_is_ibuf(index));
4460 ut_ad(trx_id > 0
4461 || (flags & BTR_KEEP_SYS_FLAG)
4462 || dict_table_is_intrinsic(index->table));
4463 ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
4464 || dict_index_is_clust(index));
4465 ut_ad(thr_get_trx(thr)->id == trx_id
4466 || (flags & ~BTR_KEEP_POS_FLAG)
4467 == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
4468 | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
4469
4470 err = optim_err = btr_cur_optimistic_update(
4471 flags | BTR_KEEP_IBUF_BITMAP,
4472 cursor, offsets, offsets_heap, update,
4473 cmpl_info, thr, trx_id, mtr);
4474
4475 switch (err) {
4476 case DB_ZIP_OVERFLOW:
4477 case DB_UNDERFLOW:
4478 case DB_OVERFLOW:
4479 break;
4480 default:
4481 err_exit:
4482 /* We suppressed this with BTR_KEEP_IBUF_BITMAP.
4483 For DB_ZIP_OVERFLOW, the IBUF_BITMAP_FREE bits were
4484 already reset by btr_cur_update_alloc_zip() if the
4485 page was recompressed. */
4486 if (page_zip
4487 && optim_err != DB_ZIP_OVERFLOW
4488 && !dict_index_is_clust(index)
4489 && !dict_table_is_temporary(index->table)
4490 && page_is_leaf(page)) {
4491 ibuf_update_free_bits_zip(block, mtr);
4492 }
4493
4494 if (big_rec_vec != NULL) {
4495 dtuple_big_rec_free(big_rec_vec);
4496 }
4497
4498 return(err);
4499 }
4500
4501 rec = btr_cur_get_rec(cursor);
4502
4503 *offsets = rec_get_offsets(
4504 rec, index, *offsets, ULINT_UNDEFINED, offsets_heap);
4505
4506 dtuple_t* new_entry = row_rec_to_index_entry(
4507 rec, index, *offsets, &n_ext, entry_heap);
4508
4509 /* The page containing the clustered index record
4510 corresponding to new_entry is latched in mtr. If the
4511 clustered index record is delete-marked, then its externally
4512 stored fields cannot have been purged yet, because then the
4513 purge would also have removed the clustered index record
4514 itself. Thus the following call is safe. */
4515 row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
4516 FALSE, entry_heap);
4517
4518 /* We have to set appropriate extern storage bits in the new
4519 record to be inserted: we have to remember which fields were such */
4520
4521 ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec));
4522 ut_ad(rec_offs_validate(rec, index, *offsets));
4523
4524 /* Get number of externally stored columns in updated record */
4525 n_ext = new_entry->get_n_ext();
4526
4527 /* UNDO logging is also turned off during normal operation on intrinsic
4528 tables, so the condition needs to ensure that the table is not intrinsic. */
4529 if ((flags & BTR_NO_UNDO_LOG_FLAG)
4530 && rec_offs_any_extern(*offsets)
4531 && !dict_table_is_intrinsic(index->table)) {
4532 /* We are in a transaction rollback undoing a row
4533 update: we must free possible externally stored fields
4534 which got new values in the update, if they are not
4535 inherited values. They can be inherited if we have
4536 updated the primary key to another value, and then
4537 update it back again. */
4538
4539 ut_ad(big_rec_vec == NULL);
4540 ut_ad(dict_index_is_clust(index));
4541 ut_ad(thr_get_trx(thr)->in_rollback);
4542
4543 DBUG_EXECUTE_IF("ib_blob_update_rollback", DBUG_SUICIDE(););
4544 RECOVERY_CRASH(99);
4545
4546 btr_rec_free_updated_extern_fields(
4547 index, rec, page_zip, *offsets, update, true, mtr);
4548 }
4549
4550 if (page_zip_rec_needs_ext(
4551 rec_get_converted_size(index, new_entry, n_ext),
4552 page_is_comp(page),
4553 dict_index_get_n_fields(index),
4554 block->page.size)) {
4555
4556 big_rec_vec = dtuple_convert_big_rec(index, update, new_entry, &n_ext);
4557 if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
4558
4559 /* We cannot goto return_after_reservations,
4560 because we may need to update the
4561 IBUF_BITMAP_FREE bits, an update suppressed by
4562 BTR_KEEP_IBUF_BITMAP. */
4563 #ifdef UNIV_ZIP_DEBUG
4564 ut_a(!page_zip
4565 || page_zip_validate(page_zip, page, index));
4566 #endif /* UNIV_ZIP_DEBUG */
4567 if (n_reserved > 0) {
4568 fil_space_release_free_extents(
4569 index->space, n_reserved);
4570 }
4571
4572 err = DB_TOO_BIG_RECORD;
4573 goto err_exit;
4574 }
4575
4576 ut_ad(page_is_leaf(page));
4577 ut_ad(dict_index_is_clust(index));
4578 ut_ad(flags & BTR_KEEP_POS_FLAG);
4579 }
4580
4581 /* Do lock checking and undo logging */
4582 err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
4583 update, cmpl_info,
4584 thr, mtr, &roll_ptr);
4585 if (err != DB_SUCCESS) {
4586 goto err_exit;
4587 }
4588
4589 if (optim_err == DB_OVERFLOW) {
4590
4591 /* First reserve enough free space for the file segments
4592 of the index tree, so that the update will not fail because
4593 of lack of space */
4594
4595 ulint n_extents = cursor->tree_height / 16 + 3;
4596
4597 if (!fsp_reserve_free_extents(
4598 &n_reserved, index->space, n_extents,
4599 flags & BTR_NO_UNDO_LOG_FLAG
4600 ? FSP_CLEANING : FSP_NORMAL,
4601 mtr)) {
4602 err = DB_OUT_OF_FILE_SPACE;
4603 goto err_exit;
4604 }
4605 }
4606
4607 if (!(flags & BTR_KEEP_SYS_FLAG)
4608 && !dict_table_is_intrinsic(index->table)) {
4609 row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
4610 roll_ptr);
4611 row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
4612 trx_id);
4613 }
4614
4615 if (!page_zip) {
4616 max_ins_size = page_get_max_insert_size_after_reorganize(
4617 page, 1);
4618 }
4619
4620 /* Store state of explicit locks on rec on the page infimum record,
4621 before deleting rec. The page infimum acts as a dummy carrier of the
4622 locks, taking care also of lock releases, before we can move the locks
4623 back on the actual record. There is a special case: if we are
4624 inserting on the root page and the insert causes a call of
4625 btr_root_raise_and_insert. Therefore we cannot in the lock system
4626 delete the lock structs set on the root page even if the root
4627 page carries just node pointers. */
4628 if (!dict_table_is_locking_disabled(index->table)) {
4629 lock_rec_store_on_page_infimum(block, rec);
4630 }
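/* Added sketch (a summary of the lock hand-over performed here and in
the code below, not new logic):

	lock_rec_store_on_page_infimum(block, rec);	// park the locks
	page_cur_delete_rec(...);			// remove old version
	rec = btr_cur_insert_if_possible(...);		// insert new version
	lock_rec_restore_from_page_infimum(...);	// move the locks back
*/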
4631
4632 btr_search_update_hash_on_delete(cursor);
4633
4634 #ifdef UNIV_ZIP_DEBUG
4635 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4636 #endif /* UNIV_ZIP_DEBUG */
4637 page_cursor = btr_cur_get_page_cur(cursor);
4638
4639 page_cur_delete_rec(page_cursor, index, *offsets, mtr);
4640
4641 page_cur_move_to_prev(page_cursor);
4642
4643 rec = btr_cur_insert_if_possible(cursor, new_entry,
4644 offsets, offsets_heap, n_ext, mtr);
4645
4646 if (rec) {
4647 page_cursor->rec = rec;
4648
4649 if (!dict_table_is_locking_disabled(index->table)) {
4650 lock_rec_restore_from_page_infimum(
4651 btr_cur_get_block(cursor), rec, block);
4652 }
4653
4654 if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
4655 /* The new inserted record owns its possible externally
4656 stored fields */
4657 btr_cur_unmark_extern_fields(
4658 page_zip, rec, index, *offsets, mtr);
4659 }
4660
4661 bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG);
4662
4663 if (btr_cur_compress_if_useful(cursor, adjust, mtr)) {
4664 if (adjust) {
4665 rec_offs_make_valid(
4666 page_cursor->rec, index, *offsets);
4667 }
4668 } else if (!dict_index_is_clust(index)
4669 && !dict_table_is_temporary(index->table)
4670 && page_is_leaf(page)) {
4671 /* Update the free bits in the insert buffer.
4672 This is the same block which was skipped by
4673 BTR_KEEP_IBUF_BITMAP. */
4674 if (page_zip) {
4675 ibuf_update_free_bits_zip(block, mtr);
4676 } else {
4677 ibuf_update_free_bits_low(block, max_ins_size,
4678 mtr);
4679 }
4680 }
4681
4682 if (!srv_read_only_mode
4683 && !big_rec_vec
4684 && page_is_leaf(page)
4685 && !dict_index_is_online_ddl(index)) {
4686
4687 mtr_memo_release(mtr, dict_index_get_lock(index),
4688 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);
4689
4690 /* NOTE: We cannot release the root block latch here, because the
4691 root page contains segment headers and has already been modified in most cases. */
4692 }
4693
4694 err = DB_SUCCESS;
4695 goto return_after_reservations;
4696 } else {
4697 /* If the page is compressed and it initially
4698 compresses very well, and there is a subsequent insert
4699 of a badly-compressing record, it is possible for
4700 btr_cur_optimistic_update() to return DB_UNDERFLOW and
4701 btr_cur_insert_if_possible() to return NULL. */
4702 ut_a(page_zip || optim_err != DB_UNDERFLOW);
4703
4704 /* Out of space: reset the free bits.
4705 This is the same block which was skipped by
4706 BTR_KEEP_IBUF_BITMAP. */
4707 if (!dict_index_is_clust(index)
4708 && !dict_table_is_temporary(index->table)
4709 && page_is_leaf(page)) {
4710 ibuf_reset_free_bits(block);
4711 }
4712 }
4713
4714 if (big_rec_vec != NULL && !dict_table_is_intrinsic(index->table)) {
4715 ut_ad(page_is_leaf(page));
4716 ut_ad(dict_index_is_clust(index));
4717 ut_ad(flags & BTR_KEEP_POS_FLAG);
4718
4719 /* btr_page_split_and_insert() in
4720 btr_cur_pessimistic_insert() invokes
4721 mtr_memo_release(mtr, index->lock, MTR_MEMO_SX_LOCK).
4722 We must keep the index->lock when we created a
4723 big_rec, so that row_upd_clust_rec() can store the
4724 big_rec in the same mini-transaction. */
4725
4726 ut_ad(mtr_memo_contains_flagged(mtr,
4727 dict_index_get_lock(index),
4728 MTR_MEMO_X_LOCK |
4729 MTR_MEMO_SX_LOCK));
4730
4731 mtr_sx_lock(dict_index_get_lock(index), mtr);
4732 }
4733
4734 /* Was the record to be updated positioned as the first user
4735 record on its page? */
4736 was_first = page_cur_is_before_first(page_cursor);
4737
4738 /* Lock checks and undo logging were already performed by
4739 btr_cur_upd_lock_and_undo(). We do not try
4740 btr_cur_optimistic_insert() because
4741 btr_cur_insert_if_possible() already failed above. */
4742
4743 err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
4744 | BTR_NO_LOCKING_FLAG
4745 | BTR_KEEP_SYS_FLAG,
4746 cursor, offsets, offsets_heap,
4747 new_entry, &rec,
4748 &dummy_big_rec, n_ext, NULL, mtr);
4749 ut_a(rec);
4750 ut_a(err == DB_SUCCESS);
4751 ut_a(dummy_big_rec == NULL);
4752 ut_ad(rec_offs_validate(rec, cursor->index, *offsets));
4753 page_cursor->rec = rec;
4754
4755 /* Multiple transactions cannot operate on the
4756 same temp-table in parallel.
4757 max_trx_id is ignored for temp tables because it is not
4758 required for MVCC. */
4759 if (dict_index_is_sec_or_ibuf(index)
4760 && !dict_table_is_temporary(index->table)) {
4761 /* Update PAGE_MAX_TRX_ID in the index page header.
4762 It was not updated by btr_cur_pessimistic_insert()
4763 because of BTR_NO_LOCKING_FLAG. */
4764 buf_block_t* rec_block;
4765
4766 rec_block = btr_cur_get_block(cursor);
4767
4768 page_update_max_trx_id(rec_block,
4769 buf_block_get_page_zip(rec_block),
4770 trx_id, mtr);
4771 }
4772
4773 if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
4774 /* The new inserted record owns its possible externally
4775 stored fields */
4776 buf_block_t* rec_block = btr_cur_get_block(cursor);
4777
4778 #ifdef UNIV_ZIP_DEBUG
4779 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4780 page = buf_block_get_frame(rec_block);
4781 #endif /* UNIV_ZIP_DEBUG */
4782 page_zip = buf_block_get_page_zip(rec_block);
4783
4784 btr_cur_unmark_extern_fields(page_zip,
4785 rec, index, *offsets, mtr);
4786 }
4787
4788 if (!dict_table_is_locking_disabled(index->table)) {
4789 lock_rec_restore_from_page_infimum(
4790 btr_cur_get_block(cursor), rec, block);
4791 }
4792
4793 /* If necessary, restore also the correct lock state for a new,
4794 preceding supremum record created in a page split. While the old
4795 record was nonexistent, the supremum might have inherited its locks
4796 from a wrong record. */
4797
4798 if (!was_first && !dict_table_is_locking_disabled(index->table)) {
4799 btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor),
4800 rec, mtr);
4801 }
4802
4803 return_after_reservations:
4804 #ifdef UNIV_ZIP_DEBUG
4805 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4806 #endif /* UNIV_ZIP_DEBUG */
4807
4808 if (n_reserved > 0) {
4809 fil_space_release_free_extents(index->space, n_reserved);
4810 }
4811
4812 *big_rec = big_rec_vec;
4813
4814 return(err);
4815 }
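/* Added note (hedged): when *big_rec is returned non-NULL, the caller
(row_upd_clust_rec(), as mentioned in the comment above) must store the
off-page columns in the same mini-transaction context while index->lock
is still held, and afterwards free the vector with
dtuple_big_rec_free(). */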
4816
4817 /*==================== B-TREE DELETE MARK AND UNMARK ===============*/
4818
4819 /****************************************************************//**
4820 Writes the redo log record for delete marking or unmarking of an index
4821 record. */
4822 UNIV_INLINE
4823 void
4824 btr_cur_del_mark_set_clust_rec_log(
4825 /*===============================*/
4826 rec_t* rec, /*!< in: record */
4827 dict_index_t* index, /*!< in: index of the record */
4828 trx_id_t trx_id, /*!< in: transaction id */
4829 roll_ptr_t roll_ptr,/*!< in: roll ptr to the undo log record */
4830 mtr_t* mtr) /*!< in: mtr */
4831 {
4832 byte* log_ptr;
4833
4834 ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
4835 ut_ad(mtr->is_named_space(index->space));
4836
4837 log_ptr = mlog_open_and_write_index(mtr, rec, index,
4838 page_rec_is_comp(rec)
4839 ? MLOG_COMP_REC_CLUST_DELETE_MARK
4840 : MLOG_REC_CLUST_DELETE_MARK,
4841 1 + 1 + DATA_ROLL_PTR_LEN
4842 + 14 + 2);
4843
4844 if (!log_ptr) {
4845 /* Logging in mtr is switched off during crash recovery */
4846 return;
4847 }
4848
4849 *log_ptr++ = 0;
4850 *log_ptr++ = 1;
4851
4852 log_ptr = row_upd_write_sys_vals_to_log(
4853 index, trx_id, roll_ptr, log_ptr, mtr);
4854 mach_write_to_2(log_ptr, page_offset(rec));
4855 log_ptr += 2;
4856
4857 mlog_close(mtr, log_ptr);
4858 }
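/* Added sketch (reconstructed from the writes above): the body of an
MLOG_REC_CLUST_DELETE_MARK or MLOG_COMP_REC_CLUST_DELETE_MARK record is

	[flags: 1 byte, written as 0]
	[delete-mark value: 1 byte, written as 1]
	[system fields: row_upd_write_sys_vals_to_log()]
	[record offset on the page: 2 bytes]

btr_cur_parse_del_mark_set_clust_rec() below parses the same layout. */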
4859 #endif /* !UNIV_HOTBACKUP */
4860
4861 /****************************************************************//**
4862 Parses the redo log record for delete marking or unmarking of a clustered
4863 index record.
4864 @return end of log record or NULL */
4865 byte*
4866 btr_cur_parse_del_mark_set_clust_rec(
4867 /*=================================*/
4868 byte* ptr, /*!< in: buffer */
4869 byte* end_ptr,/*!< in: buffer end */
4870 page_t* page, /*!< in/out: page or NULL */
4871 page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
4872 dict_index_t* index) /*!< in: index corresponding to page */
4873 {
4874 ulint flags;
4875 ulint val;
4876 ulint pos;
4877 trx_id_t trx_id;
4878 roll_ptr_t roll_ptr;
4879 ulint offset;
4880 rec_t* rec;
4881
4882 ut_ad(!page
4883 || !!page_is_comp(page) == dict_table_is_comp(index->table));
4884
4885 if (end_ptr < ptr + 2) {
4886
4887 return(NULL);
4888 }
4889
4890 flags = mach_read_from_1(ptr);
4891 ptr++;
4892 val = mach_read_from_1(ptr);
4893 ptr++;
4894
4895 ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
4896
4897 if (ptr == NULL) {
4898
4899 return(NULL);
4900 }
4901
4902 if (end_ptr < ptr + 2) {
4903
4904 return(NULL);
4905 }
4906
4907 offset = mach_read_from_2(ptr);
4908 ptr += 2;
4909
4910 ut_a(offset <= UNIV_PAGE_SIZE);
4911
4912 if (page) {
4913 rec = page + offset;
4914
4915 /* We do not need to reserve the search latch, as the page
4916 is only being recovered, and there cannot be a hash index on
4917 it. Besides, these fields are being updated in place
4918 and the adaptive hash index does not depend on them. */
4919
4920 btr_rec_set_deleted_flag(rec, page_zip, val);
4921
4922 if (!(flags & BTR_KEEP_SYS_FLAG)) {
4923 mem_heap_t* heap = NULL;
4924 ulint offsets_[REC_OFFS_NORMAL_SIZE];
4925 rec_offs_init(offsets_);
4926
4927 row_upd_rec_sys_fields_in_recovery(
4928 rec, page_zip,
4929 rec_get_offsets(rec, index, offsets_,
4930 ULINT_UNDEFINED, &heap),
4931 pos, trx_id, roll_ptr);
4932 if (UNIV_LIKELY_NULL(heap)) {
4933 mem_heap_free(heap);
4934 }
4935 }
4936 }
4937
4938 return(ptr);
4939 }
4940
4941 #ifndef UNIV_HOTBACKUP
4942 /***********************************************************//**
4943 Marks a clustered index record deleted. Writes an undo log record to
4944 undo log on this delete marking. Writes in the trx id field the id
4945 of the deleting transaction, and in the roll ptr field pointer to the
4946 undo log record created.
4947 @return DB_SUCCESS, DB_LOCK_WAIT, or error number */
4948 dberr_t
4949 btr_cur_del_mark_set_clust_rec(
4950 /*===========================*/
4951 ulint flags, /*!< in: undo logging and locking flags */
4952 buf_block_t* block, /*!< in/out: buffer block of the record */
4953 rec_t* rec, /*!< in/out: record */
4954 dict_index_t* index, /*!< in: clustered index of the record */
4955 const ulint* offsets,/*!< in: rec_get_offsets(rec) */
4956 que_thr_t* thr, /*!< in: query thread */
4957 const dtuple_t* entry, /*!< in: dtuple for the deleting record, also
4958 contains the virtual cols if there are any */
4959 mtr_t* mtr) /*!< in/out: mini-transaction */
4960 {
4961 roll_ptr_t roll_ptr;
4962 dberr_t err;
4963 page_zip_des_t* page_zip;
4964 trx_t* trx;
4965
4966 ut_ad(dict_index_is_clust(index));
4967 ut_ad(rec_offs_validate(rec, index, offsets));
4968 ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
4969 ut_ad(buf_block_get_frame(block) == page_align(rec));
4970 ut_ad(page_is_leaf(page_align(rec)));
4971 ut_ad(mtr->is_named_space(index->space));
4972
4973 if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
4974 /* While cascading delete operations, this becomes possible. */
4975 ut_ad(rec_get_trx_id(rec, index) == thr_get_trx(thr)->id);
4976 return(DB_SUCCESS);
4977 }
4978
4979 err = lock_clust_rec_modify_check_and_lock(BTR_NO_LOCKING_FLAG, block,
4980 rec, index, offsets, thr);
4981
4982 if (err != DB_SUCCESS) {
4983
4984 return(err);
4985 }
4986
4987 err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr,
4988 index, entry, NULL, 0, rec, offsets,
4989 &roll_ptr);
4990 if (err != DB_SUCCESS) {
4991
4992 return(err);
4993 }
4994
4995 /* The search latch is not needed here, because
4996 the adaptive hash index does not depend on the delete-mark
4997 and the delete-mark is being updated in place. */
4998
4999 page_zip = buf_block_get_page_zip(block);
5000
5001 btr_rec_set_deleted_flag(rec, page_zip, TRUE);
5002
5003 /* For intrinsic table, roll-ptr is not maintained as there is no UNDO
5004 logging. Skip updating it. */
5005 if (dict_table_is_intrinsic(index->table)) {
5006 return(err);
5007 }
5008
5009 trx = thr_get_trx(thr);
5010 /* This function must not be invoked during rollback
5011 (of a TRX_STATE_PREPARE transaction or otherwise). */
5012 ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
5013 ut_ad(!trx->in_rollback);
5014
5015 DBUG_PRINT("ib_cur", ("delete-mark clust %s (" IB_ID_FMT
5016 ") by " TRX_ID_FMT ": %s",
5017 index->table_name, index->id,
5018 trx_get_id_for_print(trx),
5019 rec_printer(rec, offsets).str().c_str()));
5020
5021 if (dict_index_is_online_ddl(index)) {
5022 row_log_table_delete(rec, entry, index, offsets, NULL);
5023 }
5024
5025 row_upd_rec_sys_fields(rec, page_zip, index, offsets, trx, roll_ptr);
5026
5027 btr_cur_del_mark_set_clust_rec_log(rec, index, trx->id,
5028 roll_ptr, mtr);
5029
5030 return(err);
5031 }
5032
5033 /****************************************************************//**
5034 Writes the redo log record for a delete mark setting of a secondary
5035 index record. */
5036 UNIV_INLINE
5037 void
5038 btr_cur_del_mark_set_sec_rec_log(
5039 /*=============================*/
5040 rec_t* rec, /*!< in: record */
5041 ibool val, /*!< in: value to set */
5042 mtr_t* mtr) /*!< in: mtr */
5043 {
5044 byte* log_ptr;
5045 ut_ad(val <= 1);
5046
5047 log_ptr = mlog_open(mtr, 11 + 1 + 2);
5048
5049 if (!log_ptr) {
5050 /* Logging in mtr is switched off during crash recovery:
5051 in that case mlog_open returns NULL */
5052 return;
5053 }
5054
5055 log_ptr = mlog_write_initial_log_record_fast(
5056 rec, MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr);
5057 mach_write_to_1(log_ptr, val);
5058 log_ptr++;
5059
5060 mach_write_to_2(log_ptr, page_offset(rec));
5061 log_ptr += 2;
5062
5063 mlog_close(mtr, log_ptr);
5064 }
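/* Added sketch (reconstructed from the mlog_open(mtr, 11 + 1 + 2)
budget above): the MLOG_REC_SEC_DELETE_MARK body is

	[initial log record: up to 11 bytes]
	[delete-mark value: 1 byte]
	[record offset on the page: 2 bytes]
*/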
5065 #endif /* !UNIV_HOTBACKUP */
5066
5067 /****************************************************************//**
5068 Parses the redo log record for delete marking or unmarking of a secondary
5069 index record.
5070 @return end of log record or NULL */
5071 byte*
5072 btr_cur_parse_del_mark_set_sec_rec(
5073 /*===============================*/
5074 byte* ptr, /*!< in: buffer */
5075 byte* end_ptr,/*!< in: buffer end */
5076 page_t* page, /*!< in/out: page or NULL */
5077 page_zip_des_t* page_zip)/*!< in/out: compressed page, or NULL */
5078 {
5079 ulint val;
5080 ulint offset;
5081 rec_t* rec;
5082
5083 if (end_ptr < ptr + 3) {
5084
5085 return(NULL);
5086 }
5087
5088 val = mach_read_from_1(ptr);
5089 ptr++;
5090
5091 offset = mach_read_from_2(ptr);
5092 ptr += 2;
5093
5094 ut_a(offset <= UNIV_PAGE_SIZE);
5095
5096 if (page) {
5097 rec = page + offset;
5098
5099 /* We do not need to reserve the search latch, as the page
5100 is only being recovered, and there cannot be a hash index on
5101 it. Besides, the delete-mark flag is being updated in place
5102 and the adaptive hash index does not depend on it. */
5103
5104 btr_rec_set_deleted_flag(rec, page_zip, val);
5105 }
5106
5107 return(ptr);
5108 }
5109
5110 #ifndef UNIV_HOTBACKUP
5111 /***********************************************************//**
5112 Sets a secondary index record delete mark to TRUE or FALSE.
5113 @return DB_SUCCESS, DB_LOCK_WAIT, or error number */
5114 dberr_t
5115 btr_cur_del_mark_set_sec_rec(
5116 /*=========================*/
5117 ulint flags, /*!< in: locking flag */
5118 btr_cur_t* cursor, /*!< in: cursor */
5119 ibool val, /*!< in: value to set */
5120 que_thr_t* thr, /*!< in: query thread */
5121 mtr_t* mtr) /*!< in/out: mini-transaction */
5122 {
5123 buf_block_t* block;
5124 rec_t* rec;
5125 dberr_t err;
5126
5127 block = btr_cur_get_block(cursor);
5128 rec = btr_cur_get_rec(cursor);
5129
5130 err = lock_sec_rec_modify_check_and_lock(flags,
5131 btr_cur_get_block(cursor),
5132 rec, cursor->index, thr, mtr);
5133 if (err != DB_SUCCESS) {
5134
5135 return(err);
5136 }
5137
5138 ut_ad(!!page_rec_is_comp(rec)
5139 == dict_table_is_comp(cursor->index->table));
5140
5141 DBUG_PRINT("ib_cur", ("delete-mark=%u sec %u:%u:%u in %s("
5142 IB_ID_FMT ") by " TRX_ID_FMT,
5143 unsigned(val),
5144 block->page.id.space(), block->page.id.page_no(),
5145 unsigned(page_rec_get_heap_no(rec)),
5146 cursor->index->name(), cursor->index->id,
5147 trx_get_id_for_print(thr_get_trx(thr))));
5148
5149 /* We do not need to reserve the search latch, as the
5150 delete-mark flag is being updated in place and the adaptive
5151 hash index does not depend on it. */
5152 btr_rec_set_deleted_flag(rec, buf_block_get_page_zip(block), val);
5153
5154 btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
5155
5156 return(DB_SUCCESS);
5157 }
5158
5159 /***********************************************************//**
5160 Sets a secondary index record's delete mark to the given value. This
5161 function is only used by the insert buffer merge mechanism. */
5162 void
5163 btr_cur_set_deleted_flag_for_ibuf(
5164 /*==============================*/
5165 rec_t* rec, /*!< in/out: record */
5166 page_zip_des_t* page_zip, /*!< in/out: compressed page
5167 corresponding to rec, or NULL
5168 when the tablespace is
5169 uncompressed */
5170 ibool val, /*!< in: value to set */
5171 mtr_t* mtr) /*!< in/out: mini-transaction */
5172 {
5173 /* We do not need to reserve the search latch, as the page
5174 has just been read into the buffer pool and there cannot be
5175 a hash index on it. Besides, the delete-mark flag is being
5176 updated in place and the adaptive hash index does not depend
5177 on it. */
5178
5179 btr_rec_set_deleted_flag(rec, page_zip, val);
5180
5181 btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
5182 }
5183
5184 /*==================== B-TREE RECORD REMOVE =========================*/
5185
5186 /*************************************************************//**
5187 Tries to compress a page of the tree if it seems useful. It is assumed
5188 that mtr holds an x-latch on the tree and on the cursor page. To avoid
5189 deadlocks, mtr must also own x-latches to brothers of page, if those
5190 brothers exist. NOTE: it is assumed that the caller has reserved enough
5191 free extents so that the compression will always succeed if done!
5192 @return TRUE if compression occurred */
5193 ibool
5194 btr_cur_compress_if_useful(
5195 /*=======================*/
5196 btr_cur_t* cursor, /*!< in/out: cursor on the page to compress;
5197 cursor does not stay valid if !adjust and
5198 compression occurs */
5199 ibool adjust, /*!< in: TRUE if should adjust the
5200 cursor position even if compression occurs */
5201 mtr_t* mtr) /*!< in/out: mini-transaction */
5202 {
5203 /* Avoid applying compression: given the workload of intrinsic
5204 tables, pages do not accumulate a lot of garbage. */
5205 if (dict_table_is_intrinsic(cursor->index->table)) {
5206 return(FALSE);
5207 }
5208
5209 ut_ad(mtr_memo_contains_flagged(
5210 mtr, dict_index_get_lock(btr_cur_get_index(cursor)),
5211 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)
5212 || dict_table_is_intrinsic(cursor->index->table));
5213 ut_ad(mtr_is_block_fix(
5214 mtr, btr_cur_get_block(cursor),
5215 MTR_MEMO_PAGE_X_FIX, cursor->index->table));
5216
5217 if (dict_index_is_spatial(cursor->index)) {
5218 const page_t* page = btr_cur_get_page(cursor);
5219 const trx_t* trx = NULL;
5220
5221 if (cursor->rtr_info->thr != NULL) {
5222 trx = thr_get_trx(cursor->rtr_info->thr);
5223 }
5224
5225 /* Check whether page lock prevents the compression */
5226 if (!lock_test_prdt_page_lock(trx, page_get_space_id(page),
5227 page_get_page_no(page))) {
5228 return(false);
5229 }
5230 }
5231
5232 return(btr_cur_compress_recommendation(cursor, mtr)
5233 && btr_compress(cursor, adjust, mtr));
5234 }
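/* Added usage note (condensed from btr_cur_pessimistic_update() above,
not new logic): the adjust flag is set when the cursor must stay valid
for a pending big_rec, in which case the offsets are revalidated:

	ibool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG);
	if (btr_cur_compress_if_useful(cursor, adjust, mtr) && adjust) {
		rec_offs_make_valid(page_cursor->rec, index, *offsets);
	}
*/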
5235
5236 /*******************************************************//**
5237 Removes the record on which the tree cursor is positioned on a leaf page.
5238 It is assumed that the mtr has an x-latch on the page where the cursor is
5239 positioned, but no latch on the whole tree.
5240 @return TRUE if success, i.e., the page did not become too empty */
5241 ibool
5242 btr_cur_optimistic_delete_func(
5243 /*===========================*/
5244 btr_cur_t* cursor, /*!< in: cursor on leaf page, on the record to
5245 delete; cursor stays valid: if deletion
5246 succeeds, on function exit it points to the
5247 successor of the deleted record */
5248 #ifdef UNIV_DEBUG
5249 ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
5250 #endif /* UNIV_DEBUG */
5251 mtr_t* mtr) /*!< in: mtr; if this function returns
5252 TRUE on a leaf page of a secondary
5253 index, the mtr must be committed
5254 before latching any further pages */
5255 {
5256 buf_block_t* block;
5257 rec_t* rec;
5258 mem_heap_t* heap = NULL;
5259 ulint offsets_[REC_OFFS_NORMAL_SIZE];
5260 ulint* offsets = offsets_;
5261 ibool no_compress_needed;
5262 rec_offs_init(offsets_);
5263
5264 ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
5265 ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
5266 MTR_MEMO_PAGE_X_FIX));
5267 ut_ad(mtr_is_block_fix(mtr, btr_cur_get_block(cursor),
5268 MTR_MEMO_PAGE_X_FIX, cursor->index->table));
5269 ut_ad(mtr->is_named_space(cursor->index->space));
5270
5271 /* This is intended only for leaf page deletions */
5272
5273 block = btr_cur_get_block(cursor);
5274
5275 SRV_CORRUPT_TABLE_CHECK(block, return(FALSE);); /* this function returns ibool, not dberr_t */
5276
5277 ut_ad(page_is_leaf(buf_block_get_frame(block)));
5278 ut_ad(!dict_index_is_online_ddl(cursor->index)
5279 || dict_index_is_clust(cursor->index)
5280 || (flags & BTR_CREATE_FLAG));
5281
5282 rec = btr_cur_get_rec(cursor);
5283 offsets = rec_get_offsets(rec, cursor->index, offsets,
5284 ULINT_UNDEFINED, &heap);
5285
5286 no_compress_needed = !rec_offs_any_extern(offsets)
5287 && btr_cur_can_delete_without_compress(
5288 cursor, rec_offs_size(offsets), mtr);
5289
5290 if (no_compress_needed) {
5291
5292 page_t* page = buf_block_get_frame(block);
5293 page_zip_des_t* page_zip= buf_block_get_page_zip(block);
5294
5295 lock_update_delete(block, rec);
5296
5297 btr_search_update_hash_on_delete(cursor);
5298
5299 if (page_zip) {
5300 #ifdef UNIV_ZIP_DEBUG
5301 ut_a(page_zip_validate(page_zip, page, cursor->index));
5302 #endif /* UNIV_ZIP_DEBUG */
5303 page_cur_delete_rec(btr_cur_get_page_cur(cursor),
5304 cursor->index, offsets, mtr);
5305 #ifdef UNIV_ZIP_DEBUG
5306 ut_a(page_zip_validate(page_zip, page, cursor->index));
5307 #endif /* UNIV_ZIP_DEBUG */
5308
5309 /* On compressed pages, the IBUF_BITMAP_FREE
5310 space is not affected by deleting (purging)
5311 records, because it is defined as the minimum
5312 of space available *without* reorganize, and
5313 space available in the modification log. */
5314 } else {
5315 const ulint max_ins
5316 = page_get_max_insert_size_after_reorganize(
5317 page, 1);
5318
5319 page_cur_delete_rec(btr_cur_get_page_cur(cursor),
5320 cursor->index, offsets, mtr);
5321
5322 /* The change buffer does not handle inserts
5323 into non-leaf pages, into clustered indexes,
5324 or into the change buffer. */
5325 if (!dict_index_is_clust(cursor->index)
5326 && !dict_table_is_temporary(cursor->index->table)
5327 && !dict_index_is_ibuf(cursor->index)) {
5328 ibuf_update_free_bits_low(block, max_ins, mtr);
5329 }
5330 }
5331 } else {
5332 /* Prefetch the siblings of the leaf page for the pessimistic
5333 operation. */
5334 btr_cur_prefetch_siblings(block);
5335 }
5336
5337 if (UNIV_LIKELY_NULL(heap)) {
5338 mem_heap_free(heap);
5339 }
5340
5341 return(no_compress_needed);
5342 }
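/* A typical caller first tries this optimistic path and falls back to
btr_cur_pessimistic_delete() below when FALSE is returned. A minimal
sketch (assumed caller code, not part of this file):

	if (!btr_cur_optimistic_delete(&cursor, 0, &mtr)) {
		... commit the mtr, restart it with the index tree
		x-latched, reposition the cursor, then ...
		btr_cur_pessimistic_delete(&err, FALSE, &cursor, 0,
					   false, &mtr);
	}
*/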
5343
5344 /*************************************************************//**
5345 Removes the record on which the tree cursor is positioned. Tries
5346 to compress the page if its fillfactor drops below a threshold
5347 or if it is the only page on the level. It is assumed that mtr holds
5348 an x-latch on the tree and on the cursor page. To avoid deadlocks,
5349 mtr must also own x-latches to brothers of page, if those brothers
5350 exist.
5351 @return TRUE if compression occurred, FALSE if it did not or if
5352 something went wrong. */
5353 ibool
5354 btr_cur_pessimistic_delete(
5355 /*=======================*/
5356 dberr_t* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
5357 the latter may occur because we may have
5358 to update node pointers on upper levels,
5359 and in the case of variable length keys
5360 these may actually grow in size */
5361 ibool has_reserved_extents, /*!< in: TRUE if the
5362 caller has already reserved enough free
5363 extents so that he knows that the operation
5364 will succeed */
5365 btr_cur_t* cursor, /*!< in: cursor on the record to delete;
5366 if compression does not occur, the cursor
5367 stays valid: it points to successor of
5368 deleted record on function exit */
5369 ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
5370 bool rollback,/*!< in: performing rollback? */
5371 mtr_t* mtr) /*!< in: mtr */
5372 {
5373 buf_block_t* block;
5374 page_t* page;
5375 page_zip_des_t* page_zip;
5376 dict_index_t* index;
5377 rec_t* rec;
5378 ulint n_reserved = 0;
5379 bool success;
5380 ibool ret = FALSE;
5381 ulint level;
5382 mem_heap_t* heap;
5383 ulint* offsets;
5384 bool allow_merge = true; /* if true, we have taken the page
5385 latches needed to merge this page. */
5386 #ifdef UNIV_DEBUG
5387 bool parent_latched = false;
5388 #endif /* UNIV_DEBUG */
5389
5390 block = btr_cur_get_block(cursor);
5391 page = buf_block_get_frame(block);
5392 index = btr_cur_get_index(cursor);
5393
5394 ulint rec_size_est = dict_index_node_ptr_max_size(index);
5395 const page_size_t page_size(dict_table_page_size(index->table));
5396
5397 ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
5398 ut_ad(!dict_index_is_online_ddl(index)
5399 || dict_index_is_clust(index)
5400 || (flags & BTR_CREATE_FLAG));
5401 ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
5402 MTR_MEMO_X_LOCK
5403 | MTR_MEMO_SX_LOCK)
5404 || dict_table_is_intrinsic(index->table));
5405 ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
5406 ut_ad(mtr->is_named_space(index->space));
5407
5408 if (!has_reserved_extents) {
5409 /* First reserve enough free space for the file segments
5410 of the index tree, so that the node pointer updates will
5411 not fail because of lack of space */
5412
5413 ut_a(cursor->tree_height != ULINT_UNDEFINED);
5414
5415 ulint n_extents = cursor->tree_height / 32 + 1;
5416
5417 success = fsp_reserve_free_extents(&n_reserved,
5418 index->space,
5419 n_extents,
5420 FSP_CLEANING, mtr);
5421 if (!success) {
5422 *err = DB_OUT_OF_FILE_SPACE;
5423
5424 return(FALSE);
5425 }
5426 }
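	/* Worked example (assumed numbers): for a tree of height 3,
	n_extents = 3 / 32 + 1 = 1, i.e. one extent (64 pages with the
	default 16KB page size) is reserved before any node pointer is
	touched, so the operation cannot fail midway for lack of space. */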
5427
5428 heap = mem_heap_create(1024);
5429 rec = btr_cur_get_rec(cursor);
5430 page_zip = buf_block_get_page_zip(block);
5431 #ifdef UNIV_ZIP_DEBUG
5432 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
5433 #endif /* UNIV_ZIP_DEBUG */
5434
5435 offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
5436
5437 if (rec_offs_any_extern(offsets)) {
5438 btr_rec_free_externally_stored_fields(index,
5439 rec, offsets, page_zip,
5440 rollback, mtr);
5441 #ifdef UNIV_ZIP_DEBUG
5442 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
5443 #endif /* UNIV_ZIP_DEBUG */
5444 }
5445
5446 if (UNIV_UNLIKELY(page_get_n_recs(page) < 2)
5447 && UNIV_UNLIKELY(dict_index_get_page(index)
5448 != block->page.id.page_no())) {
5449
5450 /* If there is only one record, drop the whole page in
5451 btr_discard_page, if this is not the root page */
5452
5453 btr_discard_page(cursor, mtr);
5454
5455 ret = TRUE;
5456
5457 goto return_after_reservations;
5458 }
5459
5460 if (flags == 0) {
5461 lock_update_delete(block, rec);
5462 }
5463
5464 level = btr_page_get_level(page, mtr);
5465
5466 if (level > 0
5467 && UNIV_UNLIKELY(rec == page_rec_get_next(
5468 page_get_infimum_rec(page)))) {
5469
5470 rec_t* next_rec = page_rec_get_next(rec);
5471
5472 if (btr_page_get_prev(page, mtr) == FIL_NULL) {
5473
5474 /* If we delete the leftmost node pointer on a
5475 non-leaf level, we must mark the new leftmost node
5476 pointer as the predefined minimum record */
5477
5478 /* This will make page_zip_validate() fail until
5479 page_cur_delete_rec() completes. This is harmless,
5480 because everything will take place within a single
5481 mini-transaction and because writing to the redo log
5482 is an atomic operation (performed by mtr_commit()). */
5483 btr_set_min_rec_mark(next_rec, mtr);
5484 } else if (dict_index_is_spatial(index)) {
5485 /* For an R-tree, if we delete the leftmost node pointer,
5486 we need to update the parent page. */
5487 rtr_mbr_t father_mbr;
5488 rec_t* father_rec;
5489 btr_cur_t father_cursor;
5490 ulint* offsets;
5491 bool upd_ret;
5492 ulint len;
5493
5494 rtr_page_get_father_block(NULL, heap, index,
5495 block, mtr, NULL,
5496 &father_cursor);
5497 offsets = rec_get_offsets(
5498 btr_cur_get_rec(&father_cursor), index,
5499 NULL, ULINT_UNDEFINED, &heap);
5500
5501 father_rec = btr_cur_get_rec(&father_cursor);
5502 rtr_read_mbr(rec_get_nth_field(
5503 father_rec, offsets, 0, &len), &father_mbr);
5504
5505 upd_ret = rtr_update_mbr_field(&father_cursor, offsets,
5506 NULL, page, &father_mbr,
5507 next_rec, mtr);
5508
5509 if (!upd_ret) {
5510 *err = DB_ERROR;
5511
5512 mem_heap_free(heap);
5513 return(FALSE);
5514 }
5515
5516 ut_d(parent_latched = true);
5517 } else {
5518 /* Otherwise, if we delete the leftmost node pointer
5519 on a page, we have to change the parent node pointer
5520 so that it is equal to the new leftmost node pointer
5521 on the page */
5522
5523 btr_node_ptr_delete(index, block, mtr);
5524
5525 dtuple_t* node_ptr = dict_index_build_node_ptr(
5526 index, next_rec, block->page.id.page_no(),
5527 heap, level);
5528
5529 btr_insert_on_non_leaf_level(
5530 flags, index, level + 1, node_ptr, mtr);
5531
5532 ut_d(parent_latched = true);
5533 }
5534 }
5535
5536 btr_search_update_hash_on_delete(cursor);
5537
5538 if (page_is_leaf(page) || dict_index_is_spatial(index)) {
5539 /* Keep allow_merge true for spatial indexes: the tree is X-locked
5540 in case of a delete operation on a spatial index, thus avoiding the
5541 possibility of upward latching. */
5542 allow_merge = true;
5543 } else {
5544 allow_merge = btr_cur_will_modify_tree(index, page,
5545 BTR_INTENTION_DELETE, rec, rec_size_est, page_size, mtr);
5546 }
5547 page_cur_delete_rec(btr_cur_get_page_cur(cursor), index, offsets, mtr);
5548 #ifdef UNIV_ZIP_DEBUG
5549 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
5550 #endif /* UNIV_ZIP_DEBUG */
5551
5552 /* btr_check_node_ptr() needs parent block latched */
5553 ut_ad(!parent_latched || btr_check_node_ptr(index, block, mtr));
5554
5555 return_after_reservations:
5556 *err = DB_SUCCESS;
5557
5558 mem_heap_free(heap);
5559
5560 if (!ret) {
5561 bool do_merge = btr_cur_compress_recommendation(cursor, mtr);
5562 /* We are not allowed to merge because the appropriate
5563 latches were not taken while positioning the cursor. */
5564 if (!allow_merge && do_merge) {
5565 ib::info() << "Ignoring merge recommendation for page "
5566 << page_get_page_no(page) << " of index "
5567 << index->name
5568 << " because it could not be predicted early.";
5569 ut_ad(false);
5570 } else if (do_merge) {
5571
5572 ret = btr_cur_compress_if_useful(cursor, FALSE, mtr);
5573 }
5574 }
5575
5576 if (!srv_read_only_mode
5577 && page_is_leaf(page)
5578 && !dict_index_is_online_ddl(index)) {
5579
5580 mtr_memo_release(mtr, dict_index_get_lock(index),
5581 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);
5582
5583 /* NOTE: We cannot release the root block latch here, because it
5584 contains the segment header and has already been modified in most cases. */
5585 }
5586
5587 if (n_reserved > 0) {
5588 fil_space_release_free_extents(index->space, n_reserved);
5589 }
5590
5591 return(ret);
5592 }
5593
5594 /*******************************************************************//**
5595 Adds path information to the cursor for the current page, for which
5596 the binary search has been performed. */
5597 static
5598 void
5599 btr_cur_add_path_info(
5600 /*==================*/
5601 btr_cur_t* cursor, /*!< in: cursor positioned on a page */
5602 ulint height, /*!< in: height of the page in tree;
5603 0 means leaf node */
5604 ulint root_height) /*!< in: root node height in tree */
5605 {
5606 btr_path_t* slot;
5607 const rec_t* rec;
5608 const page_t* page;
5609
5610 ut_a(cursor->path_arr);
5611
5612 if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) {
5613 /* Do nothing; return empty path */
5614
5615 slot = cursor->path_arr;
5616 slot->nth_rec = ULINT_UNDEFINED;
5617
5618 return;
5619 }
5620
5621 if (height == 0) {
5622 /* Mark end of slots for path */
5623 slot = cursor->path_arr + root_height + 1;
5624 slot->nth_rec = ULINT_UNDEFINED;
5625 }
5626
5627 rec = btr_cur_get_rec(cursor);
5628
5629 slot = cursor->path_arr + (root_height - height);
5630
5631 page = page_align(rec);
5632
5633 slot->nth_rec = page_rec_get_n_recs_before(rec);
5634 slot->n_recs = page_get_n_recs(page);
5635 slot->page_no = page_get_page_no(page);
5636 slot->page_level = btr_page_get_level_low(page);
5637 }
5638
5639 /*******************************************************************//**
5640 Estimate the number of rows between slot1 and slot2 for any level on a
5641 B-tree. This function starts from slot1->page and reads a few pages to
5642 the right, counting their records. If we reach slot2->page quickly then
5643 we know exactly how many records there are between slot1 and slot2 and
5644 we set is_n_rows_exact to TRUE. If we cannot reach slot2->page quickly
5645 then we calculate the average number of records in the pages scanned
5646 so far and assume that all pages that we did not scan up to slot2->page
5647 contain the same number of records, then we multiply that average by
5648 the number of pages between slot1->page and slot2->page (which is
5649 n_rows_on_prev_level). In this case we set is_n_rows_exact to FALSE.
5650 @return number of rows, not including the borders (exact or estimated) */
5651 static
5652 int64_t
5653 btr_estimate_n_rows_in_range_on_level(
5654 /*==================================*/
5655 dict_index_t* index, /*!< in: index */
5656 btr_path_t* slot1, /*!< in: left border */
5657 btr_path_t* slot2, /*!< in: right border */
5658 int64_t n_rows_on_prev_level, /*!< in: number of rows
5659 on the previous level for the
5660 same descend paths; used to
5661 determine the number of pages
5662 on this level */
5663 ibool* is_n_rows_exact) /*!< out: TRUE if the returned
5664 value is exact i.e. not an
5665 estimation */
5666 {
5667 int64_t n_rows;
5668 ulint n_pages_read;
5669 ulint level;
5670
5671 n_rows = 0;
5672 n_pages_read = 0;
5673
5674 /* Assume by default that we will scan all pages between
5675 slot1->page_no and slot2->page_no. */
5676 *is_n_rows_exact = TRUE;
5677
5678 /* Add records from slot1->page_no which are to the right of
5679 the record which serves as a left border of the range, if any
5680 (we don't include the record itself in this count). */
5681 if (slot1->nth_rec <= slot1->n_recs) {
5682 n_rows += slot1->n_recs - slot1->nth_rec;
5683 }
5684
5685 /* Add records from slot2->page_no which are to the left of
5686 the record which serves as a right border of the range, if any
5687 (we don't include the record itself in this count). */
5688 if (slot2->nth_rec > 1) {
5689 n_rows += slot2->nth_rec - 1;
5690 }
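	/* Worked example (assumed numbers): if slot1 points at record 4 of
	10 on its page, the 6 records to its right are counted; if slot2
	points at record 3 on its page, the 2 records to its left are
	counted. Neither border record itself is included. */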
5691
5692 /* Count the records in the pages between slot1->page_no and
5693 slot2->page_no (non inclusive), if any. */
5694
5695 /* Do not read more than this number of pages in order not to hurt
5696 performance with this code which is just an estimation. If we read
5697 this many pages before reaching slot2->page_no then we estimate the
5698 average from the pages scanned so far. */
5699 # define N_PAGES_READ_LIMIT 10
5700
5701 page_id_t page_id(
5702 dict_index_get_space(index), slot1->page_no);
5703 const fil_space_t* space = fil_space_get(index->space);
5704 ut_ad(space);
5705 const page_size_t page_size(space->flags);
5706
5707 level = slot1->page_level;
5708
5709 do {
5710 mtr_t mtr;
5711 page_t* page;
5712 buf_block_t* block;
5713 dberr_t err = DB_SUCCESS;
5714
5715 mtr_start(&mtr);
5716
5717 /* Fetch the page. Because we are not holding the
5718 index->lock, the tree may have changed and we may be
5719 attempting to read a page that is no longer part of
5720 the B-tree. We pass BUF_GET_POSSIBLY_FREED in order to
5721 silence a debug assertion about this. */
5722 block = buf_page_get_gen(page_id, page_size, RW_S_LATCH,
5723 NULL, BUF_GET_POSSIBLY_FREED,
5724 __FILE__, __LINE__, &mtr);
5725
5726 ut_ad((block != NULL) == (err == DB_SUCCESS));
5727
5728 if (err != DB_SUCCESS) {
5729 if (err == DB_DECRYPTION_FAILED) {
5730 ib::warn() << "Table is encrypted, but the encryption"
5731 " service or the used key_id is not available."
5732 " Cannot continue reading the table.";
5733
5734 index->table->set_file_unreadable();
5735 }
5736
5737 mtr_commit(&mtr);
5738 goto inexact;
5739 }
5740
5741 page = buf_block_get_frame(block);
5742
5743 /* It is possible that the tree has been reorganized in the
5744 meantime and this is a different page. If this happens the
5745 calculated estimate will be bogus, which is not fatal as
5746 this is only an estimate. We are sure that a page with
5747 page_no exists because InnoDB never frees pages, only
5748 reuses them. */
5749 if (!fil_page_index_page_check(page)
5750 || btr_page_get_index_id(page) != index->id
5751 || btr_page_get_level_low(page) != level) {
5752
5753 /* The page got reused for something else */
5754 mtr_commit(&mtr);
5755 goto inexact;
5756 }
5757
5758 /* It is possible but highly unlikely that the page was
5759 originally written by an old version of InnoDB that did
5760 not initialize FIL_PAGE_TYPE on other than B-tree pages.
5761 For example, this could be an almost-empty BLOB page
5762 that happens to contain the magic values in the fields
5763 that we checked above. */
5764
5765 n_pages_read++;
5766
5767 if (page_id.page_no() != slot1->page_no) {
5768 /* Do not count the records on slot1->page_no,
5769 we already counted them before this loop. */
5770 n_rows += page_get_n_recs(page);
5771 }
5772
5773 page_id.set_page_no(btr_page_get_next(page, &mtr));
5774
5775 mtr_commit(&mtr);
5776
5777 if (n_pages_read == N_PAGES_READ_LIMIT
5778 || page_id.page_no() == FIL_NULL) {
5779 /* Either we read too many pages or
5780 we reached the end of the level without passing
5781 through slot2->page_no, the tree must have changed
5782 in the meantime */
5783 goto inexact;
5784 }
5785
5786 } while (page_id.page_no() != slot2->page_no);
5787
5788 return(n_rows);
5789
5790 inexact:
5791
5792 *is_n_rows_exact = FALSE;
5793
5794 /* We stopped before reaching slot2->page_no */
5795
5796 if (n_pages_read > 0) {
5797 /* The number of pages on this level is
5798 n_rows_on_prev_level, multiply it by the
5799 average number of recs per page so far */
5800 n_rows = n_rows_on_prev_level
5801 * n_rows / n_pages_read;
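		/* Worked example (assumed numbers): if the previous level
		implied n_rows_on_prev_level = 8 pages on this level, and we
		scanned n_pages_read = 5 pages holding n_rows = 200 records,
		the estimate becomes 8 * 200 / 5 = 320 rows. */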
5802 } else {
5803 /* The tree changed before we could even
5804 start with slot1->page_no */
5805 n_rows = 10;
5806 }
5807
5808 return(n_rows);
5809 }
5810
5811 /** If the tree gets changed too much between the two dives for the left
5812 and right boundary then btr_estimate_n_rows_in_range_low() will retry
5813 that many times before giving up and returning the value stored in
5814 rows_in_range_arbitrary_ret_val. */
5815 static const unsigned rows_in_range_max_retries = 4;
5816
5817 /** We pretend that a range has that many records if the tree keeps changing
5818 for rows_in_range_max_retries retries while we try to estimate the records
5819 in a given range. */
5820 static const int64_t rows_in_range_arbitrary_ret_val = 10;
5821
5822 /** Estimates the number of rows in a given index range.
5823 @param[in] index index
5824 @param[in] tuple1 range start, may also be empty tuple
5825 @param[in] mode1 search mode for range start
5826 @param[in] tuple2 range end, may also be empty tuple
5827 @param[in] mode2 search mode for range end
5828 @param[in] nth_attempt if the tree gets modified too much while
5829 we are trying to analyze it, then we will retry (this function will call
5830 itself, incrementing this parameter)
5831 @return estimated number of rows; if after rows_in_range_max_retries
5832 retries the tree keeps changing, then we will just return
5833 rows_in_range_arbitrary_ret_val as a result (if
5834 nth_attempt >= rows_in_range_max_retries and the tree is modified between
5835 the two dives). */
5836 static
5837 int64_t
5838 btr_estimate_n_rows_in_range_low(
5839 dict_index_t* index,
5840 const dtuple_t* tuple1,
5841 page_cur_mode_t mode1,
5842 const dtuple_t* tuple2,
5843 page_cur_mode_t mode2,
5844 unsigned nth_attempt)
5845 {
5846 btr_path_t path1[BTR_PATH_ARRAY_N_SLOTS];
5847 btr_path_t path2[BTR_PATH_ARRAY_N_SLOTS];
5848 btr_cur_t cursor;
5849 btr_path_t* slot1;
5850 btr_path_t* slot2;
5851 ibool diverged;
5852 ibool diverged_lot;
5853 ulint divergence_level;
5854 int64_t n_rows;
5855 ibool is_n_rows_exact;
5856 ulint i;
5857 mtr_t mtr;
5858 int64_t table_n_rows;
5859
5860 table_n_rows = dict_table_get_n_rows(index->table);
5861
5862 /* Below we dive to the two records specified by tuple1 and tuple2 and
5863 we remember the entire dive paths from the tree root. The place where
5864 the tuple1 path ends on the leaf level we call "left border" of our
5865 interval and the place where the tuple2 path ends on the leaf level -
5866 "right border". We take care to either include or exclude the interval
5867 boundaries depending on whether <, <=, > or >= was specified. For
5868 example if "5 < x AND x <= 10" then we should not include the left
5869 boundary, but should include the right one. */
5870
5871 mtr_start(&mtr);
5872
5873 cursor.path_arr = path1;
5874
5875 bool should_count_the_left_border = false;
5876
5877 if (dtuple_get_n_fields(tuple1) > 0) {
5878
5879 btr_cur_search_to_nth_level(index, 0, tuple1, mode1,
5880 BTR_SEARCH_LEAF | BTR_ESTIMATE,
5881 &cursor, 0,
5882 __FILE__, __LINE__, &mtr);
5883
5884 if (index->is_readable())
5885 {
5886 ut_ad(!page_rec_is_infimum(btr_cur_get_rec(&cursor)));
5887
5888 /* We should count the border if there are any records to
5889 match the criteria, i.e. if the maximum record on the tree is
5890 5 and x > 3 is specified then the cursor will be positioned at
5891 5 and we should count the border, but if x > 7 is specified,
5892 then the cursor will be positioned at 'sup' on the rightmost
5893 leaf page in the tree and we should not count the border. */
5894 should_count_the_left_border
5895 = !page_rec_is_supremum(btr_cur_get_rec(&cursor));
5896 }
5897 } else {
5898 dberr_t err = btr_cur_open_at_index_side(true, index,
5899 BTR_SEARCH_LEAF | BTR_ESTIMATE,
5900 &cursor, 0, &mtr);
5901
5902 if (err != DB_SUCCESS) {
5903 ib::warn() << "Error code: " << err
5904 << " in btr_estimate_n_rows_in_range_low()"
5905 << " called from file: "
5906 << __FILE__ << " line: " << __LINE__
5907 << " table: " << index->table->name
5908 << " index: " << index->name;
5909 }
5910
5911 if (index->is_readable()) {
5912 ut_ad(page_rec_is_infimum(btr_cur_get_rec(&cursor)));
5913
5914 /* The range specified is without a left border, just
5915 'x < 123' or 'x <= 123' and btr_cur_open_at_index_side()
5916 positioned the cursor on the infimum record on the leftmost
5917 page, which must not be counted. */
5918 should_count_the_left_border = false;
5919 }
5920 }
5921
5922 mtr_commit(&mtr);
5923
5924 if (!index->is_readable()) {
5925 return 0;
5926 }
5927
5928 #ifdef UNIV_DEBUG
5929 if (!strcmp(index->name, "iC")) {
5930 DEBUG_SYNC_C("btr_estimate_n_rows_in_range_between_dives");
5931 }
5932 #endif
5933
5934 mtr_start(&mtr);
5935
5936 cursor.path_arr = path2;
5937
5938 bool should_count_the_right_border;
5939
5940 if (dtuple_get_n_fields(tuple2) > 0) {
5941
5942 btr_cur_search_to_nth_level(index, 0, tuple2, mode2,
5943 BTR_SEARCH_LEAF | BTR_ESTIMATE,
5944 &cursor, 0,
5945 __FILE__, __LINE__, &mtr);
5946
5947 const rec_t* rec = btr_cur_get_rec(&cursor);
5948
5949 ut_ad(!(mode2 == PAGE_CUR_L && page_rec_is_supremum(rec)));
5950
5951 should_count_the_right_border
5952 = (mode2 == PAGE_CUR_LE /* if the range is '<=' */
5953 /* and the record was found */
5954 && cursor.low_match >= dtuple_get_n_fields(tuple2))
5955 || (mode2 == PAGE_CUR_L /* or if the range is '<' */
5956 /* and there are any records to match the criteria,
5957 i.e. if the minimum record on the tree is 5 and
5958 x < 7 is specified then the cursor will be
5959 positioned at 5 and we should count the border, but
5960 if x < 2 is specified, then the cursor will be
5961 positioned at 'inf' and we should not count the
5962 border */
5963 && !page_rec_is_infimum(rec));
5964 /* Notice that for "WHERE col <= 'foo'" MySQL passes to
5965 ha_innobase::records_in_range():
5966 min_key=NULL (left-unbounded) which is expected
5967 max_key='foo' flag=HA_READ_AFTER_KEY (PAGE_CUR_G), which is
5968 unexpected - one would expect
5969 flag=HA_READ_KEY_OR_PREV (PAGE_CUR_LE). In this case the
5970 cursor will be positioned on the first record to the right of
5971 the requested one (can also be positioned on the 'sup') and
5972 we should not count the right border. */
5973 } else {
5974 dberr_t err = btr_cur_open_at_index_side(false, index,
5975 BTR_SEARCH_LEAF | BTR_ESTIMATE,
5976 &cursor, 0, &mtr);
5977
5978 if (err != DB_SUCCESS) {
5979 ib::warn() << "Error code: " << err
5980 << " in btr_estimate_n_rows_in_range_low()"
5981 << " called from file: "
5982 << __FILE__ << " line: " << __LINE__
5983 << " table: " << index->table->name
5984 << " index: " << index->name;
5985 }
5986
5987 ut_ad(page_rec_is_supremum(btr_cur_get_rec(&cursor)));
5988
5989 /* The range specified is without a right border, just
5990 'x > 123' or 'x >= 123' and btr_cur_open_at_index_side()
5991 positioned the cursor on the supremum record on the rightmost
5992 page, which must not be counted. */
5993 should_count_the_right_border = false;
5994 }
5995
5996 mtr_commit(&mtr);
5997
5998 /* We have the path information for the range in path1 and path2 */
5999
6000 n_rows = 0;
6001 is_n_rows_exact = TRUE;
6002
6003 /* This becomes true when the two paths do not pass through the
6004 same pages anymore. */
6005 diverged = FALSE;
6006
6007 /* This becomes true when the paths are no longer the same or
6008 adjacent. Until that happens, they pass through the same or
6009 neighboring-on-the-same-level pages only. */
6010 diverged_lot = FALSE;
6011
6012 /* This is the level where paths diverged a lot. */
6013 divergence_level = 1000000;
6014
6015 for (i = 0; ; i++) {
6016 ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);
6017
6018 slot1 = path1 + i;
6019 slot2 = path2 + i;
6020
6021 if (slot1->nth_rec == ULINT_UNDEFINED
6022 || slot2->nth_rec == ULINT_UNDEFINED) {
6023
6024 /* Here none of the borders were counted. For example,
6025 if on the leaf level we descended to:
6026 (inf, a, b, c, d, e, f, sup)
6027 ^ ^
6028 path1 path2
6029 then n_rows will be 2 (c and d). */
6030
6031 if (is_n_rows_exact) {
6032 /* Only fiddle to adjust this off-by-one
6033 if the number is exact, otherwise we do
6034 much grosser adjustments below. */
6035
6036 btr_path_t* last1 = &path1[i - 1];
6037 btr_path_t* last2 = &path2[i - 1];
6038
6039 /* If both paths end up on the same record on
6040 the leaf level. */
6041 if (last1->page_no == last2->page_no
6042 && last1->nth_rec == last2->nth_rec) {
6043
6044 /* n_rows can be > 0 here if the paths
6045 were first different and then converged
6046 to the same record on the leaf level.
6047 For example:
6048 SELECT ... LIKE 'wait/synch/rwlock%'
6049 mode1=PAGE_CUR_GE,
6050 tuple1="wait/synch/rwlock"
6051 path1[0]={nth_rec=58, n_recs=58,
6052 page_no=3, page_level=1}
6053 path1[1]={nth_rec=56, n_recs=55,
6054 page_no=119, page_level=0}
6055
6056 mode2=PAGE_CUR_G
6057 tuple2="wait/synch/rwlock"
6058 path2[0]={nth_rec=57, n_recs=57,
6059 page_no=3, page_level=1}
6060 path2[1]={nth_rec=56, n_recs=55,
6061 page_no=119, page_level=0} */
6062
6063 /* If the range is such that we should
6064 count both borders, then avoid
6065 counting that record twice - once as a
6066 left border and once as a right
6067 border. */
6068 if (should_count_the_left_border
6069 && should_count_the_right_border) {
6070
6071 n_rows = 1;
6072 } else {
6073 /* Some of the borders should
6074 not be counted, e.g. [3,3). */
6075 n_rows = 0;
6076 }
6077 } else {
6078 if (should_count_the_left_border) {
6079 n_rows++;
6080 }
6081
6082 if (should_count_the_right_border) {
6083 n_rows++;
6084 }
6085 }
6086 }
6087
6088 if (i > divergence_level + 1 && !is_n_rows_exact) {
6089 /* In trees whose height is > 1 our algorithm
6090 tends to underestimate: multiply the estimate
6091 by 2: */
6092
6093 n_rows = n_rows * 2;
6094 }
6095
6096 DBUG_EXECUTE_IF("bug14007649", return(n_rows););
6097
6098 /* Do not estimate the number of rows in the range
6099 to over 1 / 2 of the estimated rows in the whole
6100 table */
6101
6102 if (n_rows > table_n_rows / 2 && !is_n_rows_exact) {
6103
6104 n_rows = table_n_rows / 2;
6105
6106 /* If there are just 0 or 1 rows in the table,
6107 then we estimate all rows are in the range */
6108
6109 if (n_rows == 0) {
6110 n_rows = table_n_rows;
6111 }
6112 }
6113
6114 return(n_rows);
6115 }
6116
6117 if (!diverged && slot1->nth_rec != slot2->nth_rec) {
6118
6119 /* If both slots do not point to the same page or if
6120 the paths have crossed and the same page on both
6121 apparently contains a different number of records,
6122 this means that the tree must have changed between
6123 the dive for slot1 and the dive for slot2 at the
6124 beginning of this function. */
6125 if (slot1->page_no != slot2->page_no
6126 || slot1->page_level != slot2->page_level
6127 || (slot1->nth_rec >= slot2->nth_rec
6128 && slot1->n_recs != slot2->n_recs)) {
6129
6130 /* If the tree keeps changing even after a
6131 few attempts, then just return some arbitrary
6132 number. */
6133 if (nth_attempt >= rows_in_range_max_retries) {
6134 return(rows_in_range_arbitrary_ret_val);
6135 }
6136
6137 const int64_t ret =
6138 btr_estimate_n_rows_in_range_low(
6139 index, tuple1, mode1,
6140 tuple2, mode2, nth_attempt + 1);
6141
6142 return(ret);
6143 }
6144
6145 diverged = TRUE;
6146
6147 if (slot1->nth_rec < slot2->nth_rec) {
6148 /* We do not count the borders (neither the left
6149 nor the right one), thus "- 1". */
6150 n_rows = slot2->nth_rec - slot1->nth_rec - 1;
6151
6152 if (n_rows > 0) {
6153 /* There is at least one row between
6154 the two borders pointed to by slot1
6155 and slot2, so on the level below the
6156 slots will point to non-adjacent
6157 pages. */
6158 diverged_lot = TRUE;
6159 divergence_level = i;
6160 }
6161 } else {
6162 /* It is possible that
6163 slot1->nth_rec >= slot2->nth_rec
6164 if, for example, we have a single page
6165 tree which contains (inf, 5, 6, supr)
6166 and we select where x > 20 and x < 30;
6167 in this case slot1->nth_rec will point
6168 to the supr record and slot2->nth_rec
6169 will point to 6 */
6170 return(0);
6171 }
6172
6173 } else if (diverged && !diverged_lot) {
6174
6175 if (slot1->nth_rec < slot1->n_recs
6176 || slot2->nth_rec > 1) {
6177
6178 diverged_lot = TRUE;
6179 divergence_level = i;
6180
6181 n_rows = 0;
6182
6183 if (slot1->nth_rec < slot1->n_recs) {
6184 n_rows += slot1->n_recs
6185 - slot1->nth_rec;
6186 }
6187
6188 if (slot2->nth_rec > 1) {
6189 n_rows += slot2->nth_rec - 1;
6190 }
6191 }
6192 } else if (diverged_lot) {
6193
6194 n_rows = btr_estimate_n_rows_in_range_on_level(
6195 index, slot1, slot2, n_rows,
6196 &is_n_rows_exact);
6197 }
6198 }
6199 }
6200
6201 /** Estimates the number of rows in a given index range.
6202 @param[in] index index
6203 @param[in] tuple1 range start, may also be empty tuple
6204 @param[in] mode1 search mode for range start
6205 @param[in] tuple2 range end, may also be empty tuple
6206 @param[in] mode2 search mode for range end
6207 @return estimated number of rows */
6208 int64_t
6209 btr_estimate_n_rows_in_range(
6210 dict_index_t* index,
6211 const dtuple_t* tuple1,
6212 page_cur_mode_t mode1,
6213 const dtuple_t* tuple2,
6214 page_cur_mode_t mode2)
6215 {
6216 const int64_t ret = btr_estimate_n_rows_in_range_low(
6217 index, tuple1, mode1, tuple2, mode2, 1 /* first attempt */);
6218
6219 return(ret);
6220 }
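/* Minimal caller sketch (assumed context, not part of this file):
ha_innobase::records_in_range() builds the two search tuples from the
MySQL key range and calls, for example:

	n_rows = btr_estimate_n_rows_in_range(index, tuple1, PAGE_CUR_GE,
					      tuple2, PAGE_CUR_LE);

An empty tuple (dtuple_get_n_fields() == 0) makes the corresponding
bound unbounded. */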
6221
6222 /*******************************************************************//**
6223 Record the number of non_null key values in a given index for
6224 each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
6225 The estimates are eventually stored in the array:
6226 index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */
6227 static
6228 void
6229 btr_record_not_null_field_in_rec(
6230 /*=============================*/
6231 ulint n_unique, /*!< in: dict_index_get_n_unique(index),
6232 number of columns that uniquely determine
6233 an index entry */
6234 const ulint* offsets, /*!< in: rec_get_offsets(rec, index),
6235 its size could be for all fields or
6236 that of "n_unique" */
6237 ib_uint64_t* n_not_null) /*!< in/out: array to record number of
6238 not null rows for n-column prefix */
6239 {
6240 ulint i;
6241
6242 ut_ad(rec_offs_n_fields(offsets) >= n_unique);
6243
6244 if (n_not_null == NULL) {
6245 return;
6246 }
6247
6248 for (i = 0; i < n_unique; i++) {
6249 if (rec_offs_nth_sql_null(offsets, i)) {
6250 break;
6251 }
6252
6253 n_not_null[i]++;
6254 }
6255 }
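/* Example: with n_unique = 3 and a record whose key prefix is
(1, NULL, 5), only n_not_null[0] is incremented; the loop stops at the
first SQL NULL, so columns after it are not counted. */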
6256
6257 /*******************************************************************//**
6258 Estimates the number of different key values in a given index, for
6259 each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
6260 The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
6261 0..n_uniq-1) and the number of pages that were sampled is saved in
6262 index->stat_n_sample_sizes[].
6263 If innodb_stats_method is nulls_ignored, we also record the number of
6264 non-null values for each prefix and store the estimates in the
6265 array index->stat_n_non_null_key_vals.
6266 @return true if the index is available and we get the estimated numbers,
6267 false if the index is unavailable. */
6268 bool
6269 btr_estimate_number_of_different_key_vals(
6270 /*======================================*/
6271 dict_index_t* index) /*!< in: index */
6272 {
6273 btr_cur_t cursor;
6274 page_t* page;
6275 rec_t* rec;
6276 ulint n_cols;
6277 ib_uint64_t* n_diff;
6278 ib_uint64_t* n_not_null;
6279 ibool stats_null_not_equal;
6280 uintmax_t n_sample_pages; /* number of pages to sample */
6281 ulint not_empty_flag = 0;
6282 ulint total_external_size = 0;
6283 ulint i;
6284 ulint j;
6285 uintmax_t add_on;
6286 mtr_t mtr;
6287 mem_heap_t* heap = NULL;
6288 ulint* offsets_rec = NULL;
6289 ulint* offsets_next_rec = NULL;
6290
6291 /* For a spatial index, no such statistics can be
6292 fetched. */
6293 if (dict_index_is_spatial(index)) {
6294 return(false);
6295 }
6296
6297 n_cols = dict_index_get_n_unique(index);
6298
6299 heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
6300 * n_cols
6301 + dict_index_get_n_fields(index)
6302 * (sizeof *offsets_rec
6303 + sizeof *offsets_next_rec));
6304
6305 n_diff = (ib_uint64_t*) mem_heap_zalloc(
6306 heap, n_cols * sizeof(n_diff[0]));
6307
6308 n_not_null = NULL;
6309
6310 /* Check srv_innodb_stats_method setting, and decide whether we
6311 need to record non-null value and also decide if NULL is
6312 considered equal (by setting stats_null_not_equal value) */
6313 switch (srv_innodb_stats_method) {
6314 case SRV_STATS_NULLS_IGNORED:
6315 n_not_null = (ib_uint64_t*) mem_heap_zalloc(
6316 heap, n_cols * sizeof *n_not_null);
6317 /* fall through */
6318
6319 case SRV_STATS_NULLS_UNEQUAL:
6320 /* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL
6321 case, we will treat NULLs as unequal value */
6322 stats_null_not_equal = TRUE;
6323 break;
6324
6325 case SRV_STATS_NULLS_EQUAL:
6326 stats_null_not_equal = FALSE;
6327 break;
6328
6329 default:
6330 ut_error;
6331 }
6332
6333 /* It makes no sense to test more pages than are contained
6334 in the index, thus we lower the number if it is too high */
6335 if (srv_stats_transient_sample_pages > index->stat_index_size) {
6336 if (index->stat_index_size > 0) {
6337 n_sample_pages = index->stat_index_size;
6338 } else {
6339 n_sample_pages = 1;
6340 }
6341 } else {
6342 n_sample_pages = srv_stats_transient_sample_pages;
6343 }
6344
6345 /* We sample some pages in the index to get an estimate */
6346
6347 for (i = 0; i < n_sample_pages; i++) {
6348 mtr_start(&mtr);
6349
6350 bool available;
6351
6352 available = btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF,
6353 &cursor, &mtr);
6354
6355 if (!available) {
6356 mtr_commit(&mtr);
6357 mem_heap_free(heap);
6358
6359 return(false);
6360 }
6361
6362 /* Count the number of different key values for each prefix of
6363 the key on this index page. If the prefix does not determine
6364 the index record uniquely in the B-tree, then we subtract one
6365 because otherwise our algorithm would give a wrong estimate
6366 for an index where there is just one key value. */
6367
6368 page = btr_cur_get_page(&cursor);
6369
6371 DBUG_EXECUTE_IF("ib_corrupt_page_while_stats_calc",
6372 page = NULL;);
6373
6374 SRV_CORRUPT_TABLE_CHECK(page,
6375 {
6376 mtr_commit(&mtr);
6377 goto exit_loop;
6378 });
6379
6380 rec = page_rec_get_next(page_get_infimum_rec(page));
6381
6382 if (!page_rec_is_supremum(rec)) {
6383 not_empty_flag = 1;
6384 offsets_rec = rec_get_offsets(rec, index, offsets_rec,
6385 ULINT_UNDEFINED, &heap);
6386
6387 if (n_not_null != NULL) {
6388 btr_record_not_null_field_in_rec(
6389 n_cols, offsets_rec, n_not_null);
6390 }
6391 }
6392
6393 while (!page_rec_is_supremum(rec)) {
6394 ulint matched_fields;
6395 rec_t* next_rec = page_rec_get_next(rec);
6396 if (page_rec_is_supremum(next_rec)) {
6397 total_external_size +=
6398 btr_rec_get_externally_stored_len(
6399 rec, offsets_rec);
6400 break;
6401 }
6402
6403 offsets_next_rec = rec_get_offsets(next_rec, index,
6404 offsets_next_rec,
6405 ULINT_UNDEFINED,
6406 &heap);
6407
6408 cmp_rec_rec_with_match(rec, next_rec,
6409 offsets_rec, offsets_next_rec,
6410 index,
6411 page_is_spatial_non_leaf(next_rec, index),
6412 stats_null_not_equal,
6413 &matched_fields);
6414
6415 for (j = matched_fields; j < n_cols; j++) {
6416 /* We add one if this index record has
6417 a different prefix from the previous */
6418
6419 n_diff[j]++;
6420 }
6421
6422 if (n_not_null != NULL) {
6423 btr_record_not_null_field_in_rec(
6424 n_cols, offsets_next_rec, n_not_null);
6425 }
6426
6427 total_external_size
6428 += btr_rec_get_externally_stored_len(
6429 rec, offsets_rec);
6430
6431 rec = next_rec;
6432 /* Initialize offsets_rec for the next round
6433 and assign the old offsets_rec buffer to
6434 offsets_next_rec. */
6435 {
6436 ulint* offsets_tmp = offsets_rec;
6437 offsets_rec = offsets_next_rec;
6438 offsets_next_rec = offsets_tmp;
6439 }
6440 }
6441
6443 if (n_cols == dict_index_get_n_unique_in_tree(index)) {
6444
6445 /* If there is more than one leaf page in the tree,
6446 we add one because we know that the first record
6447 on the page certainly had a different prefix than the
6448 last record on the previous index page in the
6449 alphabetical order. Before this fix, if there was
6450 just one big record on each clustered index page, the
6451 algorithm grossly underestimated the number of rows
6452 in the table. */
6453
6454 if (btr_page_get_prev(page, &mtr) != FIL_NULL
6455 || btr_page_get_next(page, &mtr) != FIL_NULL) {
6456
6457 n_diff[n_cols - 1]++;
6458 }
6459 }
6460
6461 mtr_commit(&mtr);
6462 }
6463
6464 exit_loop:
6465 /* If we saw k borders between different key values on
6466 n_sample_pages leaf pages, we can estimate how many
6467 there will be in index->stat_n_leaf_pages */
6468
6469 /* We must take into account that our sample actually represents
6470 also the pages used for external storage of fields (those pages are
6471 included in index->stat_n_leaf_pages) */
6472
6473 for (j = 0; j < n_cols; j++) {
6474 index->stat_n_diff_key_vals[j]
6475 = BTR_TABLE_STATS_FROM_SAMPLE(
6476 n_diff[j], index, n_sample_pages,
6477 total_external_size, not_empty_flag);
6478
6479 /* If the tree is small, smaller than
6480 10 * n_sample_pages + total_external_size, then
6481 the above estimate is ok. For bigger trees it is common that we
6482 do not see any borders between key values in the few pages
6483 we pick. But still there may be n_sample_pages
6484 different key values, or even more. Let us try to approximate
6485 that: */
6486
6487 add_on = index->stat_n_leaf_pages
6488 / (10 * (n_sample_pages
6489 + total_external_size));
6490
6491 if (add_on > n_sample_pages) {
6492 add_on = n_sample_pages;
6493 }
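		/* Worked example (assumed numbers): with
		stat_n_leaf_pages = 10000, n_sample_pages = 20 and
		total_external_size = 0, add_on = 10000 / 200 = 50,
		which is then clamped to n_sample_pages = 20. */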
6494
6495 index->stat_n_diff_key_vals[j] += add_on;
6496
6497 index->stat_n_sample_sizes[j] = n_sample_pages;
6498
6499 /* Update the stat_n_non_null_key_vals[] with our
6500 sampled result. stat_n_non_null_key_vals[] is created
6501 and initialized to zero in dict_index_add_to_cache(),
6502 along with stat_n_diff_key_vals[] array */
6503 if (n_not_null != NULL) {
6504 index->stat_n_non_null_key_vals[j] =
6505 BTR_TABLE_STATS_FROM_SAMPLE(
6506 n_not_null[j], index, n_sample_pages,
6507 total_external_size, not_empty_flag);
6508 }
6509 }
6510
6511 mem_heap_free(heap);
6512
6513 return(true);
6514 }
6515
6516 /*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/
6517
6518 /***********************************************************//**
6519 Gets the offset of the pointer to the externally stored part of a field.
6520 @return offset of the pointer to the externally stored part */
6521 static
6522 ulint
6523 btr_rec_get_field_ref_offs(
6524 /*=======================*/
6525 const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
6526 ulint n) /*!< in: index of the external field */
6527 {
6528 ulint field_ref_offs;
6529 ulint local_len;
6530
6531 ut_a(rec_offs_nth_extern(offsets, n));
6532 field_ref_offs = rec_get_nth_field_offs(offsets, n, &local_len);
6533 ut_a(local_len != UNIV_SQL_NULL);
6534 ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
6535
6536 return(field_ref_offs + local_len - BTR_EXTERN_FIELD_REF_SIZE);
6537 }
6538
6539 /** Gets a pointer to the externally stored part of a field.
6540 @param rec record
6541 @param offsets rec_get_offsets(rec)
6542 @param n index of the externally stored field
6543 @return pointer to the externally stored part */
6544 #define btr_rec_get_field_ref(rec, offsets, n) \
6545 ((rec) + btr_rec_get_field_ref_offs(offsets, n))
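/* For orientation: the BTR_EXTERN_FIELD_REF_SIZE (20-byte) pointer
located by these helpers is laid out as 4 bytes of space id, 4 bytes of
page number of the first BLOB part, 4 bytes of byte offset within that
page, and an 8-byte stored length whose most significant byte also
carries the BTR_EXTERN_OWNER_FLAG and BTR_EXTERN_INHERITED_FLAG bits
(see the flag checks further below). */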
6546
6547 /** Gets the externally stored size of a record, in units of a database page.
6548 @param[in] rec record
6549 @param[in] offsets array returned by rec_get_offsets()
6550 @return externally stored part, in units of a database page */
6551 ulint
6552 btr_rec_get_externally_stored_len(
6553 const rec_t* rec,
6554 const ulint* offsets)
6555 {
6556 ulint n_fields;
6557 ulint total_extern_len = 0;
6558 ulint i;
6559
6560 ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
6561
6562 if (!rec_offs_any_extern(offsets)) {
6563 return(0);
6564 }
6565
6566 n_fields = rec_offs_n_fields(offsets);
6567
6568 for (i = 0; i < n_fields; i++) {
6569 if (rec_offs_nth_extern(offsets, i)) {
6570
6571 ulint extern_len = mach_read_from_4(
6572 btr_rec_get_field_ref(rec, offsets, i)
6573 + BTR_EXTERN_LEN + 4);
6574
6575 total_extern_len += ut_calc_align(extern_len,
6576 UNIV_PAGE_SIZE);
6577 }
6578 }
6579
6580 return(total_extern_len / UNIV_PAGE_SIZE);
6581 }
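/* Worked example (assumed numbers): a single externally stored column
holding 100000 bytes on a 16KB-page instance yields
ut_calc_align(100000, 16384) = 114688, i.e. 7 pages. */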
6582
6583 /*******************************************************************//**
6584 Sets the ownership bit of an externally stored field in a record. */
6585 static
6586 void
6587 btr_cur_set_ownership_of_extern_field(
6588 /*==================================*/
6589 page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
6590 part will be updated, or NULL */
6591 rec_t* rec, /*!< in/out: clustered index record */
6592 dict_index_t* index, /*!< in: index of the page */
6593 const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
6594 ulint i, /*!< in: field number */
6595 ibool val, /*!< in: value to set */
6596 mtr_t* mtr) /*!< in: mtr, or NULL if not logged */
6597 {
6598 byte* data;
6599 ulint local_len;
6600 ulint byte_val;
6601
6602 data = rec_get_nth_field(rec, offsets, i, &local_len);
6603 ut_ad(rec_offs_nth_extern(offsets, i));
6604 ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
6605
6606 local_len -= BTR_EXTERN_FIELD_REF_SIZE;
6607
6608 byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN);
6609
6610 if (val) {
6611 byte_val = byte_val & (~BTR_EXTERN_OWNER_FLAG);
6612 } else {
6613 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
6614 ut_a(!(byte_val & BTR_EXTERN_OWNER_FLAG));
6615 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
6616 byte_val = byte_val | BTR_EXTERN_OWNER_FLAG;
6617 }
6618
6619 if (page_zip) {
6620 mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
6621 page_zip_write_blob_ptr(page_zip, rec, index, offsets, i, mtr);
6622 } else if (mtr != NULL) {
6623
6624 mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, byte_val,
6625 MLOG_1BYTE, mtr);
6626 } else {
6627 mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
6628 }
6629 }
6630
6631 /*******************************************************************//**
6632 Marks non-updated off-page fields as disowned by this record. The ownership
6633 must be transferred to the updated record which is inserted elsewhere in the
6634 index tree. In purge, only the owner of an externally stored field is allowed
6635 to free the field. */
6636 void
6637 btr_cur_disown_inherited_fields(
6638 /*============================*/
6639 page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
6640 part will be updated, or NULL */
6641 rec_t* rec, /*!< in/out: record in a clustered index */
6642 dict_index_t* index, /*!< in: index of the page */
6643 const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
6644 const upd_t* update, /*!< in: update vector */
6645 mtr_t* mtr) /*!< in/out: mini-transaction */
6646 {
6647 ulint i;
6648
6649 ut_ad(rec_offs_validate(rec, index, offsets));
6650 ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
6651 ut_ad(rec_offs_any_extern(offsets));
6652 ut_ad(mtr);
6653
6654 for (i = 0; i < rec_offs_n_fields(offsets); i++) {
6655 if (rec_offs_nth_extern(offsets, i)
6656 && !upd_get_field_by_field_no(update, i, false)) {
6657 btr_cur_set_ownership_of_extern_field(
6658 page_zip, rec, index, offsets, i, FALSE, mtr);
6659 }
6660 }
6661 }
6662
6663 /*******************************************************************//**
6664 Marks all extern fields in a record as owned by the record. This function
6665 should be called if the delete mark of a record is removed: a record
6666 that is not delete-marked always owns all its extern fields. */
6667 static
6668 void
6669 btr_cur_unmark_extern_fields(
6670 /*=========================*/
6671 page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
6672 part will be updated, or NULL */
6673 rec_t* rec, /*!< in/out: record in a clustered index */
6674 dict_index_t* index, /*!< in: index of the page */
6675 const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
6676 mtr_t* mtr) /*!< in: mtr, or NULL if not logged */
6677 {
6678 ulint n;
6679 ulint i;
6680
6681 ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
6682 n = rec_offs_n_fields(offsets);
6683
6684 if (!rec_offs_any_extern(offsets)) {
6685
6686 return;
6687 }
6688
6689 for (i = 0; i < n; i++) {
6690 if (rec_offs_nth_extern(offsets, i)) {
6691
6692 btr_cur_set_ownership_of_extern_field(
6693 page_zip, rec, index, offsets, i, TRUE, mtr);
6694 }
6695 }
6696 }
6697
6698 /*******************************************************************//**
6699 Returns the length of a BLOB part stored on the header page.
6700 @return part length */
6701 static
6702 ulint
6703 btr_blob_get_part_len(
6704 /*==================*/
6705 const byte* blob_header) /*!< in: blob header */
6706 {
6707 return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN));
6708 }
6709
6710 /*******************************************************************//**
6711 Returns the page number where the next BLOB part is stored.
6712 @return page number or FIL_NULL if no more pages */
6713 static
6714 ulint
6715 btr_blob_get_next_page_no(
6716 /*======================*/
6717 const byte* blob_header) /*!< in: blob header */
6718 {
6719 return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO));
6720 }
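/* Together these accessors let a reader walk the singly-linked chain
of BLOB parts. A minimal sketch (assumed surrounding code, not part of
this file; for chained parts the header starts at FIL_PAGE_DATA):

	while (page_no != FIL_NULL) {
		... latch the page, point blob_header at the part
		header ...
		total_len += btr_blob_get_part_len(blob_header);
		page_no = btr_blob_get_next_page_no(blob_header);
	}
*/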
6721
6722 /*******************************************************************//**
6723 Deallocate a buffer block that was reserved for a BLOB part. */
6724 static
6725 void
6726 btr_blob_free(
6727 /*==========*/
6728 dict_index_t* index, /*!< in: index */
6729 buf_block_t* block, /*!< in: buffer block */
6730 ibool all, /*!< in: TRUE=remove also the compressed page
6731 if there is one */
6732 mtr_t* mtr) /*!< in: mini-transaction to commit */
6733 {
6734 buf_pool_t* buf_pool = buf_pool_from_block(block);
6735 page_id_t page_id(block->page.id.space(),
6736 block->page.id.page_no());
6737 bool freed = false;
6738
6739 ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
6740
6741 mtr_commit(mtr);
6742
6743 mutex_enter(&buf_pool->LRU_list_mutex);
6744 buf_page_mutex_enter(block);
6745
6746 /* Only free the block if it is still allocated to
6747 the same file page. */
6748
6749 if (buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE
6750 && page_id.equals_to(block->page.id)) {
6751
6752 freed = buf_LRU_free_page(&block->page, all);
6753
6754 if (!freed && all && block->page.zip.data
6755 && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE
6756 && page_id.equals_to(block->page.id)) {
6757
6758 /* Attempt to deallocate the uncompressed page
6759 if the whole block cannot be deallocated. */
6760
6761 freed = buf_LRU_free_page(&block->page, false);
6762 }
6763 }
6764
6765 if (!freed) {
6766 mutex_exit(&buf_pool->LRU_list_mutex);
6767 buf_page_mutex_exit(block);
6768 }
6769 }
6770
6771 /** Helper class used while writing blob pages, during insert or update. */
6772 struct btr_blob_log_check_t {
6773 /** Persistent cursor on a clustered index record with blobs. */
6774 btr_pcur_t* m_pcur;
6775 /** Mini transaction holding the latches for m_pcur */
6776 mtr_t* m_mtr;
6777 /** rec_get_offsets(rec, index); offset of clust_rec */
6778 const ulint* m_offsets;
6779 /** The block containing clustered record */
6780 buf_block_t** m_block;
6781 /** The clustered record pointer */
6782 rec_t** m_rec;
6783 /** The blob operation code */
6784 enum blob_op m_op;
6785
6786 /** Constructor
6787 @param[in] pcur persistent cursor on a clustered
6788 index record with blobs.
6789 @param[in] mtr mini-transaction holding latches for
6790 pcur.
6791 @param[in] offsets offsets of the clust_rec
6792 @param[in,out] block record block containing pcur record
6793 @param[in,out] rec the clustered record pointer
6794 @param[in] op the blob operation code */
6795 btr_blob_log_check_t(
6796 btr_pcur_t* pcur,
6797 mtr_t* mtr,
6798 const ulint* offsets,
6799 buf_block_t** block,
6800 rec_t** rec,
6801 enum blob_op op)
6802 : m_pcur(pcur),
6803 m_mtr(mtr),
6804 m_offsets(offsets),
6805 m_block(block),
6806 m_rec(rec),
6807 m_op(op)
6808 {
6809 ut_ad(rec_offs_validate(*m_rec, m_pcur->index(), m_offsets));
6810 ut_ad((*m_block)->frame == page_align(*m_rec));
6811 ut_ad(*m_rec == btr_pcur_get_rec(m_pcur));
6812 }
6813
6814 /** Check if there is enough space in the redo log file. Commit and
6815 restart the mini-transaction. */
6816 void check()
6817 {
6818 dict_index_t* index = m_pcur->index();
6819 ulint offs = 0;
6820 ulint page_no = ULINT_UNDEFINED;
6821 FlushObserver* observer = m_mtr->get_flush_observer();
6822
6823 if (m_op == BTR_STORE_INSERT_BULK) {
6824 offs = page_offset(*m_rec);
6825 page_no = page_get_page_no(
6826 buf_block_get_frame(*m_block));
6827
6828 buf_block_buf_fix_inc(*m_block, __FILE__, __LINE__);
6829 } else {
6830 btr_pcur_store_position(m_pcur, m_mtr);
6831 }
6832 m_mtr->commit();
6833
6834 DEBUG_SYNC_C("blob_write_middle");
6835
6836 log_free_check();
6837
6838 DEBUG_SYNC_C("blob_write_middle_after_check");
6839
6840 const mtr_log_t log_mode = m_mtr->get_log_mode();
6841 m_mtr->start();
6842 m_mtr->set_log_mode(log_mode);
6843 m_mtr->set_named_space(index->space);
6844 m_mtr->set_flush_observer(observer);
6845
6846 if (m_op == BTR_STORE_INSERT_BULK) {
6847 page_id_t page_id(dict_index_get_space(index),
6848 page_no);
6849 page_size_t page_size(dict_table_page_size(
6850 index->table));
6851 page_cur_t* page_cur = &m_pcur->btr_cur.page_cur;
6852
6853 mtr_x_lock(dict_index_get_lock(index), m_mtr);
6854 page_cur->block = btr_block_get(
6855 page_id, page_size, RW_X_LATCH, index, m_mtr);
6856 page_cur->rec = buf_block_get_frame(page_cur->block)
6857 + offs;
6858
6859 buf_block_buf_fix_dec(page_cur->block);
6860 } else {
6861 ut_ad(m_pcur->rel_pos == BTR_PCUR_ON);
6862 bool ret = btr_pcur_restore_position(
6863 BTR_MODIFY_LEAF | BTR_MODIFY_EXTERNAL,
6864 m_pcur, m_mtr);
6865
6866 ut_a(ret);
6867 }
6868
6869 *m_block = btr_pcur_get_block(m_pcur);
6870 *m_rec = btr_pcur_get_rec(m_pcur);
6871
6872 ut_d(rec_offs_make_valid(
6873 *m_rec, index, const_cast<ulint*>(m_offsets)));
6874
6875 ut_ad(m_mtr->memo_contains_page_flagged(
6876 *m_rec,
6877 MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX)
6878 || dict_table_is_intrinsic(index->table));
6879
6880 ut_ad(mtr_memo_contains_flagged(m_mtr,
6881 dict_index_get_lock(index),
6882 MTR_MEMO_SX_LOCK | MTR_MEMO_X_LOCK)
6883 || dict_table_is_intrinsic(index->table));
6884 }
6885 };
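/* btr_store_big_rec_extern_fields() below constructs one
btr_blob_log_check_t over its persistent cursor and calls check()
every few BLOB pages, so that the mini-transaction is committed and
restarted before the redo log can fill up (see log_free_check()). */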
6886
6887
6888 /*******************************************************************//**
6889 Stores the fields in big_rec_vec to the tablespace and puts pointers to
6890 them in rec. The extern flags in rec will have to be set beforehand.
6891 The fields are stored on pages allocated from leaf node
6892 file segment of the index tree.
6893
6894 TODO: If the allocation extends the tablespace, it will not be redo-logged in
6895 any mini-transaction. Tablespace extension should be redo-logged, so that
6896 recovery will not fail when the big_rec was written to the extended portion of
6897 the file, in case the file was somehow truncated in the crash.
6898
6899 @return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
6900 dberr_t
6901 btr_store_big_rec_extern_fields(
6902 /*============================*/
6903 btr_pcur_t* pcur, /*!< in/out: a persistent cursor. If
6904 btr_mtr is restarted, then this can
6905 be repositioned. */
6906 const upd_t* upd, /*!< in: update vector */
6907 ulint* offsets, /*!< in/out: rec_get_offsets() on
6908 pcur. the "external storage" flags
6909 in offsets will correctly correspond
6910 to rec when this function returns */
6911 const big_rec_t*big_rec_vec, /*!< in: vector containing fields
6912 to be stored externally */
6913 mtr_t* btr_mtr, /*!< in/out: mtr containing the
6914 latches to the clustered index. can be
6915 committed and restarted. */
6916 enum blob_op op) /*!< in: operation code */
6917 {
6918 ulint rec_page_no;
6919 byte* field_ref;
6920 ulint extern_len;
6921 ulint store_len;
6922 ulint page_no;
6923 ulint space_id;
6924 ulint prev_page_no;
6925 ulint hint_page_no;
6926 ulint i;
6927 mtr_t mtr;
6928 mtr_t mtr_bulk;
6929 mem_heap_t* heap = NULL;
6930 page_zip_des_t* page_zip;
6931 z_stream c_stream;
6932 dberr_t error = DB_SUCCESS;
6933 dict_index_t* index = pcur->index();
6934 buf_block_t* rec_block = btr_pcur_get_block(pcur);
6935 rec_t* rec = btr_pcur_get_rec(pcur);
6936
6937 ut_ad(rec_offs_validate(rec, index, offsets));
6938 ut_ad(rec_offs_any_extern(offsets));
6939 ut_ad(btr_mtr);
6940 ut_ad(mtr_memo_contains_flagged(btr_mtr, dict_index_get_lock(index),
6941 MTR_MEMO_X_LOCK
6942 | MTR_MEMO_SX_LOCK)
6943 || dict_table_is_intrinsic(index->table)
6944 || !index->is_committed());
6945 ut_ad(mtr_is_block_fix(
6946 btr_mtr, rec_block, MTR_MEMO_PAGE_X_FIX, index->table));
6947 ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
6948 ut_a(dict_index_is_clust(index));
6949
6950 ut_a(dict_table_page_size(index->table)
6951 .equals_to(rec_block->page.size));
6952
6953 btr_blob_log_check_t redo_log(pcur, btr_mtr, offsets, &rec_block,
6954 &rec, op);
6955 page_zip = buf_block_get_page_zip(rec_block);
6956 space_id = rec_block->page.id.space();
6957 rec_page_no = rec_block->page.id.page_no();
6958 ut_a(fil_page_index_page_check(page_align(rec))
6959 || op == BTR_STORE_INSERT_BULK);
6960
6961 if (page_zip) {
6962 int err;
6963
6964 /* Zlib deflate needs 128 kilobytes for the default
6965 window size, plus 512 << memLevel, plus a few
6966 kilobytes for small objects. We use reduced memLevel
6967 to limit the memory consumption, and preallocate the
6968 heap, hoping to avoid memory fragmentation. */
6969 heap = mem_heap_create(250000);
6970 page_zip_set_alloc(&c_stream, heap);
6971
		err = deflateInit2(&c_stream, page_zip_level,
				   Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY);
		ut_a(err == Z_OK);
	}

#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
	/* All pointers to externally stored columns in the record
	must either be zero or they must be pointers to inherited
	columns, owned by this record or an earlier record version. */
	for (i = 0; i < big_rec_vec->n_fields; i++) {
		field_ref = btr_rec_get_field_ref(
			rec, offsets, big_rec_vec->fields[i].field_no);

		ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
		/* Either this must be an update in place,
		or the BLOB must be inherited, or the BLOB pointer
		must be zero (will be written in this function). */
		ut_a(op == BTR_STORE_UPDATE
		     || (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
		     || !memcmp(field_ref, field_ref_zero,
				BTR_EXTERN_FIELD_REF_SIZE));
	}
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */

	const page_size_t	page_size(dict_table_page_size(index->table));

	/* Space available in compressed page to carry blob data */
	const ulint	payload_size_zip = page_size.physical()
		- FIL_PAGE_DATA;

	/* Space available in uncompressed page to carry blob data */
	const ulint	payload_size = page_size.physical()
		- FIL_PAGE_DATA - BTR_BLOB_HDR_SIZE - FIL_PAGE_DATA_END;
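	/* For example, with the default 16 KiB page size this is
	16384 - 38 - 8 - 8 = 16330 bytes of BLOB data per page
	(assuming the usual values FIL_PAGE_DATA = 38,
	BTR_BLOB_HDR_SIZE = 8 and FIL_PAGE_DATA_END = 8). */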

	/* We have to create a file segment to the tablespace
	for each field and put the pointer to the field in rec */

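	/* Note: each BLOB pointer (field reference) occupies
	BTR_EXTERN_FIELD_REF_SIZE = 20 bytes in the record: a 4-byte
	space id, the 4-byte page number of the first BLOB page, a
	4-byte offset within that page, and an 8-byte length of the
	externally stored part, whose most significant byte also
	carries the BTR_EXTERN_OWNER_FLAG and
	BTR_EXTERN_INHERITED_FLAG bits. */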
	for (i = 0; i < big_rec_vec->n_fields; i++) {
		const ulint	field_no = big_rec_vec->fields[i].field_no;

		field_ref = btr_rec_get_field_ref(rec, offsets, field_no);
#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
		/* A zero BLOB pointer should have been initially inserted. */
		ut_a(!memcmp(field_ref, field_ref_zero,
			     BTR_EXTERN_FIELD_REF_SIZE));
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
		extern_len = big_rec_vec->fields[i].len;
		UNIV_MEM_ASSERT_RW(big_rec_vec->fields[i].data,
				   extern_len);

		ut_a(extern_len > 0);

		prev_page_no = FIL_NULL;

		if (page_zip) {
			int	err = deflateReset(&c_stream);
			ut_a(err == Z_OK);

			c_stream.next_in = (Bytef*)
				big_rec_vec->fields[i].data;
			c_stream.avail_in = static_cast<uInt>(extern_len);
		}

		for (ulint blob_npages = 0;; ++blob_npages) {
			buf_block_t*	block;
			page_t*		page;
			const ulint	commit_freq = 4;
			ulint		r_extents;

			ut_ad(page_align(field_ref) == page_align(rec));

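			/* Every commit_freq-th BLOB page, let
			redo_log commit and restart btr_mtr, so that
			the mini-transaction holding the B-tree
			latches does not accumulate changes for the
			whole BLOB. The restart may relocate the
			record, so the field reference and the page
			number are fetched again here. */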
			if (!(blob_npages % commit_freq)) {

				redo_log.check();

				field_ref = btr_rec_get_field_ref(
					rec, offsets, field_no);

				page_zip = buf_block_get_page_zip(rec_block);
				rec_page_no = rec_block->page.id.page_no();
			}

			mtr_start(&mtr);
			mtr.set_named_space(index->space);
			mtr.set_log_mode(btr_mtr->get_log_mode());
			mtr.set_flush_observer(btr_mtr->get_flush_observer());

			buf_page_get(rec_block->page.id,
				     rec_block->page.size, RW_X_LATCH, &mtr);

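			/* Hint the allocator to place the first BLOB
			page right after the clustered index leaf
			page, and every further BLOB page right after
			the previous one, so that the chain stays
			roughly sequential on disk. */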
			if (prev_page_no == FIL_NULL) {
				hint_page_no = 1 + rec_page_no;
			} else {
				hint_page_no = prev_page_no + 1;
			}

			mtr_t	*alloc_mtr;

			if (op == BTR_STORE_INSERT_BULK) {
				mtr_start(&mtr_bulk);
				mtr_bulk.set_spaces(mtr);
				alloc_mtr = &mtr_bulk;
			} else {
				alloc_mtr = &mtr;
			}

			if (!fsp_reserve_free_extents(&r_extents, space_id, 1,
						      FSP_BLOB, alloc_mtr,
						      1)) {

				mtr_commit(alloc_mtr);
				error = DB_OUT_OF_FILE_SPACE;
				goto func_exit;
			}

			block = btr_page_alloc(index, hint_page_no, FSP_NO_DIR,
					       0, alloc_mtr, &mtr);

			alloc_mtr->release_free_extents(r_extents);

			if (op == BTR_STORE_INSERT_BULK) {
				mtr_commit(&mtr_bulk);
			}

			ut_a(block != NULL);

			page_no = block->page.id.page_no();
			page = buf_block_get_frame(block);

			if (prev_page_no != FIL_NULL) {
				buf_block_t*	prev_block;
				page_t*		prev_page;

				prev_block = buf_page_get(
					page_id_t(space_id, prev_page_no),
					rec_block->page.size,
					RW_X_LATCH, &mtr);

				buf_block_dbg_add_level(prev_block,
							SYNC_EXTERN_STORAGE);
				prev_page = buf_block_get_frame(prev_block);

				if (page_zip) {
					mlog_write_ulint(
						prev_page + FIL_PAGE_NEXT,
						page_no, MLOG_4BYTES, &mtr);
					memcpy(buf_block_get_page_zip(
						       prev_block)
					       ->data + FIL_PAGE_NEXT,
					       prev_page + FIL_PAGE_NEXT, 4);
				} else {
					mlog_write_ulint(
						prev_page + FIL_PAGE_DATA
						+ BTR_BLOB_HDR_NEXT_PAGE_NO,
						page_no, MLOG_4BYTES, &mtr);
				}

			} else if (dict_index_is_online_ddl(index)) {
				row_log_table_blob_alloc(index, page_no);
			}

			if (page_zip) {
				int		err;
				page_zip_des_t*	blob_page_zip;

				/* Write FIL_PAGE_TYPE to the redo log
				separately, before logging any other
				changes to the page, so that the debug
				assertions in
				recv_parse_or_apply_log_rec_body() can
				be made simpler. Before InnoDB Plugin
				1.0.4, the initialization of
				FIL_PAGE_TYPE was logged as part of
				the mlog_log_string() below. */

				mlog_write_ulint(page + FIL_PAGE_TYPE,
						 prev_page_no == FIL_NULL
						 ? FIL_PAGE_TYPE_ZBLOB
						 : FIL_PAGE_TYPE_ZBLOB2,
						 MLOG_2BYTES, &mtr);

				c_stream.next_out = page
					+ FIL_PAGE_DATA;
				c_stream.avail_out = static_cast<uInt>(
					payload_size_zip);

				err = deflate(&c_stream, Z_FINISH);
				ut_a(err == Z_OK || err == Z_STREAM_END);
				ut_a(err == Z_STREAM_END
				     || c_stream.avail_out == 0);

				/* Write the "next BLOB page" pointer */
				mlog_write_ulint(page + FIL_PAGE_NEXT,
						 FIL_NULL, MLOG_4BYTES, &mtr);
				/* Initialize the unused "prev page" pointer */
				mlog_write_ulint(page + FIL_PAGE_PREV,
						 FIL_NULL, MLOG_4BYTES, &mtr);
				/* Write a back pointer to the record
				into the otherwise unused area. This
				information could be useful in
				debugging. Later, we might want to
				implement the possibility to relocate
				BLOB pages. Then, we would need to be
				able to adjust the BLOB pointer in the
				record. We do not store the heap
				number of the record, because it can
				change in page_zip_reorganize() or
				btr_page_reorganize(). However, also
				the page number of the record may
				change when B-tree nodes are split or
				merged.
				NOTE: FIL_PAGE_FILE_FLUSH_LSN space is
				used by R-tree index for a Split Sequence
				Number */
				ut_ad(!dict_index_is_spatial(index));

				mlog_write_ulint(page
						 + FIL_PAGE_FILE_FLUSH_LSN,
						 space_id,
						 MLOG_4BYTES, &mtr);
				mlog_write_ulint(page
						 + FIL_PAGE_FILE_FLUSH_LSN + 4,
						 rec_page_no,
						 MLOG_4BYTES, &mtr);

				/* Zero out the unused part of the page. */
				memset(page + page_zip_get_size(page_zip)
				       - c_stream.avail_out,
				       0, c_stream.avail_out);
				mlog_log_string(page + FIL_PAGE_FILE_FLUSH_LSN,
						page_zip_get_size(page_zip)
						- FIL_PAGE_FILE_FLUSH_LSN,
						&mtr);
				/* Copy the page to compressed storage,
				because it will be flushed to disk
				from there. */
				blob_page_zip = buf_block_get_page_zip(block);
				ut_ad(blob_page_zip);
				ut_ad(page_zip_get_size(blob_page_zip)
				      == page_zip_get_size(page_zip));
				memcpy(blob_page_zip->data, page,
				       page_zip_get_size(page_zip));

				if (err == Z_OK && prev_page_no != FIL_NULL) {

					goto next_zip_page;
				}

				if (err == Z_STREAM_END) {
					mach_write_to_4(field_ref
							+ BTR_EXTERN_LEN, 0);
					mach_write_to_4(field_ref
							+ BTR_EXTERN_LEN + 4,
							c_stream.total_in);
				} else {
					memset(field_ref + BTR_EXTERN_LEN,
					       0, 8);
				}

				if (prev_page_no == FIL_NULL) {
					ut_ad(blob_npages == 0);
					mach_write_to_4(field_ref
							+ BTR_EXTERN_SPACE_ID,
							space_id);

					mach_write_to_4(field_ref
							+ BTR_EXTERN_PAGE_NO,
							page_no);

					mach_write_to_4(field_ref
							+ BTR_EXTERN_OFFSET,
							FIL_PAGE_NEXT);
				}
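
				/* Note: for compressed BLOBs,
				BTR_EXTERN_OFFSET points at the
				FIL_PAGE_NEXT field rather than at the
				data: the compressed stream itself is
				written at FIL_PAGE_DATA, and
				btr_copy_zblob_prefix() adjusts the
				offset accordingly when reading. */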

				/* During bulk insert, the clustered
				index page is compressed only when the
				bulk insert finishes, so we do not
				write the BLOB pointer into the
				compressed page here. */
				if (op != BTR_STORE_INSERT_BULK) {
					page_zip_write_blob_ptr(
						page_zip, rec, index, offsets,
						field_no, &mtr);
				}

next_zip_page:
				prev_page_no = page_no;

				/* Commit mtr and release the
				uncompressed page frame to save memory. */
				btr_blob_free(index, block, FALSE, &mtr);

				if (err == Z_STREAM_END) {
					break;
				}
			} else {
				mlog_write_ulint(page + FIL_PAGE_TYPE,
						 FIL_PAGE_TYPE_BLOB,
						 MLOG_2BYTES, &mtr);

				if (extern_len > payload_size) {
					store_len = payload_size;
				} else {
					store_len = extern_len;
				}

				mlog_write_string(page + FIL_PAGE_DATA
						  + BTR_BLOB_HDR_SIZE,
						  (const byte*)
						  big_rec_vec->fields[i].data
						  + big_rec_vec->fields[i].len
						  - extern_len,
						  store_len, &mtr);
				mlog_write_ulint(page + FIL_PAGE_DATA
						 + BTR_BLOB_HDR_PART_LEN,
						 store_len, MLOG_4BYTES, &mtr);
				mlog_write_ulint(page + FIL_PAGE_DATA
						 + BTR_BLOB_HDR_NEXT_PAGE_NO,
						 FIL_NULL, MLOG_4BYTES, &mtr);

				extern_len -= store_len;

				mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0,
						 MLOG_4BYTES, &mtr);
				mlog_write_ulint(field_ref
						 + BTR_EXTERN_LEN + 4,
						 big_rec_vec->fields[i].len
						 - extern_len,
						 MLOG_4BYTES, &mtr);

				if (prev_page_no == FIL_NULL) {
					ut_ad(blob_npages == 0);
					mlog_write_ulint(field_ref
							 + BTR_EXTERN_SPACE_ID,
							 space_id, MLOG_4BYTES,
							 &mtr);

					mlog_write_ulint(field_ref
							 + BTR_EXTERN_PAGE_NO,
							 page_no, MLOG_4BYTES,
							 &mtr);

					mlog_write_ulint(field_ref
							 + BTR_EXTERN_OFFSET,
							 FIL_PAGE_DATA,
							 MLOG_4BYTES,
							 &mtr);
				}

				prev_page_no = page_no;

				mtr_commit(&mtr);

				if (extern_len == 0) {
					break;
				}
			}
		}

		DBUG_EXECUTE_IF("btr_store_big_rec_extern",
				error = DB_OUT_OF_FILE_SPACE;
				goto func_exit;);

		rec_offs_make_nth_extern(offsets, field_no);
	}

func_exit:
	if (page_zip) {
		deflateEnd(&c_stream);
	}

	if (heap != NULL) {
		mem_heap_free(heap);
	}

#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
	/* All pointers to externally stored columns in the record
	must be valid. */
	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		if (!rec_offs_nth_extern(offsets, i)) {
			continue;
		}

		field_ref = btr_rec_get_field_ref(rec, offsets, i);

		/* The pointer must not be zero if the operation
		succeeded. */
		ut_a(0 != memcmp(field_ref, field_ref_zero,
				 BTR_EXTERN_FIELD_REF_SIZE)
		     || error != DB_SUCCESS);
		/* The column must not be disowned by this record. */
		ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
	}
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
	return(error);
}

/*******************************************************************//**
Check the FIL_PAGE_TYPE on an uncompressed BLOB page. */
static
void
btr_check_blob_fil_page_type(
/*=========================*/
	ulint		space_id,	/*!< in: space id */
	ulint		page_no,	/*!< in: page number */
	const page_t*	page,		/*!< in: page */
	ibool		read)		/*!< in: TRUE=read, FALSE=purge */
{
	ulint	type = fil_page_get_type(page);

	ut_a(space_id == page_get_space_id(page));
	ut_a(page_no == page_get_page_no(page));

	if (UNIV_UNLIKELY(type != FIL_PAGE_TYPE_BLOB)) {
		ulint	flags = fil_space_get_flags(space_id);

#ifndef UNIV_DEBUG /* Improve debug test coverage */
		if (dict_tf_get_format(flags) == UNIV_FORMAT_A) {
			/* Old versions of InnoDB did not initialize
			FIL_PAGE_TYPE on BLOB pages. Do not print
			anything about the type mismatch when reading
			a BLOB page that is in Antelope format.*/
			return;
		}
#endif /* !UNIV_DEBUG */

		ib::fatal() << "FIL_PAGE_TYPE=" << type
			<< " on BLOB " << (read ? "read" : "purge")
			<< " space " << space_id << " page " << page_no
			<< " flags " << flags;
	}
}

/*******************************************************************//**
Frees the space in an externally stored field to the file space
management if the field in data is owned by the externally stored field.
In a rollback, we may have the additional condition that the field must
not be inherited. */
void
btr_free_externally_stored_field(
/*=============================*/
	dict_index_t*	index,		/*!< in: index of the data, the index
					tree MUST be X-latched; if the tree
					height is 1, then also the root page
					must be X-latched! (this is relevant
					in the case this function is called
					from purge where 'data' is located on
					an undo log page, not an index
					page) */
	byte*		field_ref,	/*!< in/out: field reference */
	const rec_t*	rec,		/*!< in: record containing field_ref, for
					page_zip_write_blob_ptr(), or NULL */
	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index),
					or NULL */
	page_zip_des_t*	page_zip,	/*!< in: compressed page corresponding
					to rec, or NULL if rec == NULL */
	ulint		i,		/*!< in: field number of field_ref;
					ignored if rec == NULL */
	bool		rollback,	/*!< in: performing rollback? */
	mtr_t*		local_mtr)	/*!< in: mtr containing the latch to
					data and an X-latch to the index
					tree */
{
	page_t*		page;
	const ulint	space_id	= mach_read_from_4(
		field_ref + BTR_EXTERN_SPACE_ID);
	const ulint	start_page	= mach_read_from_4(
		field_ref + BTR_EXTERN_PAGE_NO);
	ulint		page_no;
	ulint		next_page_no;
	mtr_t		mtr;

	ut_ad(dict_index_is_clust(index));
	ut_ad(mtr_memo_contains_flagged(local_mtr, dict_index_get_lock(index),
					MTR_MEMO_X_LOCK
					| MTR_MEMO_SX_LOCK)
	      || dict_table_is_intrinsic(index->table));
	ut_ad(mtr_is_page_fix(
		local_mtr, field_ref, MTR_MEMO_PAGE_X_FIX, index->table));
	ut_ad(!rec || rec_offs_validate(rec, index, offsets));
	ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i));
	ut_ad(local_mtr->is_named_space(
		      page_get_space_id(page_align(field_ref))));

	if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
				  BTR_EXTERN_FIELD_REF_SIZE))) {
		/* In the rollback, we may encounter a clustered index
		record with some unwritten off-page columns. There is
		nothing to free then. */
		ut_a(rollback);
		return;
	}

	ut_ad(!(mach_read_from_4(field_ref + BTR_EXTERN_LEN)
		& ~((BTR_EXTERN_OWNER_FLAG
		     | BTR_EXTERN_INHERITED_FLAG) << 24)));
	ut_ad(space_id == index->space);

	const page_size_t	ext_page_size(dict_table_page_size(index->table));
	const page_size_t&	rec_page_size(rec == NULL
					      ? univ_page_size
					      : ext_page_size);
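	/* If rec == NULL, field_ref resides on an undo log page, not
	on an index page (see the function comment above), and undo
	log pages use the uncompressed page size; hence univ_page_size
	is used here. */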
	if (rec == NULL) {
		/* This is a call from row_purge_upd_exist_or_extern(). */
		ut_ad(!page_zip);
	}

	for (;;) {
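		/* Each iteration frees one BLOB page and, within its
		own mini-transaction, advances the page number stored
		in the BLOB pointer to the next page in the chain, so
		that a crash in the middle leaves a consistent,
		shorter chain. The function returns from inside the
		loop once that page number is FIL_NULL. */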
#ifdef UNIV_DEBUG
		buf_block_t*	rec_block;
#endif /* UNIV_DEBUG */
		buf_block_t*	ext_block;

		mtr_start(&mtr);
		mtr.set_spaces(*local_mtr);
		mtr.set_log_mode(local_mtr->get_log_mode());

		ut_ad(!dict_table_is_temporary(index->table)
		      || local_mtr->get_log_mode() == MTR_LOG_NO_REDO);

		const page_t*	p = page_align(field_ref);

		const page_id_t	page_id(page_get_space_id(p),
					page_get_page_no(p));

#ifdef UNIV_DEBUG
		rec_block =
#endif /* UNIV_DEBUG */
		buf_page_get(page_id, rec_page_size, RW_X_LATCH, &mtr);

		buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK);
		page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);

		if (/* There is no external storage data */
		    page_no == FIL_NULL
		    /* This field does not own the externally stored field */
		    || (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
			& BTR_EXTERN_OWNER_FLAG)
		    /* Rollback and inherited field */
		    || (rollback
			&& (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
			    & BTR_EXTERN_INHERITED_FLAG))) {

			/* Do not free */
			mtr_commit(&mtr);

			return;
		}

		if (page_no == start_page && dict_index_is_online_ddl(index)) {
			row_log_table_blob_free(index, start_page);
		}

		ext_block = buf_page_get(
			page_id_t(space_id, page_no), ext_page_size,
			RW_X_LATCH, &mtr);

		buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
		page = buf_block_get_frame(ext_block);

		if (ext_page_size.is_compressed()) {
			/* Note that page_zip will be NULL
			in row_purge_upd_exist_or_extern(). */
			switch (fil_page_get_type(page)) {
			case FIL_PAGE_TYPE_ZBLOB:
			case FIL_PAGE_TYPE_ZBLOB2:
				break;
			default:
				ut_error;
			}
			next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);

			btr_page_free_low(index, ext_block, ULINT_UNDEFINED,
					  &mtr);

			if (page_zip != NULL) {
				mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
						next_page_no);
				mach_write_to_4(field_ref + BTR_EXTERN_LEN + 4,
						0);
				page_zip_write_blob_ptr(page_zip, rec, index,
							offsets, i, &mtr);
			} else {
				mlog_write_ulint(field_ref
						 + BTR_EXTERN_PAGE_NO,
						 next_page_no,
						 MLOG_4BYTES, &mtr);
				mlog_write_ulint(field_ref
						 + BTR_EXTERN_LEN + 4, 0,
						 MLOG_4BYTES, &mtr);
			}
		} else {
			ut_a(!page_zip);
			btr_check_blob_fil_page_type(space_id, page_no, page,
						     FALSE);

			next_page_no = mach_read_from_4(
				page + FIL_PAGE_DATA
				+ BTR_BLOB_HDR_NEXT_PAGE_NO);

			btr_page_free_low(index, ext_block, ULINT_UNDEFINED,
					  &mtr);

			mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO,
					 next_page_no,
					 MLOG_4BYTES, &mtr);
			/* Zero out the BLOB length. If the server
			crashes during the execution of this function,
			trx_rollback_or_clean_all_recovered() could
			dereference the half-deleted BLOB, fetching a
			wrong prefix for the BLOB. */
			mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4,
					 0,
					 MLOG_4BYTES, &mtr);
		}

		/* Commit mtr and release the BLOB block to save memory. */
		btr_blob_free(index, ext_block, TRUE, &mtr);
	}
}

/***********************************************************//**
Frees the externally stored fields for a record. */
static
void
btr_rec_free_externally_stored_fields(
/*==================================*/
	dict_index_t*	index,	/*!< in: index of the data, the index
				tree MUST be X-latched */
	rec_t*		rec,	/*!< in/out: record */
	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
				part will be updated, or NULL */
	bool		rollback,/*!< in: performing rollback? */
	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the index
				tree */
{
	ulint	n_fields;
	ulint	i;

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(mtr_is_page_fix(mtr, rec, MTR_MEMO_PAGE_X_FIX, index->table));
	/* Free possible externally stored fields in the record */

	ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets));
	n_fields = rec_offs_n_fields(offsets);

	for (i = 0; i < n_fields; i++) {
		if (rec_offs_nth_extern(offsets, i)) {
			btr_free_externally_stored_field(
				index, btr_rec_get_field_ref(rec, offsets, i),
				rec, offsets, page_zip, i, rollback, mtr);
		}
	}
}

/***********************************************************//**
Frees the externally stored fields for a record, if the field is mentioned
in the update vector. */
static
void
btr_rec_free_updated_extern_fields(
/*===============================*/
	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
				X-latched */
	rec_t*		rec,	/*!< in/out: record */
	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
				part will be updated, or NULL */
	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
	const upd_t*	update,	/*!< in: update vector */
	bool		rollback,/*!< in: performing rollback? */
	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the tree */
{
	ulint	n_fields;
	ulint	i;

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(mtr_is_page_fix(mtr, rec, MTR_MEMO_PAGE_X_FIX, index->table));

	/* Free possible externally stored fields in the record */

	n_fields = upd_get_n_fields(update);

	for (i = 0; i < n_fields; i++) {
		const upd_field_t* ufield = upd_get_nth_field(update, i);

		if (rec_offs_nth_extern(offsets, ufield->field_no)) {
			ulint	len;
			byte*	data = rec_get_nth_field(
				rec, offsets, ufield->field_no, &len);
			ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);

			btr_free_externally_stored_field(
				index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
				rec, offsets, page_zip,
				ufield->field_no, rollback, mtr);
		}
	}
}

/*******************************************************************//**
Copies the prefix of an uncompressed BLOB. The clustered index record
that points to this BLOB must be protected by a lock or a page latch.
@return number of bytes written to buf */
static
ulint
btr_copy_blob_prefix(
/*=================*/
	byte*		buf,	/*!< out: the externally stored part of
				the field, or a prefix of it */
	ulint		len,	/*!< in: length of buf, in bytes */
	ulint		space_id,/*!< in: space id of the BLOB pages */
	ulint		page_no,/*!< in: page number of the first BLOB page */
	ulint		offset)	/*!< in: offset on the first BLOB page */
{
	ulint	copied_len = 0;

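	/* On each BLOB page, an 8-byte header precedes the data: a
	4-byte part length (BTR_BLOB_HDR_PART_LEN) followed by the
	4-byte page number of the next BLOB page
	(BTR_BLOB_HDR_NEXT_PAGE_NO). On the first page the header
	starts at the offset stored in the BLOB pointer; on the
	following pages it starts at FIL_PAGE_DATA. */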
	for (;;) {
		mtr_t		mtr;
		buf_block_t*	block;
		const page_t*	page;
		const byte*	blob_header;
		ulint		part_len;
		ulint		copy_len;

		mtr_start(&mtr);

		block = buf_page_get(page_id_t(space_id, page_no),
				     univ_page_size, RW_S_LATCH, &mtr);
		buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
		page = buf_block_get_frame(block);

		btr_check_blob_fil_page_type(space_id, page_no, page, TRUE);

		blob_header = page + offset;
		part_len = btr_blob_get_part_len(blob_header);
		copy_len = ut_min(part_len, len - copied_len);

		memcpy(buf + copied_len,
		       blob_header + BTR_BLOB_HDR_SIZE, copy_len);
		copied_len += copy_len;

		page_no = btr_blob_get_next_page_no(blob_header);

		mtr_commit(&mtr);

		if (page_no == FIL_NULL || copy_len != part_len) {
			UNIV_MEM_ASSERT_RW(buf, copied_len);
			return(copied_len);
		}

		/* On BLOB pages other than the first, the BLOB header
		is always at the start of the page data: */

		offset = FIL_PAGE_DATA;

		ut_ad(copied_len <= len);
	}
}

/** Copies the prefix of a compressed BLOB.
The clustered index record that points to this BLOB must be protected
by a lock or a page latch.
@param[out]	buf		the externally stored part of the field,
or a prefix of it
@param[in]	len		length of buf, in bytes
@param[in]	page_size	compressed BLOB page size
@param[in]	space_id	space id of the BLOB pages
@param[in]	page_no		page number of the first BLOB page
@param[in]	offset		offset on the first BLOB page
@return number of bytes written to buf */
static
ulint
btr_copy_zblob_prefix(
	byte*			buf,
	ulint			len,
	const page_size_t&	page_size,
	ulint			space_id,
	ulint			page_no,
	ulint			offset)
{
	ulint		page_type = FIL_PAGE_TYPE_ZBLOB;
	mem_heap_t*	heap;
	int		err;
	z_stream	d_stream;

	d_stream.next_out = buf;
	d_stream.avail_out = static_cast<uInt>(len);
	d_stream.next_in = Z_NULL;
	d_stream.avail_in = 0;

	/* Zlib inflate needs 32 kilobytes for the default
	window size, plus a few kilobytes for small objects. */
	heap = mem_heap_create(40000);
	page_zip_set_alloc(&d_stream, heap);

	ut_ad(page_size.is_compressed());
	ut_ad(space_id);

	err = inflateInit(&d_stream);
	ut_a(err == Z_OK);

	for (;;) {
		buf_page_t*	bpage;
		ulint		next_page_no;

		/* There is no latch on bpage directly. Instead,
		bpage is protected by the B-tree page latch that
		is being held on the clustered index record, or,
		in row_merge_copy_blobs(), by an exclusive table lock. */
		bpage = buf_page_get_zip(page_id_t(space_id, page_no),
					 page_size);

		if (UNIV_UNLIKELY(!bpage)) {
			ib::error() << "Cannot load compressed BLOB "
				<< page_id_t(space_id, page_no);
			goto func_exit;
		}

		if (UNIV_UNLIKELY
		    (fil_page_get_type(bpage->zip.data) != page_type)) {

			ib::error() << "Unexpected type "
				<< fil_page_get_type(bpage->zip.data)
				<< " of compressed BLOB page "
				<< page_id_t(space_id, page_no);

			ut_ad(0);
			goto end_of_blob;
		}

		next_page_no = mach_read_from_4(bpage->zip.data + offset);

		if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
			/* When the BLOB begins at page header,
			the compressed data payload does not
			immediately follow the next page pointer. */
			offset = FIL_PAGE_DATA;
		} else {
			offset += 4;
		}

		d_stream.next_in = bpage->zip.data + offset;
		d_stream.avail_in = static_cast<uInt>(page_size.physical()
						      - offset);

		err = inflate(&d_stream, Z_NO_FLUSH);
		switch (err) {
		case Z_OK:
			if (!d_stream.avail_out) {
				goto end_of_blob;
			}
			break;
		case Z_STREAM_END:
			if (next_page_no == FIL_NULL) {
				goto end_of_blob;
			}
			/* fall through */
		default:
inflate_error:
			ib::error() << "inflate() of compressed BLOB page "
				<< page_id_t(space_id, page_no)
				<< " returned " << err
				<< " (" << d_stream.msg << ")";

		case Z_BUF_ERROR:
			goto end_of_blob;
		}

		if (next_page_no == FIL_NULL) {
			if (!d_stream.avail_in) {
				ib::error()
					<< "Unexpected end of compressed "
					<< "BLOB page "
					<< page_id_t(space_id, page_no);
			} else {
				err = inflate(&d_stream, Z_FINISH);
				switch (err) {
				case Z_STREAM_END:
				case Z_BUF_ERROR:
					break;
				default:
					goto inflate_error;
				}
			}

end_of_blob:
			buf_page_release_zip(bpage);
			goto func_exit;
		}

		buf_page_release_zip(bpage);

		/* On BLOB pages other than the first, the BLOB
		header is always at the page header: */

		page_no = next_page_no;
		offset = FIL_PAGE_NEXT;
		page_type = FIL_PAGE_TYPE_ZBLOB2;
	}

func_exit:
	inflateEnd(&d_stream);
	mem_heap_free(heap);
	UNIV_MEM_ASSERT_RW(buf, d_stream.total_out);
	return(d_stream.total_out);
}

/** Copies the prefix of an externally stored field of a record.
The clustered index record that points to this BLOB must be protected
by a lock or a page latch.
@param[out]	buf		the externally stored part of the
field, or a prefix of it
@param[in]	len		length of buf, in bytes
@param[in]	page_size	BLOB page size
@param[in]	space_id	space id of the first BLOB page
@param[in]	page_no		page number of the first BLOB page
@param[in]	offset		offset on the first BLOB page
@return number of bytes written to buf */
static
ulint
btr_copy_externally_stored_field_prefix_low(
	byte*			buf,
	ulint			len,
	const page_size_t&	page_size,
	ulint			space_id,
	ulint			page_no,
	ulint			offset)
{
	if (len == 0) {
		return(0);
	}

	if (page_size.is_compressed()) {
		return(btr_copy_zblob_prefix(buf, len, page_size,
					     space_id, page_no, offset));
	} else {
		ut_ad(page_size.equals_to(univ_page_size));
		return(btr_copy_blob_prefix(buf, len, space_id,
					    page_no, offset));
	}
}

/** Copies the prefix of an externally stored field of a record.
The clustered index record must be protected by a lock or a page latch.
@param[out]	buf		the field, or a prefix of it
@param[in]	len		length of buf, in bytes
@param[in]	page_size	BLOB page size
@param[in]	data		'internally' stored part of the field
containing also the reference to the external part; must be protected by
a lock or a page latch
@param[in]	local_len	length of data, in bytes
@return the length of the copied field, or 0 if the column was being
or has been deleted */
ulint
btr_copy_externally_stored_field_prefix(
	byte*			buf,
	ulint			len,
	const page_size_t&	page_size,
	const byte*		data,
	ulint			local_len)
{
	ulint	space_id;
	ulint	page_no;
	ulint	offset;

	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	local_len -= BTR_EXTERN_FIELD_REF_SIZE;
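	/* The last BTR_EXTERN_FIELD_REF_SIZE bytes of the locally
	stored part are the BLOB pointer; only the bytes preceding
	it are actual column data. */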

	if (UNIV_UNLIKELY(local_len >= len)) {
		memcpy(buf, data, len);
		return(len);
	}

	memcpy(buf, data, local_len);
	data += local_len;

	ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));

	if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) {
		/* The externally stored part of the column has been
		(partially) deleted. Signal the half-deleted BLOB
		to the caller. */

		return(0);
	}

	space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID);

	page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO);

	offset = mach_read_from_4(data + BTR_EXTERN_OFFSET);

	return(local_len
	       + btr_copy_externally_stored_field_prefix_low(buf + local_len,
							     len - local_len,
							     page_size,
							     space_id, page_no,
							     offset));
}

/** Copies an externally stored field of a record to mem heap.
The clustered index record must be protected by a lock or a page latch.
@param[out]	len		length of the whole field
@param[in]	data		'internally' stored part of the field
containing also the reference to the external part; must be protected by
a lock or a page latch
@param[in]	page_size	BLOB page size
@param[in]	local_len	length of data
@param[in,out]	heap		mem heap
@return the whole field copied to heap */
byte*
btr_copy_externally_stored_field(
	ulint*			len,
	const byte*		data,
	const page_size_t&	page_size,
	ulint			local_len,
	mem_heap_t*		heap)
{
	ulint	space_id;
	ulint	page_no;
	ulint	offset;
	ulint	extern_len;
	byte*	buf;

	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	local_len -= BTR_EXTERN_FIELD_REF_SIZE;

	space_id = mach_read_from_4(data + local_len + BTR_EXTERN_SPACE_ID);

	page_no = mach_read_from_4(data + local_len + BTR_EXTERN_PAGE_NO);

	offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET);

	/* Currently a BLOB cannot be bigger than 4 GB; we
	leave the 4 upper bytes in the length field unused */

	extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4);

	buf = (byte*) mem_heap_alloc(heap, local_len + extern_len);

	memcpy(buf, data, local_len);
	*len = local_len
		+ btr_copy_externally_stored_field_prefix_low(buf + local_len,
							      extern_len,
							      page_size,
							      space_id,
							      page_no, offset);

	return(buf);
}

/** Copies an externally stored field of a record to mem heap.
@param[in]	rec		record in a clustered index; must be
protected by a lock or a page latch
@param[in]	offsets		array returned by rec_get_offsets()
@param[in]	page_size	BLOB page size
@param[in]	no		field number
@param[out]	len		length of the field
@param[in,out]	heap		mem heap
@return the field copied to heap, or NULL if the field is incomplete */
byte*
btr_rec_copy_externally_stored_field(
	const rec_t*		rec,
	const ulint*		offsets,
	const page_size_t&	page_size,
	ulint			no,
	ulint*			len,
	mem_heap_t*		heap)
{
	ulint		local_len;
	const byte*	data;

	ut_a(rec_offs_nth_extern(offsets, no));

	/* An externally stored field can contain some initial
	data from the field, and in the last 20 bytes it has the
	space id, page number, and offset where the rest of the
	field data is stored, and the data length in addition to
	the data stored locally. We may need to store some data
	locally to get the local record length above the 128 byte
	limit so that field offsets are stored in two bytes, and
	the extern bit is available in those two bytes. */

	data = rec_get_nth_field(rec, offsets, no, &local_len);

	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	if (UNIV_UNLIKELY
	    (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE,
		     field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) {
		/* The externally stored field was not written yet.
		This record should only be seen by
		recv_recovery_rollback_active() or any
		TRX_ISO_READ_UNCOMMITTED transactions. */
		return(NULL);
	}

	return(btr_copy_externally_stored_field(len, data,
						page_size, local_len, heap));
}
#endif /* !UNIV_HOTBACKUP */