/*****************************************************************************

Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Copyright (c) 2012, Facebook Inc.
Copyright (c) 2015, 2021, MariaDB Corporation.

Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
briefly in the InnoDB documentation. The contributions by Google are
incorporated with their permission, and subject to the conditions contained in
the file COPYING.Google.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA

*****************************************************************************/

/**************************************************//**
@file btr/btr0cur.cc
The index tree cursor

All changes that row operations make to a B-tree or the records
there must go through this module! Undo log records are written here
for every modify or insert of a clustered index record.

NOTE!!!
To make sure we do not run out of disk space during a pessimistic
insert or update, we have to reserve 2 x the height of the index tree
in pages in the tablespace before we start the operation, because
if leaf splitting has been started, it is difficult to undo, except
by crashing the database and doing a roll-forward.

Created 10/16/1994 Heikki Tuuri
*******************************************************/

#include "btr0cur.h"
#include "row0upd.h"
#include "mtr0log.h"
#include "page0page.h"
#include "page0zip.h"
#include "rem0rec.h"
#include "rem0cmp.h"
#include "buf0lru.h"
#include "btr0btr.h"
#include "btr0sea.h"
#include "row0log.h"
#include "row0purge.h"
#include "row0upd.h"
#include "trx0rec.h"
#include "trx0roll.h"
#include "que0que.h"
#include "row0row.h"
#include "srv0srv.h"
#include "ibuf0ibuf.h"
#include "lock0lock.h"
#include "zlib.h"
#include "srv0start.h"
#include "mysql_com.h"
#include "dict0stats.h"
#ifdef WITH_WSREP
#include "mysql/service_wsrep.h"
#endif /* WITH_WSREP */

/** Buffered B-tree operation types, introduced as part of delete buffering. */
enum btr_op_t {
	BTR_NO_OP = 0,			/*!< Not buffered */
	BTR_INSERT_OP,			/*!< Insert, do not ignore UNIQUE */
	BTR_INSERT_IGNORE_UNIQUE_OP,	/*!< Insert, ignoring UNIQUE */
	BTR_DELETE_OP,			/*!< Purge a delete-marked record */
	BTR_DELMARK_OP			/*!< Mark a record for deletion */
};
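/* These operation types are decoded from the BTR_INSERT, BTR_DELETE,
BTR_DELETE_MARK and BTR_IGNORE_SEC_UNIQUE flags that callers OR into
the latch_mode of btr_cur_search_to_nth_level_func() below. */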

/** Modification types for the B-tree operation.
Note that the order must be DELETE, BOTH, INSERT !!
*/
enum btr_intention_t {
	BTR_INTENTION_DELETE,
	BTR_INTENTION_BOTH,
	BTR_INTENTION_INSERT
};
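/* The ordering above is relied upon by ordered comparisons such as
lock_intention <= BTR_INTENTION_BOTH and lock_intention >= BTR_INTENTION_BOTH
in btr_cur_will_modify_tree(). */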

/** With the index->lock scalability improvement, the only clear performance
regression observed was caused by a hugely grown history list. That is
because the previous exclusive use of index->lock also had the effect of
reserving free blocks and read I/O bandwidth with priority for purge. To
keep the history list at the same level as with the previous implementation,
pessimistic tree operations by purge are prioritized, as before, when the
list seems to be growing huge.

Experimentally, the history list length starts to clearly affect performance
throughput from about 100000. */
#define BTR_CUR_FINE_HISTORY_LENGTH	100000

/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
Atomic_counter<ulint>	btr_cur_n_non_sea;
/** Old value of btr_cur_n_non_sea. Copied by
srv_refresh_innodb_monitor_stats(). Referenced by
srv_printf_innodb_monitor(). */
ulint	btr_cur_n_non_sea_old;
#ifdef BTR_CUR_HASH_ADAPT
/** Number of successful adaptive hash index lookups in
btr_cur_search_to_nth_level(). */
ulint	btr_cur_n_sea;
/** Old value of btr_cur_n_sea. Copied by
srv_refresh_innodb_monitor_stats(). Referenced by
srv_printf_innodb_monitor(). */
ulint	btr_cur_n_sea_old;
#endif /* BTR_CUR_HASH_ADAPT */

#ifdef UNIV_DEBUG
/* Flag to limit optimistic insert records */
uint	btr_cur_limit_optimistic_insert_debug;
#endif /* UNIV_DEBUG */

/** In the optimistic insert, if the insert does not fit, but this much space
can be released by page reorganization, then the page is reorganized */
#define BTR_CUR_PAGE_REORGANIZE_LIMIT	(srv_page_size / 32)
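/* For example, with the default 16KiB page size this limit is
16384 / 32 = 512 bytes. */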

/** The structure of a BLOB part header */
/* @{ */
/*--------------------------------------*/
#define BTR_BLOB_HDR_PART_LEN		0	/*!< BLOB part len on this
						page */
#define BTR_BLOB_HDR_NEXT_PAGE_NO	4	/*!< next BLOB part page no,
						FIL_NULL if none */
/*--------------------------------------*/
#define BTR_BLOB_HDR_SIZE		8	/*!< Size of a BLOB
						part header, in bytes */
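
/* Thus, at FIL_PAGE_DATA a BLOB part page stores 4 bytes of part length,
then 4 bytes of next-page number, followed by the BLOB data itself;
btr_cur_instant_init_low() below reads the header using these offsets. */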

/** Estimate table-level stats from a sampled value.
@param value		sampled stats
@param index		index being sampled
@param sample		number of sampled rows
@param ext_size		external stored data size
@param not_empty	table not empty
@return estimated table-wide stats from the sampled value */
#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty) \
	(((value) * static_cast<ib_uint64_t>(index->stat_n_leaf_pages) \
	  + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))
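
/* Essentially this scales the sampled value by stat_n_leaf_pages / sample,
rounding up. Illustrative example with hypothetical numbers: value = 5
distinct keys seen on sample = 20 leaf pages of an index with
stat_n_leaf_pages = 100, ext_size = 0 and not_empty = 1 gives
(5 * 100 + 20 - 1 + 0 + 1) / 20 = 26. */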

/* @} */

/*******************************************************************//**
Marks all extern fields in a record as owned by the record. This function
should be called if the delete mark of a record is removed: a record that
is not delete-marked always owns all its extern fields. */
static
void
btr_cur_unmark_extern_fields(
/*=========================*/
	buf_block_t*	block,	/*!< in/out: index page */
	rec_t*		rec,	/*!< in/out: record in a clustered index */
	dict_index_t*	index,	/*!< in: index of the page */
	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
	mtr_t*		mtr);	/*!< in: mtr, or NULL if not logged */
/*******************************************************************//**
Adds path information to the cursor for the current page, for which
the binary search has been performed. */
static
void
btr_cur_add_path_info(
/*==================*/
	btr_cur_t*	cursor,		/*!< in: cursor positioned on a page */
	ulint		height,		/*!< in: height of the page in tree;
					0 means leaf node */
	ulint		root_height);	/*!< in: root node height in tree */
/***********************************************************//**
Frees the externally stored fields for a record, if the field is mentioned
in the update vector. */
static
void
btr_rec_free_updated_extern_fields(
/*===============================*/
	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
				X-latched */
	rec_t*		rec,	/*!< in: record */
	buf_block_t*	block,	/*!< in: index page of rec */
	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
	const upd_t*	update,	/*!< in: update vector */
	bool		rollback,/*!< in: performing rollback? */
	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the tree */
/***********************************************************//**
Frees the externally stored fields for a record. */
static
void
btr_rec_free_externally_stored_fields(
/*==================================*/
	dict_index_t*	index,	/*!< in: index of the data, the index
				tree MUST be X-latched */
	rec_t*		rec,	/*!< in: record */
	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
	buf_block_t*	block,	/*!< in: index page of rec */
	bool		rollback,/*!< in: performing rollback? */
	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the index
				tree */

/*==================== B-TREE SEARCH =========================*/

/** Latches the leaf page or pages requested.
@param[in]	block		leaf page where the search converged
@param[in]	latch_mode	BTR_SEARCH_LEAF, ...
@param[in]	cursor		cursor
@param[in]	mtr		mini-transaction
@return blocks and savepoints which were actually latched. */
btr_latch_leaves_t
btr_cur_latch_leaves(
	buf_block_t*	block,
	ulint		latch_mode,
	btr_cur_t*	cursor,
	mtr_t*		mtr)
{
	rw_lock_type_t	mode;
	uint32_t	left_page_no;
	uint32_t	right_page_no;
	buf_block_t*	get_block;
	bool		spatial;
	btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}};

	compile_time_assert(int(MTR_MEMO_PAGE_S_FIX) == int(RW_S_LATCH));
	compile_time_assert(int(MTR_MEMO_PAGE_X_FIX) == int(RW_X_LATCH));
	compile_time_assert(int(MTR_MEMO_PAGE_SX_FIX) == int(RW_SX_LATCH));
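	/* These assertions guarantee that an rw_lock_type_t value can be
	used directly where an mtr memo type is expected, without any
	conversion between the two enumerations. */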
	ut_ad(block->page.id().space() == cursor->index->table->space->id);

	spatial = dict_index_is_spatial(cursor->index) && cursor->rtr_info;
	ut_ad(block->page.in_file());

	switch (latch_mode) {
	case BTR_SEARCH_LEAF:
	case BTR_MODIFY_LEAF:
	case BTR_SEARCH_TREE:
		if (spatial) {
			cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS]
				= mtr_set_savepoint(mtr);
		}

		mode = latch_mode == BTR_MODIFY_LEAF ? RW_X_LATCH : RW_S_LATCH;
		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
		get_block = btr_block_get(*cursor->index,
					  block->page.id().page_no(), mode,
					  true, mtr);
		latch_leaves.blocks[1] = get_block;
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(get_block->frame)
		     == page_is_comp(block->frame));
#endif /* UNIV_BTR_DEBUG */
		if (spatial) {
			cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS]
				= get_block;
		}

		return(latch_leaves);
	case BTR_MODIFY_TREE:
		/* The index latch is exclusive with other operations
		that call btr_page_set_prev() */
		ut_ad(mtr->memo_contains_flagged(&cursor->index->lock,
						 MTR_MEMO_X_LOCK
						 | MTR_MEMO_SX_LOCK));
		/* x-latch also siblings from left to right */
		left_page_no = btr_page_get_prev(block->frame);

		if (left_page_no != FIL_NULL) {

			if (spatial) {
				cursor->rtr_info->tree_savepoints[
					RTR_MAX_LEVELS] = mtr_set_savepoint(mtr);
			}

			latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
			get_block = btr_block_get(
				*cursor->index, left_page_no, RW_X_LATCH,
				true, mtr);
			latch_leaves.blocks[0] = get_block;

			if (spatial) {
				cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS]
					= get_block;
			}
		}

		if (spatial) {
			cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS + 1]
				= mtr_set_savepoint(mtr);
		}

		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
		get_block = btr_block_get(
			*cursor->index, block->page.id().page_no(),
			RW_X_LATCH, true, mtr);
		latch_leaves.blocks[1] = get_block;

#ifdef UNIV_BTR_DEBUG
		/* Sanity check only after both the blocks are latched. */
		if (latch_leaves.blocks[0] != NULL) {
			ut_a(page_is_comp(latch_leaves.blocks[0]->frame)
			     == page_is_comp(block->frame));
			ut_a(btr_page_get_next(latch_leaves.blocks[0]->frame)
			     == block->page.id().page_no());
		}
		ut_a(page_is_comp(get_block->frame)
		     == page_is_comp(block->frame));
#endif /* UNIV_BTR_DEBUG */

		if (spatial) {
			cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS + 1]
				= get_block;
		}

		right_page_no = btr_page_get_next(block->frame);

		if (right_page_no != FIL_NULL) {
			if (spatial) {
				cursor->rtr_info->tree_savepoints[
					RTR_MAX_LEVELS + 2] = mtr_set_savepoint(
								mtr);
			}
			latch_leaves.savepoints[2] = mtr_set_savepoint(mtr);
			get_block = btr_block_get(*cursor->index,
						  right_page_no, RW_X_LATCH,
						  true, mtr);
			latch_leaves.blocks[2] = get_block;
#ifdef UNIV_BTR_DEBUG
			if (get_block) {
				ut_a(page_is_comp(get_block->frame)
				     == page_is_comp(block->frame));
				ut_a(btr_page_get_prev(get_block->frame)
				     == block->page.id().page_no());
			}
#endif /* UNIV_BTR_DEBUG */
			if (spatial) {
				cursor->rtr_info->tree_blocks[
					RTR_MAX_LEVELS + 2] = get_block;
			}
		}

		return(latch_leaves);

	case BTR_SEARCH_PREV:
	case BTR_MODIFY_PREV:
		mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH;
		/* latch also left sibling */
		rw_lock_s_lock(&block->lock);
		left_page_no = btr_page_get_prev(block->frame);
		rw_lock_s_unlock(&block->lock);

		if (left_page_no != FIL_NULL) {
			latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
			get_block = btr_block_get(
				*cursor->index, left_page_no, mode,
				true, mtr);
			latch_leaves.blocks[0] = get_block;
			cursor->left_block = get_block;
#ifdef UNIV_BTR_DEBUG
			ut_a(page_is_comp(get_block->frame)
			     == page_is_comp(block->frame));
			ut_a(btr_page_get_next(get_block->frame)
			     == block->page.id().page_no());
#endif /* UNIV_BTR_DEBUG */
		}

		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
		get_block = btr_block_get(*cursor->index,
					  block->page.id().page_no(), mode,
					  true, mtr);
		latch_leaves.blocks[1] = get_block;
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(get_block->frame)
		     == page_is_comp(block->frame));
#endif /* UNIV_BTR_DEBUG */
		return(latch_leaves);
	case BTR_CONT_MODIFY_TREE:
		ut_ad(dict_index_is_spatial(cursor->index));
		return(latch_leaves);
	}

	ut_error;
	return(latch_leaves);
}

/** Load the instant ALTER TABLE metadata from the clustered index
when loading a table definition.
@param[in,out]	index	clustered index definition
@param[in,out]	mtr	mini-transaction
@return error code
@retval DB_SUCCESS	if no error occurred
@retval DB_CORRUPTION	if any corruption was noticed */
static dberr_t btr_cur_instant_init_low(dict_index_t* index, mtr_t* mtr)
{
	ut_ad(index->is_primary());
	ut_ad(index->n_core_null_bytes == dict_index_t::NO_CORE_NULL_BYTES);
	ut_ad(index->table->supports_instant());
	ut_ad(index->table->is_readable());

	const fil_space_t* space = index->table->space;
	if (!space) {
unreadable:
		ib::error() << "Table " << index->table->name
			    << " has an unreadable root page";
		index->table->corrupted = true;
		return DB_CORRUPTION;
	}

	page_t* root = btr_root_get(index, mtr);

	if (!root || btr_cur_instant_root_init(index, root)) {
		goto unreadable;
	}

	ut_ad(index->n_core_null_bytes != dict_index_t::NO_CORE_NULL_BYTES);

	if (fil_page_get_type(root) == FIL_PAGE_INDEX) {
		ut_ad(!index->is_instant());
		return DB_SUCCESS;
	}

	btr_cur_t cur;
	/* Relax the assertion in rec_init_offsets(). */
	ut_ad(!index->in_instant_init);
	ut_d(index->in_instant_init = true);
	dberr_t err = btr_cur_open_at_index_side(true, index, BTR_SEARCH_LEAF,
						 &cur, 0, mtr);
	ut_d(index->in_instant_init = false);
	if (err != DB_SUCCESS) {
		index->table->corrupted = true;
		return err;
	}

	ut_ad(page_cur_is_before_first(&cur.page_cur));
	ut_ad(page_is_leaf(cur.page_cur.block->frame));

	page_cur_move_to_next(&cur.page_cur);

	const rec_t* rec = cur.page_cur.rec;
	const ulint comp = dict_table_is_comp(index->table);
	const ulint info_bits = rec_get_info_bits(rec, comp);

	if (page_rec_is_supremum(rec)
	    || !(info_bits & REC_INFO_MIN_REC_FLAG)) {
		if (!index->is_instant()) {
			/* The FIL_PAGE_TYPE_INSTANT and PAGE_INSTANT may be
			assigned even if instant ADD COLUMN was not
			committed. Changes to these page header fields are not
			undo-logged, but changes to the hidden metadata record
			are. If the server is killed and restarted, the page
			header fields could remain set even though no metadata
			record is present. */
			return DB_SUCCESS;
		}

		ib::error() << "Table " << index->table->name
			    << " is missing instant ALTER metadata";
		index->table->corrupted = true;
		return DB_CORRUPTION;
	}

	if ((info_bits & ~REC_INFO_DELETED_FLAG) != REC_INFO_MIN_REC_FLAG
	    || (comp && rec_get_status(rec) != REC_STATUS_INSTANT)) {
incompatible:
		ib::error() << "Table " << index->table->name
			    << " contains unrecognizable instant ALTER metadata";
		index->table->corrupted = true;
		return DB_CORRUPTION;
	}

	/* Read the metadata. We can get here on server restart
	or when the table was evicted from the data dictionary cache
	and is now being accessed again.

	Here, READ COMMITTED and REPEATABLE READ should be equivalent.
	Committing the ADD COLUMN operation would acquire
	MDL_EXCLUSIVE and LOCK_X|LOCK_TABLE, which would prevent any
	concurrent operations on the table, including table eviction
	from the cache. */

	if (info_bits & REC_INFO_DELETED_FLAG) {
		/* This metadata record includes a BLOB that identifies
		any dropped or reordered columns. */
		ulint trx_id_offset = index->trx_id_offset;
		/* If !index->trx_id_offset, the PRIMARY KEY contains
		variable-length columns. For the metadata record,
		variable-length columns should be written with zero
		length. However, before MDEV-21088 was fixed, for
		variable-length encoded PRIMARY KEY column of type
		CHAR, we wrote more than zero bytes. That is why we
		must determine the actual length of each PRIMARY KEY
		column. The DB_TRX_ID will start right after any
		PRIMARY KEY columns. */
		ut_ad(index->n_uniq);

		/* We cannot invoke rec_get_offsets() before
		index->table->deserialise_columns(). Therefore,
		we must duplicate some logic here. */
		if (trx_id_offset) {
		} else if (index->table->not_redundant()) {
			/* The PRIMARY KEY contains variable-length columns.
			For the metadata record, variable-length columns are
			always written with zero length. The DB_TRX_ID will
			start right after any fixed-length columns. */

			/* OK, before MDEV-21088 was fixed, for
			variable-length encoded PRIMARY KEY column of
			type CHAR, we wrote more than zero bytes. In
			order to allow affected tables to be accessed,
			it would be nice to determine the actual
			length of each PRIMARY KEY column. However, to
			be able to do that, we should determine the
			size of the null-bit bitmap in the metadata
			record. And we cannot know that before reading
			the metadata BLOB, whose starting point we are
			trying to find here. (Although the PRIMARY KEY
			columns cannot be NULL, we would have to know
			where the lengths of variable-length PRIMARY KEY
			columns start.)

			So, unfortunately we cannot help users who
			were affected by MDEV-21088 on a ROW_FORMAT=COMPACT
			or ROW_FORMAT=DYNAMIC table. */

			for (uint i = index->n_uniq; i--; ) {
				trx_id_offset += index->fields[i].fixed_len;
			}
		} else if (rec_get_1byte_offs_flag(rec)) {
			trx_id_offset = rec_1_get_field_end_info(
				rec, index->n_uniq - 1);
			ut_ad(!(trx_id_offset & REC_1BYTE_SQL_NULL_MASK));
			trx_id_offset &= ~REC_1BYTE_SQL_NULL_MASK;
		} else {
			trx_id_offset = rec_2_get_field_end_info(
				rec, index->n_uniq - 1);
			ut_ad(!(trx_id_offset & REC_2BYTE_SQL_NULL_MASK));
			trx_id_offset &= ~REC_2BYTE_SQL_NULL_MASK;
		}

		const byte* ptr = rec + trx_id_offset
			+ (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
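
		/* ptr now points to the externally stored field
		reference (the BTR_EXTERN_* fields) of the metadata
		BLOB, which is stored right after the PRIMARY KEY
		columns, DB_TRX_ID and DB_ROLL_PTR. */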

		if (mach_read_from_4(ptr + BTR_EXTERN_LEN)) {
			goto incompatible;
		}

		uint len = mach_read_from_4(ptr + BTR_EXTERN_LEN + 4);
		if (!len
		    || mach_read_from_4(ptr + BTR_EXTERN_OFFSET)
		    != FIL_PAGE_DATA
		    || mach_read_from_4(ptr + BTR_EXTERN_SPACE_ID)
		    != space->id) {
			goto incompatible;
		}

		buf_block_t* block = buf_page_get(
			page_id_t(space->id,
				  mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)),
			0, RW_S_LATCH, mtr);
		buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
		if (fil_page_get_type(block->frame) != FIL_PAGE_TYPE_BLOB
		    || mach_read_from_4(&block->frame[FIL_PAGE_DATA
						      + BTR_BLOB_HDR_NEXT_PAGE_NO])
		    != FIL_NULL
		    || mach_read_from_4(&block->frame[FIL_PAGE_DATA
						      + BTR_BLOB_HDR_PART_LEN])
		    != len) {
			goto incompatible;
		}

		/* The unused part of the BLOB page should be zero-filled. */
		for (const byte* b = block->frame
		     + (FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE) + len,
		     * const end = block->frame + srv_page_size
		     - BTR_EXTERN_LEN;
		     b < end; ) {
			if (*b++) {
				goto incompatible;
			}
		}

		if (index->table->deserialise_columns(
			    &block->frame[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE],
			    len)) {
			goto incompatible;
		}

		/* Proceed to initialize the default values of
		any instantly added columns. */
	}

	mem_heap_t* heap = NULL;
	rec_offs* offsets = rec_get_offsets(rec, index, NULL,
					    index->n_core_fields,
					    ULINT_UNDEFINED, &heap);
	if (rec_offs_any_default(offsets)) {
inconsistent:
		mem_heap_free(heap);
		goto incompatible;
	}

	/* In fact, because we only ever append fields to the metadata
	record, it is also OK to perform READ UNCOMMITTED and
	then ignore any extra fields, provided that
	trx_sys.is_registered(DB_TRX_ID). */
	if (rec_offs_n_fields(offsets)
	    > ulint(index->n_fields) + !!index->table->instant
	    && !trx_sys.is_registered(current_trx(),
				      row_get_rec_trx_id(rec, index,
							 offsets))) {
		goto inconsistent;
	}

	for (unsigned i = index->n_core_fields; i < index->n_fields; i++) {
		dict_col_t* col = index->fields[i].col;
		const unsigned o = i + !!index->table->instant;
		ulint len;
		const byte* data = rec_get_nth_field(rec, offsets, o, &len);
		ut_ad(!col->is_added());
		ut_ad(!col->def_val.data);
		col->def_val.len = len;
		switch (len) {
		case UNIV_SQL_NULL:
			continue;
		case 0:
			col->def_val.data = field_ref_zero;
			continue;
		}
		ut_ad(len != UNIV_SQL_DEFAULT);
		if (!rec_offs_nth_extern(offsets, o)) {
			col->def_val.data = mem_heap_dup(
				index->table->heap, data, len);
		} else if (len < BTR_EXTERN_FIELD_REF_SIZE
			   || !memcmp(data + len - BTR_EXTERN_FIELD_REF_SIZE,
				      field_ref_zero,
				      BTR_EXTERN_FIELD_REF_SIZE)) {
			col->def_val.len = UNIV_SQL_DEFAULT;
			goto inconsistent;
		} else {
			col->def_val.data = btr_copy_externally_stored_field(
				&col->def_val.len, data,
				cur.page_cur.block->zip_size(),
				len, index->table->heap);
		}
	}

	mem_heap_free(heap);
	return DB_SUCCESS;
}

/** Load the instant ALTER TABLE metadata from the clustered index
when loading a table definition.
@param[in,out]	table	table definition from the data dictionary
@return error code
@retval DB_SUCCESS	if no error occurred */
dberr_t
btr_cur_instant_init(dict_table_t* table)
{
	mtr_t		mtr;
	dict_index_t*	index = dict_table_get_first_index(table);
	mtr.start();
	dberr_t err = index
		? btr_cur_instant_init_low(index, &mtr)
		: DB_CORRUPTION;
	mtr.commit();
	return(err);
}

/** Initialize the n_core_null_bytes on first access to a clustered
index root page.
@param[in]	index	clustered index that is on its first access
@param[in]	page	clustered index root page
@return whether the page is corrupted */
bool btr_cur_instant_root_init(dict_index_t* index, const page_t* page)
{
	ut_ad(!index->is_dummy);
	ut_ad(fil_page_index_page_check(page));
	ut_ad(!page_has_siblings(page));
	ut_ad(page_get_space_id(page) == index->table->space_id);
	ut_ad(page_get_page_no(page) == index->page);
	ut_ad(!page_is_comp(page) == !dict_table_is_comp(index->table));
	ut_ad(index->is_primary());
	ut_ad(!index->is_instant());
	ut_ad(index->table->supports_instant());
	/* This is normally executed as part of btr_cur_instant_init()
	when dict_load_table_one() is loading a table definition.
	Other threads should not access or modify the n_core_null_bytes,
	n_core_fields before dict_load_table_one() returns.

	This can also be executed during IMPORT TABLESPACE, where the
	table definition is exclusively locked. */

	switch (fil_page_get_type(page)) {
	default:
		ut_ad("wrong page type" == 0);
		return true;
	case FIL_PAGE_INDEX:
		/* The field PAGE_INSTANT is guaranteed 0 on clustered
		index root pages of ROW_FORMAT=COMPACT or
		ROW_FORMAT=DYNAMIC when instant ADD COLUMN is not used. */
		ut_ad(!page_is_comp(page) || !page_get_instant(page));
		index->n_core_null_bytes = static_cast<uint8_t>(
			UT_BITS_IN_BYTES(unsigned(index->n_nullable)));
		return false;
	case FIL_PAGE_TYPE_INSTANT:
		break;
	}

	const uint16_t n = page_get_instant(page);

	if (n < index->n_uniq + DATA_ROLL_PTR) {
		/* The PRIMARY KEY (or hidden DB_ROW_ID) and
		DB_TRX_ID,DB_ROLL_PTR columns must always be present
		as 'core' fields. */
		return true;
	}

	if (n > REC_MAX_N_FIELDS) {
		return true;
	}

	index->n_core_fields = n & dict_index_t::MAX_N_FIELDS;

	const rec_t* infimum = page_get_infimum_rec(page);
	const rec_t* supremum = page_get_supremum_rec(page);

	if (!memcmp(infimum, "infimum", 8)
	    && !memcmp(supremum, "supremum", 8)) {
		if (n > index->n_fields) {
			/* All fields, including those for instantly
			added columns, must be present in the
			data dictionary. */
			return true;
		}

		ut_ad(!index->is_dummy);
		ut_d(index->is_dummy = true);
		index->n_core_null_bytes = static_cast<uint8_t>(
			UT_BITS_IN_BYTES(index->get_n_nullable(n)));
		ut_d(index->is_dummy = false);
		return false;
	}

	if (memcmp(infimum, field_ref_zero, 8)
	    || memcmp(supremum, field_ref_zero, 7)) {
		/* The infimum and supremum records must either contain
		the original strings, or they must be filled with zero
		bytes, except for the bytes that we have repurposed. */
		return true;
	}

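	/* The repurposed byte supremum[7] stores the number of null flag
	bytes for the core fields. Since at most REC_MAX_N_FIELDS (1023)
	fields can be nullable, the value cannot legitimately exceed
	UT_BITS_IN_BYTES(1023) = 128; anything larger indicates corruption. */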
	index->n_core_null_bytes = supremum[7];
	return index->n_core_null_bytes > 128;
}

/** Optimistically latches the leaf page or pages requested.
@param[in]	block		guessed buffer block
@param[in]	modify_clock	modify clock value
@param[in,out]	latch_mode	BTR_SEARCH_LEAF, ...
@param[in,out]	cursor		cursor
@param[in]	file		file name
@param[in]	line		line where called
@param[in]	mtr		mini-transaction
@return true if success */
bool
btr_cur_optimistic_latch_leaves(
	buf_block_t*	block,
	ib_uint64_t	modify_clock,
	ulint*		latch_mode,
	btr_cur_t*	cursor,
	const char*	file,
	unsigned	line,
	mtr_t*		mtr)
{
	ut_ad(block->page.buf_fix_count());
	ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);

	switch (*latch_mode) {
	default:
		ut_error;
		return(false);
	case BTR_SEARCH_LEAF:
	case BTR_MODIFY_LEAF:
		return(buf_page_optimistic_get(*latch_mode, block,
					       modify_clock, file, line, mtr));
	case BTR_SEARCH_PREV:
	case BTR_MODIFY_PREV:
		rw_lock_s_lock(&block->lock);
		if (block->modify_clock != modify_clock) {
			rw_lock_s_unlock(&block->lock);
			return false;
		}
		const uint32_t curr_page_no = block->page.id().page_no();
		const uint32_t left_page_no = btr_page_get_prev(block->frame);
		rw_lock_s_unlock(&block->lock);

		const rw_lock_type_t mode = *latch_mode == BTR_SEARCH_PREV
			? RW_S_LATCH : RW_X_LATCH;

		if (left_page_no != FIL_NULL) {
			dberr_t	err = DB_SUCCESS;
			cursor->left_block = buf_page_get_gen(
				page_id_t(cursor->index->table->space_id,
					  left_page_no),
				cursor->index->table->space->zip_size(),
				mode, nullptr, BUF_GET_POSSIBLY_FREED,
				__FILE__, __LINE__, mtr, &err);

			if (!cursor->left_block) {
				cursor->index->table->file_unreadable = true;
			}

			if (cursor->left_block->page.status
			    == buf_page_t::FREED
			    || btr_page_get_next(cursor->left_block->frame)
			    != curr_page_no) {
				/* release the left block */
				btr_leaf_page_release(
					cursor->left_block, mode, mtr);
				return false;
			}
		} else {
			cursor->left_block = NULL;
		}

		if (buf_page_optimistic_get(mode, block, modify_clock,
					    file, line, mtr)) {
			if (btr_page_get_prev(block->frame) == left_page_no) {
				/* block was already buffer-fixed while
				entering the function and
				buf_page_optimistic_get() buffer-fixes
				it again. */
				ut_ad(2 <= block->page.buf_fix_count());
				*latch_mode = mode;
				return(true);
			} else {
				/* release the block and decrement the
				buf_fix_count that was incremented
				in buf_page_optimistic_get() */
				btr_leaf_page_release(block, mode, mtr);
			}
		}

		ut_ad(block->page.buf_fix_count());
		/* release the left block */
		if (cursor->left_block != NULL) {
			btr_leaf_page_release(cursor->left_block,
					      mode, mtr);
		}
	}

	return false;
}

/**
Gets the intention as btr_intention_t from latch_mode, and clears the
intention from the latch_mode.
@param latch_mode	in/out: pointer to latch_mode
@return intention for latching the tree */
static
btr_intention_t
btr_cur_get_and_clear_intention(
	ulint	*latch_mode)
{
	btr_intention_t	intention;

	switch (*latch_mode & (BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE)) {
	case BTR_LATCH_FOR_INSERT:
		intention = BTR_INTENTION_INSERT;
		break;
	case BTR_LATCH_FOR_DELETE:
		intention = BTR_INTENTION_DELETE;
		break;
	default:
		/* both or unknown */
		intention = BTR_INTENTION_BOTH;
	}
	*latch_mode &= ulint(~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE));

	return(intention);
}

/**
Gets the desired latch type for the root leaf (when the root page is a
leaf) for the given latch mode.
@param latch_mode	in: BTR_SEARCH_LEAF, ...
@return latch type */
static
rw_lock_type_t
btr_cur_latch_for_root_leaf(
	ulint	latch_mode)
{
	switch (latch_mode) {
	case BTR_SEARCH_LEAF:
	case BTR_SEARCH_TREE:
	case BTR_SEARCH_PREV:
		return(RW_S_LATCH);
	case BTR_MODIFY_LEAF:
	case BTR_MODIFY_TREE:
	case BTR_MODIFY_PREV:
		return(RW_X_LATCH);
	case BTR_CONT_MODIFY_TREE:
	case BTR_CONT_SEARCH_TREE:
		/* A root page should already be latched, and does not
		need to be latched here.
		fall through (RW_NO_LATCH) */
	case BTR_NO_LATCHES:
		return(RW_NO_LATCH);
	}

	ut_error;
	return(RW_NO_LATCH); /* avoid compiler warnings */
}

/** Detects whether modifying the record might require modifying the
tree structure.
@param[in]	index		index
@param[in]	page		page
@param[in]	lock_intention	lock intention for the tree operation
@param[in]	rec		record (current node_ptr)
@param[in]	rec_size	size of the record or max size of node_ptr
@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
@param[in]	mtr		mtr
@return true if tree modification is needed */
static
bool
btr_cur_will_modify_tree(
	dict_index_t*	index,
	const page_t*	page,
	btr_intention_t	lock_intention,
	const rec_t*	rec,
	ulint		rec_size,
	ulint		zip_size,
	mtr_t*		mtr)
{
	ut_ad(!page_is_leaf(page));
	ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
					 | MTR_MEMO_SX_LOCK));

	/* A pessimistic delete of the first record causes a delete & insert
	of the node_ptr at the upper level, and a subsequent page shrink is
	possible, which causes a delete of the node_ptr at the upper level.
	So we should pay attention not only to the first and last records
	but also to the second record: if the "delete & insert" is done on
	a different page, the second record becomes the first record, and a
	following compress might delete that record, causing the upper-level
	node_ptr modification. */

	const ulint n_recs = page_get_n_recs(page);

	if (lock_intention <= BTR_INTENTION_BOTH) {
		compile_time_assert(BTR_INTENTION_DELETE < BTR_INTENTION_BOTH);
		compile_time_assert(BTR_INTENTION_BOTH < BTR_INTENTION_INSERT);

		if (!page_has_siblings(page)) {
			return true;
		}

		ulint margin = rec_size;

		if (lock_intention == BTR_INTENTION_BOTH) {
			ulint	level = btr_page_get_level(page);

			/* This value is the worst-case expectation of how
			many node_ptr records could be deleted from this
			page. It is used to predict whether the cursor
			position could become the leftmost record on this
			page. */
			ulint	max_nodes_deleted = 0;

			/* Tree-modifying operations from below this level
			can logically cause at most (2 ^ (level - 1)) record
			deletions here, even in the unrealistically rare
			worst case. */
			if (level > 7) {
				/* TODO: adjust this practical limit. */
				max_nodes_deleted = 64;
			} else if (level > 0) {
				max_nodes_deleted = (ulint)1 << (level - 1);
			}
			/* Check what a delete would cause
			(BTR_INTENTION_BOTH or BTR_INTENTION_DELETE). */
			if (n_recs <= max_nodes_deleted * 2
			    || page_rec_is_first(rec, page)) {
				/* The cursor record can be the leftmost
				record in this page. */
				return true;
			}

			if (page_has_prev(page)
			    && page_rec_distance_is_at_most(
				    page_get_infimum_rec(page), rec,
				    max_nodes_deleted)) {
				return true;
			}

			if (page_has_next(page)
			    && page_rec_distance_is_at_most(
				    rec, page_get_supremum_rec(page),
				    max_nodes_deleted)) {
				return true;
			}

			/* Deleting the leftmost record of a page causes a
			delete & insert at its parent page. After that, the
			delete might cause btr_compress() and delete a record
			at its parent page. Thus we should consider the
			maximum number of deletes. */
			margin *= max_nodes_deleted;
		}

		/* Safe because we already hold an SX latch on the
		index tree. */
		if (page_get_data_size(page)
		    < margin + BTR_CUR_PAGE_COMPRESS_LIMIT(index)) {
			return(true);
		}
	}

	if (lock_intention >= BTR_INTENTION_BOTH) {
		/* Check what an insert would cause (BTR_INTENTION_BOTH
		or BTR_INTENTION_INSERT). */

		/* When btr_cur_limit_optimistic_insert_debug is in effect,
		we should check it here in advance, since the maximum
		allowable number of records in a page is limited. */
		LIMIT_OPTIMISTIC_INSERT_DEBUG(n_recs, return true);

		/* We need space for 2 records, for the case where a single
		split and insert cannot fit.
		page_get_max_insert_size_after_reorganize() includes space
		for the page directory already. */
		ulint	max_size
			= page_get_max_insert_size_after_reorganize(page, 2);

		if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + rec_size
		    || max_size < rec_size * 2) {
			return(true);
		}

		/* TODO: optimize this condition for ROW_FORMAT=COMPRESSED.
		This is based on the worst case, and we could invoke
		page_zip_available() on the block->page.zip. */
		/* We also need space for 2 records at the worst
		compression rate. */
		if (zip_size
		    && page_zip_empty_size(index->n_fields, zip_size)
		    <= rec_size * 2 + page_get_data_size(page)
		    + page_dir_calc_reserved_space(n_recs + 2)) {
			return(true);
		}
	}

	return(false);
}

/** Detects whether modifying the record might require a modification
opposite to the intention.
@param[in]	page		page
@param[in]	lock_intention	lock intention for the tree operation
@param[in]	rec		record (current node_ptr)
@return true if tree modification is needed */
static
bool
btr_cur_need_opposite_intention(
	const page_t*	page,
	btr_intention_t	lock_intention,
	const rec_t*	rec)
{
	switch (lock_intention) {
	case BTR_INTENTION_DELETE:
		return (page_has_prev(page) && page_rec_is_first(rec, page)) ||
			(page_has_next(page) && page_rec_is_last(rec, page));
	case BTR_INTENTION_INSERT:
		return page_has_next(page) && page_rec_is_last(rec, page);
	case BTR_INTENTION_BOTH:
		return(false);
	}

	ut_error;
	return(false);
}

/**
@param[in]	index	b-tree
@return maximum size of a node pointer record in bytes */
static ulint btr_node_ptr_max_size(const dict_index_t* index)
{
	if (dict_index_is_ibuf(index)) {
		/* cannot estimate accurately */
		/* This is the universal index for the change buffer.
		The max size of an entry is about max key length * 2
		(the index key + the primary key to be inserted into
		the index). (The max key length is UNIV_PAGE_SIZE / 16 * 3
		at ha_innobase::max_supported_key_length(); considering
		MAX_KEY_LENGTH = 3072 in MySQL, this matches the historical
		InnoDB value of 3500 for the 16K page size case.)
		For the universal index, the node_ptr contains most of the
		entry, and 512 bytes is enough to contain the ibuf columns
		and metadata. */
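		/* For the default 16KiB page size this evaluates to
		16384 / 8 * 3 + 512 = 6656 bytes. */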
		return srv_page_size / 8 * 3 + 512;
	}

	/* Each record has the child page number, the length bytes for
	the page number, and the record header. */
	ulint	comp = dict_table_is_comp(index->table);
	ulint	rec_max_size = comp
		? REC_NODE_PTR_SIZE + 1 + REC_N_NEW_EXTRA_BYTES
		+ UT_BITS_IN_BYTES(index->n_nullable)
		: REC_NODE_PTR_SIZE + 2 + REC_N_OLD_EXTRA_BYTES
		+ 2 * index->n_fields;

	/* Compute the maximum possible record size. */
	for (ulint i = 0; i < dict_index_get_n_unique_in_tree(index); i++) {
		const dict_field_t*	field
			= dict_index_get_nth_field(index, i);
		const dict_col_t*	col
			= dict_field_get_col(field);
		ulint			field_max_size;
		ulint			field_ext_max_size;

		/* Determine the maximum length of the index field. */

		field_max_size = dict_col_get_fixed_size(col, comp);
		if (field_max_size) {
			/* dict_index_add_col() should guarantee this */
			ut_ad(!field->prefix_len
			      || field->fixed_len == field->prefix_len);
			/* Fixed lengths are not encoded
			in ROW_FORMAT=COMPACT. */
			rec_max_size += field_max_size;
			continue;
		}

		field_max_size = dict_col_get_max_size(col);
		if (UNIV_UNLIKELY(!field_max_size)) {
			switch (col->mtype) {
			case DATA_VARCHAR:
				if (!comp
				    && (!strcmp(index->table->name.m_name,
						"SYS_FOREIGN")
					|| !strcmp(index->table->name.m_name,
						   "SYS_FOREIGN_COLS"))) {
					break;
				}
				/* fall through */
			case DATA_VARMYSQL:
			case DATA_CHAR:
			case DATA_MYSQL:
				/* CHAR(0) and VARCHAR(0) are possible
				data type definitions in MariaDB.
				The InnoDB internal SQL parser maps
				CHAR to DATA_VARCHAR, so DATA_CHAR (or
				DATA_MYSQL) can only be coming from the
				MariaDB SQL layer. */
				if (comp) {
					/* Add a length byte, because
					fixed-length empty fields are
					encoded as variable-length.
					For ROW_FORMAT=REDUNDANT,
					these bytes were added to
					rec_max_size before this loop. */
					rec_max_size++;
				}
				continue;
			}

			/* SYS_FOREIGN.ID is defined as CHAR in the
			InnoDB internal SQL parser, which translates
			into the incorrect VARCHAR(0). InnoDB does
			not enforce maximum lengths of columns, so
			that is why any data can be inserted in the
			first place.

			Likewise, SYS_FOREIGN.FOR_NAME,
			SYS_FOREIGN.REF_NAME, SYS_FOREIGN_COLS.ID, are
			defined as CHAR, and also they are part of a key. */

			ut_ad(!strcmp(index->table->name.m_name,
				      "SYS_FOREIGN")
			      || !strcmp(index->table->name.m_name,
					 "SYS_FOREIGN_COLS"));
			ut_ad(!comp);
			ut_ad(col->mtype == DATA_VARCHAR);

			rec_max_size += (srv_page_size == UNIV_PAGE_SIZE_MAX)
				? REDUNDANT_REC_MAX_DATA_SIZE
				: page_get_free_space_of_empty(FALSE) / 2;
		} else if (field_max_size == NAME_LEN && i == 1
			   && (!strcmp(index->table->name.m_name,
				       TABLE_STATS_NAME)
			       || !strcmp(index->table->name.m_name,
					  INDEX_STATS_NAME))) {
			/* Interpret "table_name" as VARCHAR(199) even
			if it was incorrectly defined as VARCHAR(64).
			While the caller of ha_innobase enforces the
			maximum length on any data written, the InnoDB
			internal SQL parser will happily write as much
			data as is provided. The purpose of this hack
			is to avoid InnoDB hangs after persistent
			statistics on partitioned tables are
			deleted. */
			field_max_size = 199 * SYSTEM_CHARSET_MBMAXLEN;
		}
		field_ext_max_size = field_max_size < 256 ? 1 : 2;

		if (field->prefix_len
		    && field->prefix_len < field_max_size) {
			field_max_size = field->prefix_len;
		}

		if (comp) {
			/* Add the extra size for ROW_FORMAT=COMPACT.
			For ROW_FORMAT=REDUNDANT, these bytes were
			added to rec_max_size before this loop. */
			rec_max_size += field_ext_max_size;
		}

		rec_max_size += field_max_size;
	}

	return rec_max_size;
}

/********************************************************************//**
Searches an index tree and positions a tree cursor on a given level.
NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
to node pointer page number fields on the upper levels of the tree!
Note that if mode is PAGE_CUR_LE, which is used in inserts, then
cursor->up_match and cursor->low_match both will have sensible values.
If mode is PAGE_CUR_GE, then up_match will have a sensible value.

If mode is PAGE_CUR_LE, the cursor is left at the place where an insert of
the search tuple should be performed in the B-tree. InnoDB does an insert
immediately after the cursor. Thus, the cursor may end up on a user record,
or on a page infimum record. */
dberr_t
btr_cur_search_to_nth_level_func(
	dict_index_t*	index,	/*!< in: index */
	ulint		level,	/*!< in: the tree level of search */
	const dtuple_t*	tuple,	/*!< in: data tuple; NOTE: n_fields_cmp in
				tuple must be set so that it cannot get
				compared to the node ptr page number field! */
	page_cur_mode_t	mode,	/*!< in: PAGE_CUR_L, ...;
				Inserts should always be made using
				PAGE_CUR_LE to search the position! */
	ulint		latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with
				at most one of BTR_INSERT, BTR_DELETE_MARK,
				BTR_DELETE, or BTR_ESTIMATE;
				cursor->left_block is used to store a pointer
				to the left neighbor page, in the cases
				BTR_SEARCH_PREV and BTR_MODIFY_PREV;
				NOTE that if ahi_latch, we might not have a
				cursor page latch, we assume that ahi_latch
				protects the record! */
	btr_cur_t*	cursor, /*!< in/out: tree cursor; the cursor page is
				s- or x-latched, but see also above! */
#ifdef BTR_CUR_HASH_ADAPT
	rw_lock_t*	ahi_latch,
				/*!< in: currently held btr_search_latch
				(in RW_S_LATCH mode), or NULL */
#endif /* BTR_CUR_HASH_ADAPT */
	const char*	file,	/*!< in: file name */
	unsigned	line,	/*!< in: line where called */
	mtr_t*		mtr,	/*!< in: mtr */
	ib_uint64_t	autoinc)/*!< in: PAGE_ROOT_AUTO_INC to be written
				(0 if none) */
{
	page_t*		page = NULL; /* remove warning */
	buf_block_t*	block;
	buf_block_t*	guess;
	ulint		height;
	ulint		up_match;
	ulint		up_bytes;
	ulint		low_match;
	ulint		low_bytes;
	ulint		rw_latch;
	page_cur_mode_t	page_mode;
	page_cur_mode_t	search_mode = PAGE_CUR_UNSUPP;
	ulint		buf_mode;
	ulint		estimate;
	ulint		node_ptr_max_size = srv_page_size / 2;
	page_cur_t*	page_cursor;
	btr_op_t	btr_op;
	ulint		root_height = 0; /* remove warning */
	dberr_t		err = DB_SUCCESS;

	btr_intention_t	lock_intention;
	bool		modify_external;
	buf_block_t*	tree_blocks[BTR_MAX_LEVELS];
	ulint		tree_savepoints[BTR_MAX_LEVELS];
	ulint		n_blocks = 0;
	ulint		n_releases = 0;
	bool		detected_same_key_root = false;

	bool		retrying_for_search_prev = false;
	ulint		leftmost_from_level = 0;
	buf_block_t**	prev_tree_blocks = NULL;
	ulint*		prev_tree_savepoints = NULL;
	ulint		prev_n_blocks = 0;
	ulint		prev_n_releases = 0;
	bool		need_path = true;
	bool		rtree_parent_modified = false;
	bool		mbr_adj = false;
	bool		found = false;

	DBUG_ENTER("btr_cur_search_to_nth_level");

#ifdef BTR_CUR_ADAPT
	btr_search_t*	info;
#endif /* BTR_CUR_ADAPT */
	mem_heap_t*	heap		= NULL;
	rec_offs	offsets_[REC_OFFS_NORMAL_SIZE];
	rec_offs*	offsets		= offsets_;
	rec_offs	offsets2_[REC_OFFS_NORMAL_SIZE];
	rec_offs*	offsets2	= offsets2_;
	rec_offs_init(offsets_);
	rec_offs_init(offsets2_);
	/* Currently, PAGE_CUR_LE is the only search mode used for searches
	ending at upper levels. */

	ut_ad(level == 0 || mode == PAGE_CUR_LE
	      || RTREE_SEARCH_MODE(mode));
	ut_ad(dict_index_check_search_tuple(index, tuple));
	ut_ad(!dict_index_is_ibuf(index) || ibuf_inside(mtr));
	ut_ad(dtuple_check_typed(tuple));
	ut_ad(!(index->type & DICT_FTS));
	ut_ad(index->page != FIL_NULL);

	MEM_UNDEFINED(&cursor->up_match, sizeof cursor->up_match);
	MEM_UNDEFINED(&cursor->up_bytes, sizeof cursor->up_bytes);
	MEM_UNDEFINED(&cursor->low_match, sizeof cursor->low_match);
	MEM_UNDEFINED(&cursor->low_bytes, sizeof cursor->low_bytes);
#ifdef UNIV_DEBUG
	cursor->up_match = ULINT_UNDEFINED;
	cursor->low_match = ULINT_UNDEFINED;
#endif /* UNIV_DEBUG */

	ibool	s_latch_by_caller;

	s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;

	ut_ad(!s_latch_by_caller
	      || srv_read_only_mode
	      || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_S_LOCK
					    | MTR_MEMO_SX_LOCK));

	/* These flags are mutually exclusive, they are lumped together
	with the latch mode for historical reasons. It's possible for
	none of the flags to be set. */
	switch (UNIV_EXPECT(latch_mode
			    & (BTR_INSERT | BTR_DELETE | BTR_DELETE_MARK),
			    0)) {
	case 0:
		btr_op = BTR_NO_OP;
		break;
	case BTR_INSERT:
		btr_op = (latch_mode & BTR_IGNORE_SEC_UNIQUE)
			? BTR_INSERT_IGNORE_UNIQUE_OP
			: BTR_INSERT_OP;
		break;
	case BTR_DELETE:
		btr_op = BTR_DELETE_OP;
		ut_a(cursor->purge_node);
		break;
	case BTR_DELETE_MARK:
		btr_op = BTR_DELMARK_OP;
		break;
	default:
		/* only one of BTR_INSERT, BTR_DELETE, BTR_DELETE_MARK
		should be specified at a time */
		ut_error;
	}

	/* Operations on the insert buffer tree cannot be buffered. */
	ut_ad(btr_op == BTR_NO_OP || !dict_index_is_ibuf(index));
	/* Operations on the clustered index cannot be buffered. */
	ut_ad(btr_op == BTR_NO_OP || !dict_index_is_clust(index));
	/* Operations on the temporary table(indexes) cannot be buffered. */
	ut_ad(btr_op == BTR_NO_OP || !index->table->is_temporary());
	/* Operation on the spatial index cannot be buffered. */
	ut_ad(btr_op == BTR_NO_OP || !dict_index_is_spatial(index));

	estimate = latch_mode & BTR_ESTIMATE;

	lock_intention = btr_cur_get_and_clear_intention(&latch_mode);

	modify_external = latch_mode & BTR_MODIFY_EXTERNAL;

	/* Turn the flags unrelated to the latch mode off. */
	latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);

	ut_ad(!modify_external || latch_mode == BTR_MODIFY_LEAF);

	ut_ad(!s_latch_by_caller
	      || latch_mode == BTR_SEARCH_LEAF
	      || latch_mode == BTR_SEARCH_TREE
	      || latch_mode == BTR_MODIFY_LEAF);

	ut_ad(autoinc == 0 || dict_index_is_clust(index));
	ut_ad(autoinc == 0
	      || latch_mode == BTR_MODIFY_TREE
	      || latch_mode == BTR_MODIFY_LEAF);
	ut_ad(autoinc == 0 || level == 0);

	cursor->flag = BTR_CUR_BINARY;
	cursor->index = index;

#ifndef BTR_CUR_ADAPT
	guess = NULL;
#else
	info = btr_search_get_info(index);
	guess = info->root_guess;

#ifdef BTR_CUR_HASH_ADAPT

# ifdef UNIV_SEARCH_PERF_STAT
	info->n_searches++;
# endif
	if (autoinc == 0
	    && latch_mode <= BTR_MODIFY_LEAF
	    && info->last_hash_succ
# ifdef MYSQL_INDEX_DISABLE_AHI
	    && !index->disable_ahi
# endif
	    && !estimate
# ifdef PAGE_CUR_LE_OR_EXTENDS
	    && mode != PAGE_CUR_LE_OR_EXTENDS
# endif /* PAGE_CUR_LE_OR_EXTENDS */
	    && !dict_index_is_spatial(index)
	    /* If !ahi_latch, we do a dirty read of
	    btr_search_enabled below, and btr_search_guess_on_hash()
	    will have to check it again. */
	    && btr_search_enabled
	    && !modify_external
	    && !(tuple->info_bits & REC_INFO_MIN_REC_FLAG)
	    && btr_search_guess_on_hash(index, info, tuple, mode,
					latch_mode, cursor,
					ahi_latch, mtr)) {

		/* Search using the hash index succeeded */

		ut_ad(cursor->up_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_GE);
		ut_ad(cursor->up_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_LE);
		ut_ad(cursor->low_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_LE);
		btr_cur_n_sea++;

		DBUG_RETURN(err);
	}
# endif /* BTR_CUR_HASH_ADAPT */
#endif /* BTR_CUR_ADAPT */
	btr_cur_n_non_sea++;

	/* If the hash search did not succeed, do binary search down the
	tree */

#ifdef BTR_CUR_HASH_ADAPT
	if (ahi_latch) {
		/* Release possible search latch to obey latching order */
		rw_lock_s_unlock(ahi_latch);
	}
#endif /* BTR_CUR_HASH_ADAPT */

	/* Store the position of the tree latch we push to mtr so that we
	know how to release it when we have latched leaf node(s) */

	ulint savepoint = mtr_set_savepoint(mtr);

	rw_lock_type_t upper_rw_latch;

	switch (latch_mode) {
	case BTR_MODIFY_TREE:
		/* Most delete-intended operations are purges.
		Free blocks and read I/O bandwidth should be given
		priority to them when the history list is growing huge. */
		if (lock_intention == BTR_INTENTION_DELETE
		    && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
		    && buf_pool.n_pend_reads) {
x_latch_index:
			mtr_x_lock_index(index, mtr);
		} else if (index->is_spatial()
			   && lock_intention <= BTR_INTENTION_BOTH) {
			/* X-latch the index if there is a possibility of a
			pessimistic delete on a spatial index, as we could
			latch upward in the tree. */
			goto x_latch_index;
		} else {
			mtr_sx_lock_index(index, mtr);
		}
		upper_rw_latch = RW_X_LATCH;
		break;
	case BTR_CONT_MODIFY_TREE:
	case BTR_CONT_SEARCH_TREE:
		/* Do nothing */
		ut_ad(srv_read_only_mode
		      || mtr->memo_contains_flagged(&index->lock,
						    MTR_MEMO_X_LOCK
						    | MTR_MEMO_SX_LOCK));
		if (dict_index_is_spatial(index)
		    && latch_mode == BTR_CONT_MODIFY_TREE) {
			/* If we are about to locate the parent page for a
			split and/or merge operation on an R-Tree index,
			X-latch the parent. */
			upper_rw_latch = RW_X_LATCH;
		} else {
			upper_rw_latch = RW_NO_LATCH;
		}
		break;
	default:
		if (!srv_read_only_mode) {
			if (s_latch_by_caller) {
				ut_ad(rw_lock_own(dict_index_get_lock(index),
						  RW_LOCK_S));
			} else if (!modify_external) {
				/* BTR_SEARCH_TREE is intended to be used with
				BTR_ALREADY_S_LATCHED */
				ut_ad(latch_mode != BTR_SEARCH_TREE);

				mtr_s_lock_index(index, mtr);
			} else {
				/* BTR_MODIFY_EXTERNAL needs to be excluded */
				mtr_sx_lock_index(index, mtr);
			}
			upper_rw_latch = RW_S_LATCH;
		} else {
			upper_rw_latch = RW_NO_LATCH;
		}
	}
	const rw_lock_type_t root_leaf_rw_latch = btr_cur_latch_for_root_leaf(
		latch_mode);

	page_cursor = btr_cur_get_page_cur(cursor);

	const ulint zip_size = index->table->space->zip_size();

	/* Start with the root page. */
	page_id_t		page_id(index->table->space_id, index->page);

	if (root_leaf_rw_latch == RW_X_LATCH) {
		node_ptr_max_size = btr_node_ptr_max_size(index);
	}

	up_match = 0;
	up_bytes = 0;
	low_match = 0;
	low_bytes = 0;

	height = ULINT_UNDEFINED;

	/* We use these modified search modes on non-leaf levels of the
	B-tree. These let us end up in the right B-tree leaf. In that leaf
	we use the original search mode. */

	switch (mode) {
	case PAGE_CUR_GE:
		page_mode = PAGE_CUR_L;
		break;
	case PAGE_CUR_G:
		page_mode = PAGE_CUR_LE;
		break;
	default:
#ifdef PAGE_CUR_LE_OR_EXTENDS
		ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
		      || RTREE_SEARCH_MODE(mode)
		      || mode == PAGE_CUR_LE_OR_EXTENDS);
#else /* PAGE_CUR_LE_OR_EXTENDS */
		ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
		      || RTREE_SEARCH_MODE(mode));
#endif /* PAGE_CUR_LE_OR_EXTENDS */
		page_mode = mode;
		break;
	}
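	/* For example, a PAGE_CUR_GE search maps to PAGE_CUR_L on the
	non-leaf levels: a node pointer carries the smallest key of its
	child page, so with node pointers for keys 10 and 30, a search
	for >= 20 must descend through the pointer for 10, whose subtree
	covers [10, 30) and may contain matching records. (Illustrative
	numbers only.) */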

	/* Loop and search until we arrive at the desired level */
	btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}};

search_loop:
	buf_mode = BUF_GET;
	rw_latch = RW_NO_LATCH;
	rtree_parent_modified = false;

	if (height != 0) {
		/* We are about to fetch the root or a non-leaf page. */
		if ((latch_mode != BTR_MODIFY_TREE || height == level)
		    && !retrying_for_search_prev) {
			/* If we do not have an SX or X latch on the index,
			each page should be latched before reading. */
			if (height == ULINT_UNDEFINED
			    && upper_rw_latch == RW_S_LATCH
			    && (modify_external || autoinc)) {
				/* We need an SX latch on the root page
				for an fseg operation or for writing
				PAGE_ROOT_AUTO_INC. */
				rw_latch = RW_SX_LATCH;
			} else {
				rw_latch = upper_rw_latch;
			}
		}
	} else if (latch_mode <= BTR_MODIFY_LEAF) {
		rw_latch = latch_mode;

		if (btr_op != BTR_NO_OP
		    && ibuf_should_try(index, btr_op != BTR_INSERT_OP)) {

			/* Try to buffer the operation if the leaf
			page is not in the buffer pool. */

			buf_mode = btr_op == BTR_DELETE_OP
				? BUF_GET_IF_IN_POOL_OR_WATCH
				: BUF_GET_IF_IN_POOL;
		}
	}

retry_page_get:
	ut_ad(n_blocks < BTR_MAX_LEVELS);
	tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
	block = buf_page_get_gen(page_id, zip_size, rw_latch, guess,
				 buf_mode, file, line, mtr, &err,
				 height == 0 && !index->is_clust());
	tree_blocks[n_blocks] = block;

	/* Note that block==NULL signifies either an error or change
	buffering. */

	if (err != DB_SUCCESS) {
		ut_ad(block == NULL);
		if (err == DB_DECRYPTION_FAILED) {
			ib_push_warning((void *)NULL,
				DB_DECRYPTION_FAILED,
				"Table %s is encrypted but encryption service or"
				" used key_id is not available. "
				" Can't continue reading table.",
				index->table->name.m_name);
			index->table->file_unreadable = true;
		}

		goto func_exit;
	}

1639 if (block == NULL) {
1640 /* This must be a search to perform an insert/delete
1641 mark/ delete; try using the insert/delete buffer */
1642
1643 ut_ad(height == 0);
1644 ut_ad(cursor->thr);
1645
1646 switch (btr_op) {
1647 case BTR_INSERT_OP:
1648 case BTR_INSERT_IGNORE_UNIQUE_OP:
1649 ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
1650 ut_ad(!dict_index_is_spatial(index));
1651
1652 if (ibuf_insert(IBUF_OP_INSERT, tuple, index,
1653 page_id, zip_size, cursor->thr)) {
1654
1655 cursor->flag = BTR_CUR_INSERT_TO_IBUF;
1656
1657 goto func_exit;
1658 }
1659 break;
1660
1661 case BTR_DELMARK_OP:
1662 ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
1663 ut_ad(!dict_index_is_spatial(index));
1664
1665 if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple,
1666 index, page_id, zip_size,
1667 cursor->thr)) {
1668
1669 cursor->flag = BTR_CUR_DEL_MARK_IBUF;
1670
1671 goto func_exit;
1672 }
1673
1674 break;
1675
1676 case BTR_DELETE_OP:
1677 ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH);
1678 ut_ad(!dict_index_is_spatial(index));
1679
1680 if (!row_purge_poss_sec(cursor->purge_node,
1681 index, tuple)) {
1682
1683 /* The record cannot be purged yet. */
1684 cursor->flag = BTR_CUR_DELETE_REF;
1685 } else if (ibuf_insert(IBUF_OP_DELETE, tuple,
1686 index, page_id, zip_size,
1687 cursor->thr)) {
1688
1689 /* The purge was buffered. */
1690 cursor->flag = BTR_CUR_DELETE_IBUF;
1691 } else {
1692 /* The purge could not be buffered. */
1693 buf_pool.watch_unset(page_id);
1694 break;
1695 }
1696
1697 buf_pool.watch_unset(page_id);
1698 goto func_exit;
1699
1700 default:
1701 ut_error;
1702 }
1703
1704 /* The insert to the insert/delete buffer did not succeed;
1705 we must read the page from disk. */
1706
1707 buf_mode = BUF_GET;
1708
1709 goto retry_page_get;
1710 }
1711
1712 if (retrying_for_search_prev && height != 0) {
1713 /* also latch left sibling */
1714 uint32_t left_page_no;
1715 buf_block_t* get_block;
1716
1717 ut_ad(rw_latch == RW_NO_LATCH);
1718
1719 rw_latch = upper_rw_latch;
1720
1721 rw_lock_s_lock(&block->lock);
1722 left_page_no = btr_page_get_prev(buf_block_get_frame(block));
1723 rw_lock_s_unlock(&block->lock);
1724
1725 if (left_page_no != FIL_NULL) {
1726 ut_ad(prev_n_blocks < leftmost_from_level);
1727
1728 prev_tree_savepoints[prev_n_blocks]
1729 = mtr_set_savepoint(mtr);
1730 get_block = buf_page_get_gen(
1731 page_id_t(page_id.space(), left_page_no),
1732 zip_size, rw_latch, NULL, buf_mode,
1733 file, line, mtr, &err);
1734 prev_tree_blocks[prev_n_blocks] = get_block;
1735 prev_n_blocks++;
1736
1737 if (err != DB_SUCCESS) {
1738 if (err == DB_DECRYPTION_FAILED) {
1739 ib_push_warning((void *)NULL,
1740 DB_DECRYPTION_FAILED,
1741 "Table %s is encrypted but encryption service or"
1742 " used key_id is not available. "
1743 " Can't continue reading table.",
1744 index->table->name.m_name);
1745 index->table->file_unreadable = true;
1746 }
1747
1748 goto func_exit;
1749 }
1750
1751 /* BTR_MODIFY_TREE does not update prev/next_page_no
1752 without holding the parent page's lock. So there is no
1753 need to retry here, because we hold the parent page's lock. */
1754 }
1755
1756 /* release RW_NO_LATCH page and lock with RW_S_LATCH */
1757 mtr_release_block_at_savepoint(
1758 mtr, tree_savepoints[n_blocks],
1759 tree_blocks[n_blocks]);
1760
1761 tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
1762 block = buf_page_get_gen(page_id, zip_size,
1763 rw_latch, NULL, buf_mode,
1764 file, line, mtr, &err);
1765 tree_blocks[n_blocks] = block;
1766
1767 if (err != DB_SUCCESS) {
1768 if (err == DB_DECRYPTION_FAILED) {
1769 ib_push_warning((void *)NULL,
1770 DB_DECRYPTION_FAILED,
1771 "Table %s is encrypted but encryption service or"
1772 " used key_id is not available. "
1773 " Can't continue reading table.",
1774 index->table->name.m_name);
1775 index->table->file_unreadable = true;
1776 }
1777
1778 goto func_exit;
1779 }
1780 }
1781
1782 page = buf_block_get_frame(block);
1783
1784 if (height == ULINT_UNDEFINED
1785 && page_is_leaf(page)
1786 && rw_latch != RW_NO_LATCH
1787 && rw_latch != root_leaf_rw_latch) {
1788 /* The root page is also a leaf page (root_leaf).
1789 We should reacquire the page, because the root page
1790 is latched differently from leaf pages. */
1791 ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
1792 ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_SX_LATCH);
1793 ut_ad(rw_latch == RW_S_LATCH || modify_external || autoinc);
1794 ut_ad(!autoinc || root_leaf_rw_latch == RW_X_LATCH);
1795
1796 ut_ad(n_blocks == 0);
1797 mtr_release_block_at_savepoint(
1798 mtr, tree_savepoints[n_blocks],
1799 tree_blocks[n_blocks]);
1800
1801 upper_rw_latch = root_leaf_rw_latch;
1802 goto search_loop;
1803 }
1804
1805 if (rw_latch != RW_NO_LATCH) {
1806 #ifdef UNIV_ZIP_DEBUG
1807 const page_zip_des_t* page_zip
1808 = buf_block_get_page_zip(block);
1809 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
1810 #endif /* UNIV_ZIP_DEBUG */
1811
1812 buf_block_dbg_add_level(
1813 block, dict_index_is_ibuf(index)
1814 ? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE);
1815 }
1816
1817 ut_ad(fil_page_index_page_check(page));
1818 ut_ad(index->id == btr_page_get_index_id(page));
1819
1820 if (height == ULINT_UNDEFINED) {
1821 /* We are in the root node */
1822
1823 height = btr_page_get_level(page);
1824 root_height = height;
1825 cursor->tree_height = root_height + 1;
1826
1827 if (dict_index_is_spatial(index)) {
1828 ut_ad(cursor->rtr_info);
1829
1830 /* If SSN in memory is not initialized, fetch
1831 it from root page */
1832 if (!rtr_get_current_ssn_id(index)) {
1833 /* FIXME: do this in dict_load_table_one() */
1834 index->set_ssn(page_get_ssn_id(page) + 1);
1835 }
1836
1837 /* Save the MBR */
1838 cursor->rtr_info->thr = cursor->thr;
1839 rtr_get_mbr_from_tuple(tuple, &cursor->rtr_info->mbr);
1840 }
1841
1842 #ifdef BTR_CUR_ADAPT
1843 info->root_guess = block;
1844 #endif
1845 }
1846
1847 if (height == 0) {
1848 if (rw_latch == RW_NO_LATCH) {
1849 latch_leaves = btr_cur_latch_leaves(
1850 block, latch_mode, cursor, mtr);
1851 }
1852
1853 switch (latch_mode) {
1854 case BTR_MODIFY_TREE:
1855 case BTR_CONT_MODIFY_TREE:
1856 case BTR_CONT_SEARCH_TREE:
1857 break;
1858 default:
1859 if (!s_latch_by_caller
1860 && !srv_read_only_mode
1861 && !modify_external) {
1862 /* Release the tree s-latch */
1863 /* NOTE: BTR_MODIFY_EXTERNAL
1864 needs to keep tree sx-latch */
1865 mtr_release_s_latch_at_savepoint(
1866 mtr, savepoint,
1867 dict_index_get_lock(index));
1868 }
1869
1870 /* release upper blocks */
1871 if (retrying_for_search_prev) {
1872 ut_ad(!autoinc);
1873 for (;
1874 prev_n_releases < prev_n_blocks;
1875 prev_n_releases++) {
1876 mtr_release_block_at_savepoint(
1877 mtr,
1878 prev_tree_savepoints[
1879 prev_n_releases],
1880 prev_tree_blocks[
1881 prev_n_releases]);
1882 }
1883 }
1884
1885 for (; n_releases < n_blocks; n_releases++) {
1886 if (n_releases == 0
1887 && (modify_external || autoinc)) {
1888 /* keep the root page latch */
1889 ut_ad(mtr->memo_contains_flagged(
1890 tree_blocks[n_releases],
1891 MTR_MEMO_PAGE_SX_FIX
1892 | MTR_MEMO_PAGE_X_FIX));
1893 continue;
1894 }
1895
1896 mtr_release_block_at_savepoint(
1897 mtr, tree_savepoints[n_releases],
1898 tree_blocks[n_releases]);
1899 }
1900 }
1901
1902 page_mode = mode;
1903 }
1904
1905 if (dict_index_is_spatial(index)) {
1906 /* Remember the page search mode */
1907 search_mode = page_mode;
1908
1909 /* Adjust the search mode when the page search
1910 mode is PAGE_CUR_RTREE_LOCATE or PAGE_CUR_RTREE_INSERT,
1911 as we are searching with MBRs. When we are not
1912 yet at the target level, we should search all
1913 sub-trees that "CONTAIN" the search range/MBR.
1914 At the target level, the search becomes
1915 PAGE_CUR_LE */
1916 if (page_mode == PAGE_CUR_RTREE_LOCATE
1917 && level == height) {
1918 if (level == 0) {
1919 page_mode = PAGE_CUR_LE;
1920 } else {
1921 page_mode = PAGE_CUR_RTREE_GET_FATHER;
1922 }
1923 }
1924
1925 if (page_mode == PAGE_CUR_RTREE_INSERT) {
1926 page_mode = (level == height)
1927 ? PAGE_CUR_LE
1928 : PAGE_CUR_RTREE_INSERT;
1929
1930 ut_ad(!page_is_leaf(page) || page_mode == PAGE_CUR_LE);
1931 }
1932
1933 /* "need_path" indicates if we need to tracking the parent
1934 pages, if it is not spatial comparison, then no need to
1935 track it */
1936 if (page_mode < PAGE_CUR_CONTAIN) {
1937 need_path = false;
1938 }
1939
1940 up_match = 0;
1941 low_match = 0;
1942
1943 if (latch_mode == BTR_MODIFY_TREE
1944 || latch_mode == BTR_CONT_MODIFY_TREE
1945 || latch_mode == BTR_CONT_SEARCH_TREE) {
1946 /* The tree is locked; no page lock is needed to protect
1947 the "path" */
1948 cursor->rtr_info->need_page_lock = false;
1949 }
1950 }
1951
1952 if (dict_index_is_spatial(index) && page_mode >= PAGE_CUR_CONTAIN) {
1953 ut_ad(need_path);
1954 found = rtr_cur_search_with_match(
1955 block, index, tuple, page_mode, page_cursor,
1956 cursor->rtr_info);
1957
1958 /* Need to use BTR_MODIFY_TREE to do the MBR adjustment */
1959 if (search_mode == PAGE_CUR_RTREE_INSERT
1960 && cursor->rtr_info->mbr_adj) {
1961 if (latch_mode & BTR_MODIFY_LEAF) {
1962 /* The parent MBR needs to be updated; retry
1963 with BTR_MODIFY_TREE */
1964 goto func_exit;
1965 } else if (latch_mode & BTR_MODIFY_TREE) {
1966 rtree_parent_modified = true;
1967 cursor->rtr_info->mbr_adj = false;
1968 mbr_adj = true;
1969 } else {
1970 ut_ad(0);
1971 }
1972 }
1973
1974 if (found && page_mode == PAGE_CUR_RTREE_GET_FATHER) {
1975 cursor->low_match =
1976 DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1;
1977 }
1978 #ifdef BTR_CUR_HASH_ADAPT
1979 } else if (height == 0 && btr_search_enabled
1980 && !(tuple->info_bits & REC_INFO_MIN_REC_FLAG)
1981 && !dict_index_is_spatial(index)) {
1982 /* The adaptive hash index is only used when searching
1983 for leaf pages (height==0), but not in r-trees.
1984 We only need the byte prefix comparison for the purpose
1985 of updating the adaptive hash index. */
1986 page_cur_search_with_match_bytes(
1987 block, index, tuple, page_mode, &up_match, &up_bytes,
1988 &low_match, &low_bytes, page_cursor);
1989 #endif /* BTR_CUR_HASH_ADAPT */
1990 } else {
1991 /* Search for complete index fields. */
1992 up_bytes = low_bytes = 0;
1993 page_cur_search_with_match(
1994 block, index, tuple, page_mode, &up_match,
1995 &low_match, page_cursor,
1996 need_path ? cursor->rtr_info : NULL);
1997 }
1998
1999 if (estimate) {
2000 btr_cur_add_path_info(cursor, height, root_height);
2001 }
2002
2003 /* If this is the desired level, leave the loop */
2004
2005 ut_ad(height == btr_page_get_level(page_cur_get_page(page_cursor)));
2006
2007 /* Add a predicate lock under serializable isolation,
2008 and only in the search case */
2009 if (dict_index_is_spatial(index)
2010 && cursor->rtr_info->need_prdt_lock
2011 && mode != PAGE_CUR_RTREE_INSERT
2012 && mode != PAGE_CUR_RTREE_LOCATE
2013 && mode >= PAGE_CUR_CONTAIN) {
2014 trx_t* trx = thr_get_trx(cursor->thr);
2015 lock_prdt_t prdt;
2016
2017 lock_mutex_enter();
2018 lock_init_prdt_from_mbr(
2019 &prdt, &cursor->rtr_info->mbr, mode,
2020 trx->lock.lock_heap);
2021 lock_mutex_exit();
2022
2023 if (rw_latch == RW_NO_LATCH && height != 0) {
2024 rw_lock_s_lock(&(block->lock));
2025 }
2026
2027 lock_prdt_lock(block, &prdt, index, LOCK_S,
2028 LOCK_PREDICATE, cursor->thr);
2029
2030 if (rw_latch == RW_NO_LATCH && height != 0) {
2031 rw_lock_s_unlock(&(block->lock));
2032 }
2033 }
2034
2035 if (level != height) {
2036
2037 const rec_t* node_ptr;
2038 ut_ad(height > 0);
2039
2040 height--;
2041 guess = NULL;
2042
2043 node_ptr = page_cur_get_rec(page_cursor);
2044
2045 offsets = rec_get_offsets(node_ptr, index, offsets, 0,
2046 ULINT_UNDEFINED, &heap);
2047
2048 /* If the record is the first or the last in the page and
2049 we hold a pessimistic delete intention, it might require a
2050 node_ptr insert at the upper level. We should change the intention and retry.
2051 */
2052 if (latch_mode == BTR_MODIFY_TREE
2053 && btr_cur_need_opposite_intention(
2054 page, lock_intention, node_ptr)) {
2055
2056 need_opposite_intention:
2057 ut_ad(upper_rw_latch == RW_X_LATCH);
2058
2059 if (n_releases > 0) {
2060 /* release root block */
2061 mtr_release_block_at_savepoint(
2062 mtr, tree_savepoints[0],
2063 tree_blocks[0]);
2064 }
2065
2066 /* release all blocks */
2067 for (; n_releases <= n_blocks; n_releases++) {
2068 mtr_release_block_at_savepoint(
2069 mtr, tree_savepoints[n_releases],
2070 tree_blocks[n_releases]);
2071 }
2072
2073 lock_intention = BTR_INTENTION_BOTH;
2074
2075 page_id.set_page_no(index->page);
2076 up_match = 0;
2077 low_match = 0;
2078 height = ULINT_UNDEFINED;
2079
2080 n_blocks = 0;
2081 n_releases = 0;
2082
2083 goto search_loop;
2084 }
2085
2086 if (dict_index_is_spatial(index)) {
2087 if (page_rec_is_supremum(node_ptr)) {
2088 cursor->low_match = 0;
2089 cursor->up_match = 0;
2090 goto func_exit;
2091 }
2092
2093 /* If we are doing insertion or record locating,
2094 remember the tree nodes we visited */
2095 if (page_mode == PAGE_CUR_RTREE_INSERT
2096 || (search_mode == PAGE_CUR_RTREE_LOCATE
2097 && (latch_mode != BTR_MODIFY_LEAF))) {
2098 bool add_latch = false;
2099
2100 if (latch_mode == BTR_MODIFY_TREE
2101 && rw_latch == RW_NO_LATCH) {
2102 ut_ad(mtr->memo_contains_flagged(
2103 &index->lock, MTR_MEMO_X_LOCK
2104 | MTR_MEMO_SX_LOCK));
2105 rw_lock_s_lock(&block->lock);
2106 add_latch = true;
2107 }
2108
2109 /* Store the parent cursor location */
2110 #ifdef UNIV_DEBUG
2111 ulint num_stored = rtr_store_parent_path(
2112 block, cursor, latch_mode,
2113 height + 1, mtr);
2114 #else
2115 rtr_store_parent_path(
2116 block, cursor, latch_mode,
2117 height + 1, mtr);
2118 #endif
2119
2120 if (page_mode == PAGE_CUR_RTREE_INSERT) {
2121 btr_pcur_t* r_cursor =
2122 rtr_get_parent_cursor(
2123 cursor, height + 1,
2124 true);
2125 /* For an insertion, there should
2126 be only one parent cursor for each
2127 level traversed */
2128 #ifdef UNIV_DEBUG
2129 ut_ad(num_stored == 1);
2130 #endif
2131
2132 node_ptr = btr_pcur_get_rec(r_cursor);
2133
2134 }
2135
2136 if (add_latch) {
2137 rw_lock_s_unlock(&block->lock);
2138 }
2139
2140 ut_ad(!page_rec_is_supremum(node_ptr));
2141 }
2142
2143 ut_ad(page_mode == search_mode
2144 || (page_mode == PAGE_CUR_WITHIN
2145 && search_mode == PAGE_CUR_RTREE_LOCATE));
2146
2147 page_mode = search_mode;
2148 }
2149
2150 /* If the node pointer is the first or the last record of
2151 the page, or has the same key value as the first or last
2152 record, another page might be chosen under BTR_CONT_MODIFY_TREE.
2153 So the parent page should not be released, to avoid a deadlock
2154 from blocking another search with the same key value. */
2155 if (!detected_same_key_root
2156 && lock_intention == BTR_INTENTION_BOTH
2157 && !dict_index_is_unique(index)
2158 && latch_mode == BTR_MODIFY_TREE
2159 && (up_match >= rec_offs_n_fields(offsets) - 1
2160 || low_match >= rec_offs_n_fields(offsets) - 1)) {
2161 const rec_t* first_rec = page_rec_get_next_const(
2162 page_get_infimum_rec(page));
2163 ulint matched_fields;
2164
2165 ut_ad(upper_rw_latch == RW_X_LATCH);
2166
2167 if (node_ptr == first_rec
2168 || page_rec_is_last(node_ptr, page)) {
2169 detected_same_key_root = true;
2170 } else {
2171 matched_fields = 0;
2172
2173 offsets2 = rec_get_offsets(
2174 first_rec, index, offsets2,
2175 0, ULINT_UNDEFINED, &heap);
2176 cmp_rec_rec(node_ptr, first_rec,
2177 offsets, offsets2, index, false,
2178 &matched_fields);
2179
2180 if (matched_fields
2181 >= rec_offs_n_fields(offsets) - 1) {
2182 detected_same_key_root = true;
2183 } else {
2184 const rec_t* last_rec;
2185
2186 last_rec = page_rec_get_prev_const(
2187 page_get_supremum_rec(page));
2188
2189 matched_fields = 0;
2190
2191 offsets2 = rec_get_offsets(
2192 last_rec, index, offsets2,
2193 0, ULINT_UNDEFINED, &heap);
2194 cmp_rec_rec(
2195 node_ptr, last_rec,
2196 offsets, offsets2, index,
2197 false, &matched_fields);
2198 if (matched_fields
2199 >= rec_offs_n_fields(offsets) - 1) {
2200 detected_same_key_root = true;
2201 }
2202 }
2203 }
2204 }
2205
2206 /* If operating on this page might modify the tree,
2207 we should not release the parent page's lock. */
2208 if (!detected_same_key_root
2209 && latch_mode == BTR_MODIFY_TREE
2210 && !btr_cur_will_modify_tree(
2211 index, page, lock_intention, node_ptr,
2212 node_ptr_max_size, zip_size, mtr)
2213 && !rtree_parent_modified) {
2214 ut_ad(upper_rw_latch == RW_X_LATCH);
2215 ut_ad(n_releases <= n_blocks);
2216
2217 /* we can release upper blocks */
2218 for (; n_releases < n_blocks; n_releases++) {
2219 if (n_releases == 0) {
2220 /* do not release the root page,
2221 to keep it pinned to the same block */
2222 continue;
2223 }
2224
2225 /* release unused blocks to unpin */
2226 mtr_release_block_at_savepoint(
2227 mtr, tree_savepoints[n_releases],
2228 tree_blocks[n_releases]);
2229 }
2230 }
2231
2232 if (height == level
2233 && latch_mode == BTR_MODIFY_TREE) {
2234 ut_ad(upper_rw_latch == RW_X_LATCH);
2235 /* sx-latch the root page if it was already released;
2236 it contains the segment header. */
2237 if (n_releases > 0) {
2238 mtr_block_sx_latch_at_savepoint(
2239 mtr, tree_savepoints[0],
2240 tree_blocks[0]);
2241 }
2242
2243 /* x-latch the branch blocks not released yet. */
2244 for (ulint i = n_releases; i <= n_blocks; i++) {
2245 mtr_block_x_latch_at_savepoint(
2246 mtr, tree_savepoints[i],
2247 tree_blocks[i]);
2248 }
2249 }
2250
2251 /* We should consider the prev_page of the parent page if the
2252 node_ptr is the leftmost record of the page, because BTR_SEARCH_PREV
2253 and BTR_MODIFY_PREV latch the prev_page of the leaf page. */
2254 if ((latch_mode == BTR_SEARCH_PREV
2255 || latch_mode == BTR_MODIFY_PREV)
2256 && !retrying_for_search_prev) {
2257 /* block should be latched for consistent
2258 btr_page_get_prev() */
2259 ut_ad(mtr->memo_contains_flagged(
2260 block, MTR_MEMO_PAGE_S_FIX
2261 | MTR_MEMO_PAGE_X_FIX));
2262
2263 if (page_has_prev(page)
2264 && page_rec_is_first(node_ptr, page)) {
2265
2266 if (leftmost_from_level == 0) {
2267 leftmost_from_level = height + 1;
2268 }
2269 } else {
2270 leftmost_from_level = 0;
2271 }
2272
2273 if (height == 0 && leftmost_from_level > 0) {
2274 /* retry, also fetching prev_page
2275 from level == leftmost_from_level */
2276 retrying_for_search_prev = true;
2277
2278 prev_tree_blocks = static_cast<buf_block_t**>(
2279 ut_malloc_nokey(sizeof(buf_block_t*)
2280 * leftmost_from_level));
2281
2282 prev_tree_savepoints = static_cast<ulint*>(
2283 ut_malloc_nokey(sizeof(ulint)
2284 * leftmost_from_level));
2285
2286 /* back to the level (leftmost_from_level+1) */
2287 ulint idx = n_blocks
2288 - (leftmost_from_level - 1);
2289
2290 page_id.set_page_no(
2291 tree_blocks[idx]->page.id().page_no());
2292
2293 for (ulint i = n_blocks
2294 - (leftmost_from_level - 1);
2295 i <= n_blocks; i++) {
2296 mtr_release_block_at_savepoint(
2297 mtr, tree_savepoints[i],
2298 tree_blocks[i]);
2299 }
2300
2301 n_blocks -= (leftmost_from_level - 1);
2302 height = leftmost_from_level;
2303 ut_ad(n_releases == 0);
2304
2305 /* replay up_match, low_match */
2306 up_match = 0;
2307 low_match = 0;
2308 rtr_info_t* rtr_info = need_path
2309 ? cursor->rtr_info : NULL;
2310
2311 for (ulint i = 0; i < n_blocks; i++) {
2312 page_cur_search_with_match(
2313 tree_blocks[i], index, tuple,
2314 page_mode, &up_match,
2315 &low_match, page_cursor,
2316 rtr_info);
2317 }
2318
2319 goto search_loop;
2320 }
2321 }
2322
2323 /* Go to the child node */
2324 page_id.set_page_no(
2325 btr_node_ptr_get_child_page_no(node_ptr, offsets));
2326
2327 n_blocks++;
2328
2329 if (UNIV_UNLIKELY(height == 0 && dict_index_is_ibuf(index))) {
2330 /* We're doing a search on an ibuf tree and we're one
2331 level above the leaf page. */
2332
2333 ut_ad(level == 0);
2334
2335 buf_mode = BUF_GET;
2336 rw_latch = RW_NO_LATCH;
2337 goto retry_page_get;
2338 }
2339
2340 if (dict_index_is_spatial(index)
2341 && page_mode >= PAGE_CUR_CONTAIN
2342 && page_mode != PAGE_CUR_RTREE_INSERT) {
2343 ut_ad(need_path);
2344 rtr_node_path_t* path =
2345 cursor->rtr_info->path;
2346
2347 if (!path->empty() && found) {
2348 ut_ad(path->back().page_no
2349 == page_id.page_no());
2350 path->pop_back();
2351 #ifdef UNIV_DEBUG
2352 if (page_mode == PAGE_CUR_RTREE_LOCATE
2353 && (latch_mode != BTR_MODIFY_LEAF)) {
2354 btr_pcur_t* cur
2355 = cursor->rtr_info->parent_path->back(
2356 ).cursor;
2357 rec_t* my_node_ptr
2358 = btr_pcur_get_rec(cur);
2359
2360 offsets = rec_get_offsets(
2361 my_node_ptr, index, offsets,
2362 0, ULINT_UNDEFINED, &heap);
2363
2364 ulint my_page_no
2365 = btr_node_ptr_get_child_page_no(
2366 my_node_ptr, offsets);
2367
2368 ut_ad(page_id.page_no() == my_page_no);
2369 }
2370 #endif
2371 }
2372 }
2373
2374 goto search_loop;
2375 } else if (!dict_index_is_spatial(index)
2376 && latch_mode == BTR_MODIFY_TREE
2377 && lock_intention == BTR_INTENTION_INSERT
2378 && page_has_next(page)
2379 && page_rec_is_last(page_cur_get_rec(page_cursor), page)) {
2380
2381 /* btr_insert_into_right_sibling() might delete
2382 the node_ptr at the upper level */
2383
2384 guess = NULL;
2385
2386 if (height == 0) {
2387 /* release the leaf pages if latched */
2388 for (uint i = 0; i < 3; i++) {
2389 if (latch_leaves.blocks[i] != NULL) {
2390 mtr_release_block_at_savepoint(
2391 mtr, latch_leaves.savepoints[i],
2392 latch_leaves.blocks[i]);
2393 latch_leaves.blocks[i] = NULL;
2394 }
2395 }
2396 }
2397
2398 goto need_opposite_intention;
2399 }
2400
2401 if (level != 0) {
2402 ut_ad(!autoinc);
2403
2404 if (upper_rw_latch == RW_NO_LATCH) {
2405 ut_ad(latch_mode == BTR_CONT_MODIFY_TREE
2406 || latch_mode == BTR_CONT_SEARCH_TREE);
2407 buf_block_t* child_block = btr_block_get(
2408 *index, page_id.page_no(),
2409 latch_mode == BTR_CONT_MODIFY_TREE
2410 ? RW_X_LATCH : RW_SX_LATCH, false, mtr);
2411 btr_assert_not_corrupted(child_block, index);
2412 } else {
2413 ut_ad(mtr->memo_contains_flagged(block,
2414 upper_rw_latch));
2415 btr_assert_not_corrupted(block, index);
2416
2417 if (s_latch_by_caller) {
2418 ut_ad(latch_mode == BTR_SEARCH_TREE);
2419 /* to exclude tree-modifying operations,
2420 the index must be sx-latched */
2421 ut_ad(mtr->memo_contains(index->lock,
2422 MTR_MEMO_SX_LOCK));
2423 /* because the index sx-latch is held,
2424 the upper blocks can be released */
2425 for (; n_releases < n_blocks; n_releases++) {
2426 mtr_release_block_at_savepoint(
2427 mtr,
2428 tree_savepoints[n_releases],
2429 tree_blocks[n_releases]);
2430 }
2431 }
2432 }
2433
2434 if (page_mode <= PAGE_CUR_LE) {
2435 cursor->low_match = low_match;
2436 cursor->up_match = up_match;
2437 }
2438 } else {
2439 cursor->low_match = low_match;
2440 cursor->low_bytes = low_bytes;
2441 cursor->up_match = up_match;
2442 cursor->up_bytes = up_bytes;
2443
2444 if (autoinc) {
2445 page_set_autoinc(tree_blocks[0], autoinc, mtr, false);
2446 }
2447
2448 #ifdef BTR_CUR_HASH_ADAPT
2449 /* We do a dirty read of btr_search_enabled here. We
2450 will properly check btr_search_enabled again in
2451 btr_search_build_page_hash_index() before building a
2452 page hash index, while holding search latch. */
2453 if (!btr_search_enabled) {
2454 # ifdef MYSQL_INDEX_DISABLE_AHI
2455 } else if (index->disable_ahi) {
2456 # endif
2457 } else if (tuple->info_bits & REC_INFO_MIN_REC_FLAG) {
2458 ut_ad(index->is_instant());
2459 /* This may be a search tuple for
2460 btr_pcur_restore_position(). */
2461 ut_ad(tuple->is_metadata()
2462 || (tuple->is_metadata(tuple->info_bits
2463 ^ REC_STATUS_INSTANT)));
2464 } else if (rec_is_metadata(btr_cur_get_rec(cursor), *index)) {
2465 /* Only user records belong in the adaptive
2466 hash index. */
2467 } else {
2468 btr_search_info_update(index, cursor);
2469 }
2470 #endif /* BTR_CUR_HASH_ADAPT */
2471 ut_ad(cursor->up_match != ULINT_UNDEFINED
2472 || mode != PAGE_CUR_GE);
2473 ut_ad(cursor->up_match != ULINT_UNDEFINED
2474 || mode != PAGE_CUR_LE);
2475 ut_ad(cursor->low_match != ULINT_UNDEFINED
2476 || mode != PAGE_CUR_LE);
2477 }
2478
2479 /* For spatial index, remember what blocks are still latched */
2480 if (dict_index_is_spatial(index)
2481 && (latch_mode == BTR_MODIFY_TREE
2482 || latch_mode == BTR_MODIFY_LEAF)) {
2483 for (ulint i = 0; i < n_releases; i++) {
2484 cursor->rtr_info->tree_blocks[i] = NULL;
2485 cursor->rtr_info->tree_savepoints[i] = 0;
2486 }
2487
2488 for (ulint i = n_releases; i <= n_blocks; i++) {
2489 cursor->rtr_info->tree_blocks[i] = tree_blocks[i];
2490 cursor->rtr_info->tree_savepoints[i] = tree_savepoints[i];
2491 }
2492 }
2493
2494 func_exit:
2495
2496 if (UNIV_LIKELY_NULL(heap)) {
2497 mem_heap_free(heap);
2498 }
2499
2500 if (retrying_for_search_prev) {
2501 ut_free(prev_tree_blocks);
2502 ut_free(prev_tree_savepoints);
2503 }
2504
2505 if (mbr_adj) {
2506 /* remember that we will need to adjust parent MBR */
2507 cursor->rtr_info->mbr_adj = true;
2508 }
2509
2510 #ifdef BTR_CUR_HASH_ADAPT
2511 if (ahi_latch) {
2512 rw_lock_s_lock(ahi_latch);
2513 }
2514 #endif /* BTR_CUR_HASH_ADAPT */
2515
2516 DBUG_RETURN(err);
2517 }
2518
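/* A hedged, self-contained sketch (#if 0, so not part of the build) of
the per-level savepoint bookkeeping used by the search above: one mtr
savepoint is taken before each page fetch, so that a latched ancestor
can later be released individually once the child is known not to
trigger a tree structure change. The helpers fetch_block(),
may_modify_tree() and child_page_id() are hypothetical stand-ins. */
#if 0
static void descend_sketch(mtr_t* mtr, page_id_t page_id, ulint height)
{
	ulint		savepoints[BTR_MAX_LEVELS];
	buf_block_t*	blocks[BTR_MAX_LEVELS];
	ulint		n = 0;
	ulint		n_released = 1;	/* never release the root */

	while (height--) {
		savepoints[n] = mtr_set_savepoint(mtr);
		blocks[n] = fetch_block(page_id, mtr);	/* hypothetical */
		n++;

		if (n > 1 && !may_modify_tree(blocks[n - 1])) {
			/* The child is "safe": release the still
			latched ancestors, except the pinned root. */
			for (; n_released < n - 1; n_released++) {
				mtr_release_block_at_savepoint(
					mtr, savepoints[n_released],
					blocks[n_released]);
			}
		}

		page_id = child_page_id(blocks[n - 1]);	/* hypothetical */
	}
}
#endif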
2519 /*****************************************************************//**
2520 Opens a cursor at either end of an index. */
2521 dberr_t
2522 btr_cur_open_at_index_side_func(
2523 /*============================*/
2524 bool from_left, /*!< in: true if open to the low end,
2525 false if to the high end */
2526 dict_index_t* index, /*!< in: index */
2527 ulint latch_mode, /*!< in: latch mode */
2528 btr_cur_t* cursor, /*!< in/out: cursor */
2529 ulint level, /*!< in: level to search for
2530 (0=leaf). */
2531 const char* file, /*!< in: file name */
2532 unsigned line, /*!< in: line where called */
2533 mtr_t* mtr) /*!< in/out: mini-transaction */
2534 {
2535 page_cur_t* page_cursor;
2536 ulint node_ptr_max_size = srv_page_size / 2;
2537 ulint height;
2538 ulint root_height = 0; /* remove warning */
2539 rec_t* node_ptr;
2540 ulint estimate;
2541 btr_intention_t lock_intention;
2542 buf_block_t* tree_blocks[BTR_MAX_LEVELS];
2543 ulint tree_savepoints[BTR_MAX_LEVELS];
2544 ulint n_blocks = 0;
2545 ulint n_releases = 0;
2546 mem_heap_t* heap = NULL;
2547 rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
2548 rec_offs* offsets = offsets_;
2549 dberr_t err = DB_SUCCESS;
2550
2551 rec_offs_init(offsets_);
2552
2553 estimate = latch_mode & BTR_ESTIMATE;
2554 latch_mode &= ulint(~BTR_ESTIMATE);
2555
2556 ut_ad(level != ULINT_UNDEFINED);
2557
2558 bool s_latch_by_caller;
2559
2560 s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;
2561 latch_mode &= ulint(~BTR_ALREADY_S_LATCHED);
2562
2563 lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
2564
2565 ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL));
2566
2567 /* This function does not need to latch the left sibling of the leaf page */
2568 if (latch_mode == BTR_SEARCH_PREV) {
2569 latch_mode = BTR_SEARCH_LEAF;
2570 } else if (latch_mode == BTR_MODIFY_PREV) {
2571 latch_mode = BTR_MODIFY_LEAF;
2572 }
2573
2574 /* Store the position of the tree latch we push to mtr so that we
2575 know how to release it when we have latched the leaf node */
2576
2577 ulint savepoint = mtr_set_savepoint(mtr);
2578
2579 rw_lock_type_t upper_rw_latch;
2580
2581 switch (latch_mode) {
2582 case BTR_CONT_MODIFY_TREE:
2583 case BTR_CONT_SEARCH_TREE:
2584 upper_rw_latch = RW_NO_LATCH;
2585 break;
2586 case BTR_MODIFY_TREE:
2587 /* Most delete-intended operations are purges.
2588 Free blocks and read I/O bandwidth should be prioritized
2589 for them when the history list has grown huge. */
2590 if (lock_intention == BTR_INTENTION_DELETE
2591 && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
2592 && buf_pool.n_pend_reads) {
2593 mtr_x_lock_index(index, mtr);
2594 } else {
2595 mtr_sx_lock_index(index, mtr);
2596 }
2597 upper_rw_latch = RW_X_LATCH;
2598 break;
2599 default:
2600 ut_ad(!s_latch_by_caller
2601 || mtr->memo_contains_flagged(&index->lock,
2602 MTR_MEMO_SX_LOCK
2603 | MTR_MEMO_S_LOCK));
2604 if (!srv_read_only_mode) {
2605 if (!s_latch_by_caller) {
2606 /* BTR_SEARCH_TREE is intended to be used with
2607 BTR_ALREADY_S_LATCHED */
2608 ut_ad(latch_mode != BTR_SEARCH_TREE);
2609
2610 mtr_s_lock_index(index, mtr);
2611 }
2612 upper_rw_latch = RW_S_LATCH;
2613 } else {
2614 upper_rw_latch = RW_NO_LATCH;
2615 }
2616 }
2617
2618 const rw_lock_type_t root_leaf_rw_latch = btr_cur_latch_for_root_leaf(
2619 latch_mode);
2620
2621 page_cursor = btr_cur_get_page_cur(cursor);
2622 cursor->index = index;
2623
2624 page_id_t page_id(index->table->space_id, index->page);
2625 const ulint zip_size = index->table->space->zip_size();
2626
2627 if (root_leaf_rw_latch == RW_X_LATCH) {
2628 node_ptr_max_size = btr_node_ptr_max_size(index);
2629 }
2630
2631 height = ULINT_UNDEFINED;
2632
2633 for (;;) {
2634 ut_ad(n_blocks < BTR_MAX_LEVELS);
2635 tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
2636
2637 const ulint rw_latch = height
2638 && (latch_mode != BTR_MODIFY_TREE || height == level)
2639 ? upper_rw_latch : RW_NO_LATCH;
2640 buf_block_t* block = buf_page_get_gen(page_id, zip_size,
2641 rw_latch, NULL, BUF_GET,
2642 file, line, mtr, &err,
2643 height == 0
2644 && !index->is_clust());
2645 ut_ad((block != NULL) == (err == DB_SUCCESS));
2646 tree_blocks[n_blocks] = block;
2647
2648 if (err != DB_SUCCESS) {
2649 if (err == DB_DECRYPTION_FAILED) {
2650 ib_push_warning((void *)NULL,
2651 DB_DECRYPTION_FAILED,
2652 "Table %s is encrypted but encryption service or"
2653 " used key_id is not available. "
2654 " Can't continue reading table.",
2655 index->table->name.m_name);
2656 index->table->file_unreadable = true;
2657 }
2658
2659 goto exit_loop;
2660 }
2661
2662 const page_t* page = buf_block_get_frame(block);
2663
2664 if (height == ULINT_UNDEFINED
2665 && page_is_leaf(page)
2666 && rw_latch != RW_NO_LATCH
2667 && rw_latch != root_leaf_rw_latch) {
2668 /* We should retry fetching the page, because the root page
2669 is latched differently from a leaf page. */
2670 ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
2671 ut_ad(rw_latch == RW_S_LATCH);
2672
2673 ut_ad(n_blocks == 0);
2674 mtr_release_block_at_savepoint(
2675 mtr, tree_savepoints[n_blocks],
2676 tree_blocks[n_blocks]);
2677
2678 upper_rw_latch = root_leaf_rw_latch;
2679 continue;
2680 }
2681
2682 ut_ad(fil_page_index_page_check(page));
2683 ut_ad(index->id == btr_page_get_index_id(page));
2684
2685 if (height == ULINT_UNDEFINED) {
2686 /* We are in the root node */
2687
2688 height = btr_page_get_level(page);
2689 root_height = height;
2690 ut_a(height >= level);
2691 } else {
2692 /* TODO: flag the index corrupted if this fails */
2693 ut_ad(height == btr_page_get_level(page));
2694 }
2695
2696 if (height == 0) {
2697 if (rw_latch == RW_NO_LATCH) {
2698 btr_cur_latch_leaves(block, latch_mode,
2699 cursor, mtr);
2700 }
2701
2702 /* In versions <= 3.23.52 we had forgotten to
2703 release the tree latch here. If in an index
2704 scan we had to scan far to find a record
2705 visible to the current transaction, that could
2706 starve others waiting for the tree latch. */
2707
2708 switch (latch_mode) {
2709 case BTR_MODIFY_TREE:
2710 case BTR_CONT_MODIFY_TREE:
2711 case BTR_CONT_SEARCH_TREE:
2712 break;
2713 default:
2714 if (UNIV_UNLIKELY(srv_read_only_mode)) {
2715 break;
2716 }
2717 if (!s_latch_by_caller) {
2718 /* Release the tree s-latch */
2719 mtr_release_s_latch_at_savepoint(
2720 mtr, savepoint, &index->lock);
2721 }
2722
2723 /* release upper blocks */
2724 for (; n_releases < n_blocks; n_releases++) {
2725 mtr_release_block_at_savepoint(
2726 mtr,
2727 tree_savepoints[n_releases],
2728 tree_blocks[n_releases]);
2729 }
2730 }
2731 } else if (height == level /* height != 0 */
2732 && UNIV_LIKELY(!srv_read_only_mode)) {
2733 /* We already have the block latched. */
2734 ut_ad(latch_mode == BTR_SEARCH_TREE);
2735 ut_ad(s_latch_by_caller);
2736 ut_ad(upper_rw_latch == RW_S_LATCH);
2737 ut_ad(mtr->memo_contains_flagged(block,
2738 MTR_MEMO_PAGE_S_FIX));
2739
2740 if (s_latch_by_caller) {
2741 /* to exclude tree-modifying operations,
2742 the index must be sx-latched */
2743 ut_ad(mtr->memo_contains(index->lock,
2744 MTR_MEMO_SX_LOCK));
2745 /* because the index sx-latch is held,
2746 the upper blocks can be released */
2747 for (; n_releases < n_blocks; n_releases++) {
2748 mtr_release_block_at_savepoint(
2749 mtr,
2750 tree_savepoints[n_releases],
2751 tree_blocks[n_releases]);
2752 }
2753 }
2754 }
2755
2756 if (from_left) {
2757 page_cur_set_before_first(block, page_cursor);
2758 } else {
2759 page_cur_set_after_last(block, page_cursor);
2760 }
2761
2762 if (height == level) {
2763 if (estimate) {
2764 btr_cur_add_path_info(cursor, height,
2765 root_height);
2766 }
2767
2768 break;
2769 }
2770
2771 ut_ad(height > 0);
2772
2773 if (from_left) {
2774 page_cur_move_to_next(page_cursor);
2775 } else {
2776 page_cur_move_to_prev(page_cursor);
2777 }
2778
2779 if (estimate) {
2780 btr_cur_add_path_info(cursor, height, root_height);
2781 }
2782
2783 height--;
2784
2785 node_ptr = page_cur_get_rec(page_cursor);
2786 offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
2787 0, ULINT_UNDEFINED, &heap);
2788
2789 /* If the record is the first or the last in the page and
2790 we hold a pessimistic delete intention, it might require a
2791 node_ptr insert at the upper level. We should change the intention and retry.
2792 */
2793 if (latch_mode == BTR_MODIFY_TREE
2794 && btr_cur_need_opposite_intention(
2795 page, lock_intention, node_ptr)) {
2796
2797 ut_ad(upper_rw_latch == RW_X_LATCH);
2798 /* release all blocks */
2799 for (; n_releases <= n_blocks; n_releases++) {
2800 mtr_release_block_at_savepoint(
2801 mtr, tree_savepoints[n_releases],
2802 tree_blocks[n_releases]);
2803 }
2804
2805 lock_intention = BTR_INTENTION_BOTH;
2806
2807 page_id.set_page_no(dict_index_get_page(index));
2808
2809 height = ULINT_UNDEFINED;
2810
2811 n_blocks = 0;
2812 n_releases = 0;
2813
2814 continue;
2815 }
2816
2817 if (latch_mode == BTR_MODIFY_TREE
2818 && !btr_cur_will_modify_tree(
2819 cursor->index, page, lock_intention, node_ptr,
2820 node_ptr_max_size, zip_size, mtr)) {
2821 ut_ad(upper_rw_latch == RW_X_LATCH);
2822 ut_ad(n_releases <= n_blocks);
2823
2824 /* we can release upper blocks */
2825 for (; n_releases < n_blocks; n_releases++) {
2826 if (n_releases == 0) {
2827 /* do not release the root page,
2828 to keep it pinned to the same block */
2829 continue;
2830 }
2831
2832 /* release unused blocks to unpin */
2833 mtr_release_block_at_savepoint(
2834 mtr, tree_savepoints[n_releases],
2835 tree_blocks[n_releases]);
2836 }
2837 }
2838
2839 if (height == level
2840 && latch_mode == BTR_MODIFY_TREE) {
2841 ut_ad(upper_rw_latch == RW_X_LATCH);
2842 /* sx-latch the root page if it was already released;
2843 it contains the segment header. */
2844 if (n_releases > 0) {
2845 mtr_block_sx_latch_at_savepoint(
2846 mtr, tree_savepoints[0],
2847 tree_blocks[0]);
2848 }
2849
2850 /* x-latch the branch blocks not released yet. */
2851 for (ulint i = n_releases; i <= n_blocks; i++) {
2852 mtr_block_x_latch_at_savepoint(
2853 mtr, tree_savepoints[i],
2854 tree_blocks[i]);
2855 }
2856 }
2857
2858 /* Go to the child node */
2859 page_id.set_page_no(
2860 btr_node_ptr_get_child_page_no(node_ptr, offsets));
2861
2862 n_blocks++;
2863 }
2864
2865 exit_loop:
2866 if (heap) {
2867 mem_heap_free(heap);
2868 }
2869
2870 return err;
2871 }
2872
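/* A hedged usage sketch (#if 0, so not part of the build): positioning
a cursor before the first user record on the leaf level. Callers are
assumed to go through a btr_cur_open_at_index_side() wrapper macro that
supplies __FILE__ and __LINE__, following the _func convention in this
codebase; error handling is elided. */
#if 0
	mtr_t		mtr;
	btr_cur_t	cursor;

	mtr.start();

	if (btr_cur_open_at_index_side(true, index, BTR_SEARCH_LEAF,
				       &cursor, 0, &mtr) == DB_SUCCESS) {
		/* The page cursor is now before the first record of the
		leftmost leaf page; step onto the first user record. */
		page_cur_move_to_next(btr_cur_get_page_cur(&cursor));
	}

	mtr.commit();
#endif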
2873 /**********************************************************************//**
2874 Positions a cursor at a randomly chosen position within a B-tree.
2875 @return true if the index is available and we have put the cursor, false
2876 if the index is unavailable */
2877 bool
2878 btr_cur_open_at_rnd_pos_func(
2879 /*=========================*/
2880 dict_index_t* index, /*!< in: index */
2881 ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
2882 btr_cur_t* cursor, /*!< in/out: B-tree cursor */
2883 const char* file, /*!< in: file name */
2884 unsigned line, /*!< in: line where called */
2885 mtr_t* mtr) /*!< in: mtr */
2886 {
2887 page_cur_t* page_cursor;
2888 ulint node_ptr_max_size = srv_page_size / 2;
2889 ulint height;
2890 rec_t* node_ptr;
2891 btr_intention_t lock_intention;
2892 buf_block_t* tree_blocks[BTR_MAX_LEVELS];
2893 ulint tree_savepoints[BTR_MAX_LEVELS];
2894 ulint n_blocks = 0;
2895 ulint n_releases = 0;
2896 mem_heap_t* heap = NULL;
2897 rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
2898 rec_offs* offsets = offsets_;
2899 rec_offs_init(offsets_);
2900
2901 ut_ad(!index->is_spatial());
2902
2903 lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
2904
2905 ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL));
2906
2907 ulint savepoint = mtr_set_savepoint(mtr);
2908
2909 rw_lock_type_t upper_rw_latch;
2910
2911 switch (latch_mode) {
2912 case BTR_MODIFY_TREE:
2913 /* Most delete-intended operations are purges.
2914 Free blocks and read I/O bandwidth should be prioritized
2915 for them when the history list has grown huge. */
2916 if (lock_intention == BTR_INTENTION_DELETE
2917 && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
2918 && buf_pool.n_pend_reads) {
2919 mtr_x_lock_index(index, mtr);
2920 } else {
2921 mtr_sx_lock_index(index, mtr);
2922 }
2923 upper_rw_latch = RW_X_LATCH;
2924 break;
2925 case BTR_SEARCH_PREV:
2926 case BTR_MODIFY_PREV:
2927 /* This function does not support taking the
2928 left uncle page latch when a left leaf
2929 page latch would be needed. */
2930 case BTR_SEARCH_TREE:
2931 case BTR_CONT_MODIFY_TREE:
2932 case BTR_CONT_SEARCH_TREE:
2933 ut_ad(0);
2934 /* fall through */
2935 default:
2936 if (!srv_read_only_mode) {
2937 mtr_s_lock_index(index, mtr);
2938 upper_rw_latch = RW_S_LATCH;
2939 } else {
2940 upper_rw_latch = RW_NO_LATCH;
2941 }
2942 }
2943
2944 DBUG_EXECUTE_IF("test_index_is_unavailable",
2945 return(false););
2946
2947 if (index->page == FIL_NULL) {
2948 /* Since we did not hold the index lock until just now, the
2949 index could have been modified by others; for example, if this
2950 is a statistics updater for a referenced table, the index could
2951 have been marked as unavailable by 'DROP TABLE' in the meantime,
2952 since no lock is held for the statistics updater */
2953 return(false);
2954 }
2955
2956 const rw_lock_type_t root_leaf_rw_latch = btr_cur_latch_for_root_leaf(
2957 latch_mode);
2958
2959 page_cursor = btr_cur_get_page_cur(cursor);
2960 cursor->index = index;
2961
2962 page_id_t page_id(index->table->space_id, index->page);
2963 const ulint zip_size = index->table->space->zip_size();
2964 dberr_t err = DB_SUCCESS;
2965
2966 if (root_leaf_rw_latch == RW_X_LATCH) {
2967 node_ptr_max_size = btr_node_ptr_max_size(index);
2968 }
2969
2970 height = ULINT_UNDEFINED;
2971
2972 for (;;) {
2973 page_t* page;
2974
2975 ut_ad(n_blocks < BTR_MAX_LEVELS);
2976 tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
2977
2978 const rw_lock_type_t rw_latch = height
2979 && latch_mode != BTR_MODIFY_TREE
2980 ? upper_rw_latch : RW_NO_LATCH;
2981 buf_block_t* block = buf_page_get_gen(page_id, zip_size,
2982 rw_latch, NULL, BUF_GET,
2983 file, line, mtr, &err,
2984 height == 0
2985 && !index->is_clust());
2986 tree_blocks[n_blocks] = block;
2987
2988 ut_ad((block != NULL) == (err == DB_SUCCESS));
2989
2990 if (err != DB_SUCCESS) {
2991 if (err == DB_DECRYPTION_FAILED) {
2992 ib_push_warning((void *)NULL,
2993 DB_DECRYPTION_FAILED,
2994 "Table %s is encrypted but encryption service or"
2995 " used key_id is not available. "
2996 " Can't continue reading table.",
2997 index->table->name.m_name);
2998 index->table->file_unreadable = true;
2999 }
3000
3001 break;
3002 }
3003
3004 page = buf_block_get_frame(block);
3005
3006 if (height == ULINT_UNDEFINED
3007 && page_is_leaf(page)
3008 && rw_latch != RW_NO_LATCH
3009 && rw_latch != root_leaf_rw_latch) {
3010 /* We should retry fetching the page, because the root page
3011 is latched differently from a leaf page. */
3012 ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
3013 ut_ad(rw_latch == RW_S_LATCH);
3014
3015 ut_ad(n_blocks == 0);
3016 mtr_release_block_at_savepoint(
3017 mtr, tree_savepoints[n_blocks],
3018 tree_blocks[n_blocks]);
3019
3020 upper_rw_latch = root_leaf_rw_latch;
3021 continue;
3022 }
3023
3024 ut_ad(fil_page_index_page_check(page));
3025 ut_ad(index->id == btr_page_get_index_id(page));
3026
3027 if (height == ULINT_UNDEFINED) {
3028 /* We are in the root node */
3029
3030 height = btr_page_get_level(page);
3031 }
3032
3033 if (height == 0) {
3034 if (rw_latch == RW_NO_LATCH
3035 || srv_read_only_mode) {
3036 btr_cur_latch_leaves(block, latch_mode, cursor,
3037 mtr);
3038 }
3039
3040 /* btr_cur_open_at_index_side_func() and
3041 btr_cur_search_to_nth_level() release
3042 tree s-latch here.*/
3043 switch (latch_mode) {
3044 case BTR_MODIFY_TREE:
3045 case BTR_CONT_MODIFY_TREE:
3046 case BTR_CONT_SEARCH_TREE:
3047 break;
3048 default:
3049 /* Release the tree s-latch */
3050 if (!srv_read_only_mode) {
3051 mtr_release_s_latch_at_savepoint(
3052 mtr, savepoint,
3053 dict_index_get_lock(index));
3054 }
3055
3056 /* release upper blocks */
3057 for (; n_releases < n_blocks; n_releases++) {
3058 mtr_release_block_at_savepoint(
3059 mtr,
3060 tree_savepoints[n_releases],
3061 tree_blocks[n_releases]);
3062 }
3063 }
3064 }
3065
3066 page_cur_open_on_rnd_user_rec(block, page_cursor);
3067
3068 if (height == 0) {
3069
3070 break;
3071 }
3072
3073 ut_ad(height > 0);
3074
3075 height--;
3076
3077 node_ptr = page_cur_get_rec(page_cursor);
3078 offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
3079 0, ULINT_UNDEFINED, &heap);
3080
3081 /* If the record is the first or the last in the page and
3082 we hold a pessimistic delete intention, it might require a
3083 node_ptr insert at the upper level. We should change the intention and retry.
3084 */
3085 if (latch_mode == BTR_MODIFY_TREE
3086 && btr_cur_need_opposite_intention(
3087 page, lock_intention, node_ptr)) {
3088
3089 ut_ad(upper_rw_latch == RW_X_LATCH);
3090 /* release all blocks */
3091 for (; n_releases <= n_blocks; n_releases++) {
3092 mtr_release_block_at_savepoint(
3093 mtr, tree_savepoints[n_releases],
3094 tree_blocks[n_releases]);
3095 }
3096
3097 lock_intention = BTR_INTENTION_BOTH;
3098
3099 page_id.set_page_no(dict_index_get_page(index));
3100
3101 height = ULINT_UNDEFINED;
3102
3103 n_blocks = 0;
3104 n_releases = 0;
3105
3106 continue;
3107 }
3108
3109 if (latch_mode == BTR_MODIFY_TREE
3110 && !btr_cur_will_modify_tree(
3111 cursor->index, page, lock_intention, node_ptr,
3112 node_ptr_max_size, zip_size, mtr)) {
3113 ut_ad(upper_rw_latch == RW_X_LATCH);
3114 ut_ad(n_releases <= n_blocks);
3115
3116 /* we can release upper blocks */
3117 for (; n_releases < n_blocks; n_releases++) {
3118 if (n_releases == 0) {
3119 /* do not release the root page,
3120 to keep it pinned to the same block */
3121 continue;
3122 }
3123
3124 /* release unused blocks to unpin */
3125 mtr_release_block_at_savepoint(
3126 mtr, tree_savepoints[n_releases],
3127 tree_blocks[n_releases]);
3128 }
3129 }
3130
3131 if (height == 0
3132 && latch_mode == BTR_MODIFY_TREE) {
3133 ut_ad(upper_rw_latch == RW_X_LATCH);
3134 /* sx-latch the root page if it was already released;
3135 it contains the segment header. */
3136 if (n_releases > 0) {
3137 mtr_block_sx_latch_at_savepoint(
3138 mtr, tree_savepoints[0],
3139 tree_blocks[0]);
3140 }
3141
3142 /* x-latch the branch blocks not released yet. */
3143 for (ulint i = n_releases; i <= n_blocks; i++) {
3144 mtr_block_x_latch_at_savepoint(
3145 mtr, tree_savepoints[i],
3146 tree_blocks[i]);
3147 }
3148 }
3149
3150 /* Go to the child node */
3151 page_id.set_page_no(
3152 btr_node_ptr_get_child_page_no(node_ptr, offsets));
3153
3154 n_blocks++;
3155 }
3156
3157 if (UNIV_LIKELY_NULL(heap)) {
3158 mem_heap_free(heap);
3159 }
3160
3161 return err == DB_SUCCESS;
3162 }
3163
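/* A hedged usage sketch (#if 0, so not part of the build): sampling a
few random leaf records, e.g. for statistics estimation. A
btr_cur_open_at_rnd_pos() wrapper macro supplying __FILE__ and
__LINE__ is assumed, and n_samples is a hypothetical count; error
handling is elided. */
#if 0
	for (ulint i = 0; i < n_samples; i++) {
		mtr_t		mtr;
		btr_cur_t	cursor;

		mtr.start();

		if (btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF,
					    &cursor, &mtr)) {
			const rec_t* rec = btr_cur_get_rec(&cursor);
			/* Inspect rec while the mtr keeps the page
			latched; rec becomes invalid after commit. */
		}

		mtr.commit();
	}
#endif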
3164 /*==================== B-TREE INSERT =========================*/
3165
3166 /*************************************************************//**
3167 Inserts a record if there is enough space, or if enough space can
3168 be freed by reorganizing. Differs from btr_cur_optimistic_insert because
3169 no heuristic is applied as to whether it pays to use CPU time for
3170 reorganizing the page.
3171
3172 IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
3173 if this is a compressed leaf page in a secondary index.
3174 This has to be done either within the same mini-transaction,
3175 or by invoking ibuf_reset_free_bits() before mtr_commit().
3176
3177 @return pointer to inserted record on success, else NULL */
3178 static MY_ATTRIBUTE((nonnull, warn_unused_result))
3179 rec_t*
3180 btr_cur_insert_if_possible(
3181 /*=======================*/
3182 btr_cur_t* cursor, /*!< in: cursor on page after which to insert;
3183 cursor stays valid */
3184 const dtuple_t* tuple, /*!< in: tuple to insert; the size info need not
3185 have been stored to tuple */
3186 rec_offs** offsets,/*!< out: offsets on *rec */
3187 mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
3188 ulint n_ext, /*!< in: number of externally stored columns */
3189 mtr_t* mtr) /*!< in/out: mini-transaction */
3190 {
3191 page_cur_t* page_cursor;
3192 rec_t* rec;
3193
3194 ut_ad(dtuple_check_typed(tuple));
3195
3196 ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
3197 MTR_MEMO_PAGE_X_FIX));
3198 page_cursor = btr_cur_get_page_cur(cursor);
3199
3200 /* Now, try the insert */
3201 rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
3202 offsets, heap, n_ext, mtr);
3203
3204 /* If the record did not fit, reorganize.
3205 For compressed pages, page_cur_tuple_insert()
3206 attempted this already. */
3207 if (!rec && !page_cur_get_page_zip(page_cursor)
3208 && btr_page_reorganize(page_cursor, cursor->index, mtr)) {
3209 rec = page_cur_tuple_insert(
3210 page_cursor, tuple, cursor->index,
3211 offsets, heap, n_ext, mtr);
3212 }
3213
3214 ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
3215 return(rec);
3216 }
3217
3218 /*************************************************************//**
3219 For an insert, checks the locks and does the undo logging if desired.
3220 @return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
3221 UNIV_INLINE MY_ATTRIBUTE((warn_unused_result, nonnull(2,3,5,6)))
3222 dberr_t
3223 btr_cur_ins_lock_and_undo(
3224 /*======================*/
3225 ulint flags, /*!< in: undo logging and locking flags: if
3226 not zero, the parameters index and thr
3227 should be specified */
3228 btr_cur_t* cursor, /*!< in: cursor on page after which to insert */
3229 dtuple_t* entry, /*!< in/out: entry to insert */
3230 que_thr_t* thr, /*!< in: query thread or NULL */
3231 mtr_t* mtr, /*!< in/out: mini-transaction */
3232 bool* inherit)/*!< out: true if the newly inserted record should
3233 possibly inherit LOCK_GAP type locks from the
3234 successor record */
3235 {
3236 dict_index_t* index;
3237 dberr_t err = DB_SUCCESS;
3238 rec_t* rec;
3239 roll_ptr_t roll_ptr;
3240
3241 /* Check if we have to wait for a lock: enqueue an explicit lock
3242 request if yes */
3243
3244 rec = btr_cur_get_rec(cursor);
3245 index = cursor->index;
3246
3247 ut_ad(!dict_index_is_online_ddl(index)
3248 || dict_index_is_clust(index)
3249 || (flags & BTR_CREATE_FLAG));
3250 ut_ad(mtr->is_named_space(index->table->space));
3251
3252 /* Check if there is a predicate or GAP lock preventing the insertion */
3253 if (!(flags & BTR_NO_LOCKING_FLAG)) {
3254 const unsigned type = index->type;
3255 if (UNIV_UNLIKELY(type & DICT_SPATIAL)) {
3256 lock_prdt_t prdt;
3257 rtr_mbr_t mbr;
3258
3259 rtr_get_mbr_from_tuple(entry, &mbr);
3260
3261 /* Use an on-stack MBR variable to test whether a lock is
3262 needed. If so, the predicate (MBR) will be allocated
3263 from the lock heap in lock_prdt_insert_check_and_lock() */
3264 lock_init_prdt_from_mbr(
3265 &prdt, &mbr, 0, NULL);
3266
3267 err = lock_prdt_insert_check_and_lock(
3268 flags, rec, btr_cur_get_block(cursor),
3269 index, thr, mtr, &prdt);
3270 *inherit = false;
3271 } else {
3272 #ifdef WITH_WSREP
3273 trx_t* trx= thr_get_trx(thr);
3274 /* If a transaction scanning a unique secondary
3275 key runs in a wsrep high-priority thread (brute
3276 force), the scan may involve GAP-locking in the
3277 index. As this locking also happens when applying
3278 replication events in high-priority applier
3279 threads, there is a probability of lock conflicts
3280 between two wsrep high-priority threads. To avoid
3281 such GAP-locking, we mark here that this
3282 transaction is using a unique key scan. */
3284 if ((type & (DICT_CLUSTERED | DICT_UNIQUE)) == DICT_UNIQUE
3285 && trx->is_wsrep()
3286 && wsrep_thd_is_BF(trx->mysql_thd, false)) {
3287 trx->wsrep_UK_scan= true;
3288 }
3289 #endif /* WITH_WSREP */
3290 err = lock_rec_insert_check_and_lock(
3291 flags, rec, btr_cur_get_block(cursor),
3292 index, thr, mtr, inherit);
3293 #ifdef WITH_WSREP
3294 trx->wsrep_UK_scan= false;
3295 #endif /* WITH_WSREP */
3296 }
3297 }
3298
3299 if (err != DB_SUCCESS
3300 || !(~flags | (BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG))
3301 || !dict_index_is_clust(index) || dict_index_is_ibuf(index)) {
3302
3303 return(err);
3304 }
3305
3306 if (flags & BTR_NO_UNDO_LOG_FLAG) {
3307 roll_ptr = roll_ptr_t(1) << ROLL_PTR_INSERT_FLAG_POS;
3308 if (!(flags & BTR_KEEP_SYS_FLAG)) {
3309 upd_sys:
3310 dfield_t* r = dtuple_get_nth_field(
3311 entry, index->db_roll_ptr());
3312 ut_ad(r->len == DATA_ROLL_PTR_LEN);
3313 trx_write_roll_ptr(static_cast<byte*>(r->data),
3314 roll_ptr);
3315 }
3316 } else {
3317 err = trx_undo_report_row_operation(thr, index, entry,
3318 NULL, 0, NULL, NULL,
3319 &roll_ptr);
3320 if (err == DB_SUCCESS) {
3321 goto upd_sys;
3322 }
3323 }
3324
3325 return(err);
3326 }
3327
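/* A hedged sketch (#if 0, so not part of the build) of the DB_ROLL_PTR
encoding behind the ROLL_PTR_INSERT_FLAG_POS shift used in
btr_cur_ins_lock_and_undo() above: a 7-byte (56-bit) roll pointer is
assumed to pack an insert flag, a rollback segment id, an undo log page
number and a byte offset, with the insert flag in the topmost bit. The
exact bit positions should be verified against trx0types.h. */
#if 0
#include <cstdint>

/* Compose a roll pointer value from its components. */
static uint64_t make_roll_ptr(bool is_insert, uint64_t rseg_id,
			      uint64_t page_no, uint64_t offset)
{
	return (uint64_t(is_insert) << 55)	/* insert flag */
		| (rseg_id << 48)		/* rollback segment id */
		| (page_no << 16)		/* undo log page number */
		| offset;			/* byte offset in that page */
}
#endif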
3328 /**
3329 Prefetch siblings of the leaf for the pessimistic operation.
3330 @param block leaf page
3331 @param index index of the page */
3332 static void btr_cur_prefetch_siblings(const buf_block_t *block,
3333 const dict_index_t *index)
3334 {
3335 ut_ad(page_is_leaf(block->frame));
3336
3337 if (index->is_ibuf())
3338 return;
3339
3340 const page_t *page= block->frame;
3341 uint32_t prev= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_PREV));
3342 uint32_t next= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_NEXT));
3343
3344 fil_space_t *space= index->table->space;
3345
3346 if (prev == FIL_NULL);
3347 else if (space->acquire())
3348 buf_read_page_background(space, page_id_t(space->id, prev),
3349 block->zip_size());
3350 if (next == FIL_NULL);
3351 else if (space->acquire())
3352 buf_read_page_background(space, page_id_t(space->id, next),
3353 block->zip_size());
3354 }
3355
3356 /*************************************************************//**
3357 Tries to perform an insert to a page in an index tree, next to cursor.
3358 It is assumed that mtr holds an x-latch on the page. The operation does
3359 not succeed if there is too little space on the page. If there is just
3360 one record on the page, the insert will always succeed; this is to
3361 prevent trying to split a page with just one record.
3362 @return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
3363 dberr_t
3364 btr_cur_optimistic_insert(
3365 /*======================*/
3366 ulint flags, /*!< in: undo logging and locking flags: if not
3367 zero, the parameters index and thr should be
3368 specified */
3369 btr_cur_t* cursor, /*!< in: cursor on page after which to insert;
3370 cursor stays valid */
3371 rec_offs** offsets,/*!< out: offsets on *rec */
3372 mem_heap_t** heap, /*!< in/out: pointer to memory heap */
3373 dtuple_t* entry, /*!< in/out: entry to insert */
3374 rec_t** rec, /*!< out: pointer to inserted record if
3375 succeed */
3376 big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
3377 be stored externally by the caller */
3378 ulint n_ext, /*!< in: number of externally stored columns */
3379 que_thr_t* thr, /*!< in/out: query thread; can be NULL if
3380 !(~flags
3381 & (BTR_NO_LOCKING_FLAG
3382 | BTR_NO_UNDO_LOG_FLAG)) */
3383 mtr_t* mtr) /*!< in/out: mini-transaction;
3384 if this function returns DB_SUCCESS on
3385 a leaf page of a secondary index in a
3386 compressed tablespace, the caller must
3387 mtr_commit(mtr) before latching
3388 any further pages */
3389 {
3390 big_rec_t* big_rec_vec = NULL;
3391 dict_index_t* index;
3392 page_cur_t* page_cursor;
3393 buf_block_t* block;
3394 page_t* page;
3395 rec_t* dummy;
3396 bool leaf;
3397 bool reorg __attribute__((unused));
3398 bool inherit = true;
3399 ulint rec_size;
3400 dberr_t err;
3401
3402 ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG)));
3403 *big_rec = NULL;
3404
3405 block = btr_cur_get_block(cursor);
3406 page = buf_block_get_frame(block);
3407 index = cursor->index;
3408
3409 ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
3410 ut_ad(!dict_index_is_online_ddl(index)
3411 || dict_index_is_clust(index)
3412 || (flags & BTR_CREATE_FLAG));
3413 ut_ad(dtuple_check_typed(entry));
3414
3415 #ifdef HAVE_valgrind
3416 if (block->page.zip.data) {
3417 MEM_CHECK_DEFINED(page, srv_page_size);
3418 MEM_CHECK_DEFINED(block->page.zip.data, block->zip_size());
3419 }
3420 #endif /* HAVE_valgrind */
3421
3422 leaf = page_is_leaf(page);
3423
3424 if (UNIV_UNLIKELY(entry->is_alter_metadata())) {
3425 ut_ad(leaf);
3426 goto convert_big_rec;
3427 }
3428
3429 /* Calculate the record size when entry is converted to a record */
3430 rec_size = rec_get_converted_size(index, entry, n_ext);
3431
3432 if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
3433 dtuple_get_n_fields(entry),
3434 block->zip_size())) {
3435 convert_big_rec:
3436 /* The record is so big that we have to store some fields
3437 externally on separate database pages */
3438 big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);
3439
3440 if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
3441
3442 return(DB_TOO_BIG_RECORD);
3443 }
3444
3445 rec_size = rec_get_converted_size(index, entry, n_ext);
3446 }
3447
3448 if (block->page.zip.data && page_zip_is_too_big(index, entry)) {
3449 if (big_rec_vec != NULL) {
3450 dtuple_convert_back_big_rec(index, entry, big_rec_vec);
3451 }
3452
3453 return(DB_TOO_BIG_RECORD);
3454 }
3455
3456 LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page),
3457 goto fail);
3458
3459 if (block->page.zip.data && leaf
3460 && (page_get_data_size(page) + rec_size
3461 >= dict_index_zip_pad_optimal_page_size(index))) {
3462 /* If compression padding tells us that the insertion
3463 would result in a too densely packed page, which is
3464 likely to cause a compression failure, then do not
3465 attempt an optimistic insertion. */
3466 fail:
3467 err = DB_FAIL;
3468
3469 /* prefetch siblings of the leaf for the pessimistic
3470 operation, if the page is leaf. */
3471 if (page_is_leaf(page)) {
3472 btr_cur_prefetch_siblings(block, index);
3473 }
3474 fail_err:
3475
3476 if (big_rec_vec) {
3477 dtuple_convert_back_big_rec(index, entry, big_rec_vec);
3478 }
3479
3480 return(err);
3481 }
3482
3483 ulint max_size = page_get_max_insert_size_after_reorganize(page, 1);
3484 if (max_size < rec_size) {
3485 goto fail;
3486 }
3487
3488 const ulint n_recs = page_get_n_recs(page);
3489 if (UNIV_UNLIKELY(n_recs >= 8189)) {
3490 ut_ad(srv_page_size == 65536);
3491 goto fail;
3492 }
3493
3494 if (page_has_garbage(page)) {
3495 if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT
3496 && n_recs > 1
3497 && page_get_max_insert_size(page, 1) < rec_size) {
3498
3499 goto fail;
3500 }
3501 }
3502
3503 /* If there have been many consecutive inserts to the
3504 clustered index leaf page of an uncompressed table, check if
3505 we have to split the page to reserve enough free space for
3506 future updates of records. */
3507
3508 if (leaf && !block->page.zip.data && dict_index_is_clust(index)
3509 && page_get_n_recs(page) >= 2
3510 && dict_index_get_space_reserve() + rec_size > max_size
3511 && (btr_page_get_split_rec_to_right(cursor, &dummy)
3512 || btr_page_get_split_rec_to_left(cursor))) {
3513 goto fail;
3514 }
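
	/* Illustrative note (an assumption, not from the original
	source): dict_index_get_space_reserve() is believed to reserve
	srv_page_size / 16 bytes, i.e. 1024 bytes with the default
	16KiB page size, as headroom for future record updates. */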
3515
3516 page_cursor = btr_cur_get_page_cur(cursor);
3517
3518 DBUG_LOG("ib_cur",
3519 "insert " << index->name << " (" << index->id << ") by "
3520 << ib::hex(thr ? thr->graph->trx->id : 0)
3521 << ' ' << rec_printer(entry).str());
3522 DBUG_EXECUTE_IF("do_page_reorganize",
3523 btr_page_reorganize(page_cursor, index, mtr););
3524
3525 /* Now, try the insert */
3526 {
3527 const rec_t* page_cursor_rec = page_cur_get_rec(page_cursor);
3528
3529 /* Check locks and write to the undo log,
3530 if specified */
3531 err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
3532 thr, mtr, &inherit);
3533 if (err != DB_SUCCESS) {
3534 goto fail_err;
3535 }
3536
3537 #ifdef UNIV_DEBUG
3538 if (!(flags & BTR_CREATE_FLAG)
3539 && index->is_primary() && page_is_leaf(page)) {
3540 const dfield_t* trx_id = dtuple_get_nth_field(
3541 entry, dict_col_get_clust_pos(
3542 dict_table_get_sys_col(index->table,
3543 DATA_TRX_ID),
3544 index));
3545
3546 ut_ad(trx_id->len == DATA_TRX_ID_LEN);
3547 ut_ad(trx_id[1].len == DATA_ROLL_PTR_LEN);
3548 ut_ad(*static_cast<const byte*>
3549 (trx_id[1].data) & 0x80);
3550 if (flags & BTR_NO_UNDO_LOG_FLAG) {
3551 ut_ad(!memcmp(trx_id->data, reset_trx_id,
3552 DATA_TRX_ID_LEN));
3553 } else {
3554 ut_ad(thr->graph->trx->id);
3555 ut_ad(thr->graph->trx->id
3556 == trx_read_trx_id(
3557 static_cast<const byte*>(
3558 trx_id->data))
3559 || index->table->is_temporary());
3560 }
3561 }
3562 #endif
3563
3564 *rec = page_cur_tuple_insert(
3565 page_cursor, entry, index, offsets, heap,
3566 n_ext, mtr);
3567
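		/* page_cur_tuple_insert() may have reorganized the page;
		if it did, the record at the page cursor will have moved. */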
3568 reorg = page_cursor_rec != page_cur_get_rec(page_cursor);
3569 }
3570
3571 if (*rec) {
3572 } else if (block->page.zip.data) {
3573 ut_ad(!index->table->is_temporary());
3574 /* Reset the IBUF_BITMAP_FREE bits, because
3575 page_cur_tuple_insert() will have attempted page
3576 reorganize before failing. */
3577 if (leaf
3578 && !dict_index_is_clust(index)) {
3579 ibuf_reset_free_bits(block);
3580 }
3581
3582 goto fail;
3583 } else {
3584 ut_ad(!reorg);
3585
3586 /* If the record did not fit, reorganize */
3587 if (!btr_page_reorganize(page_cursor, index, mtr)) {
3588 ut_ad(0);
3589 goto fail;
3590 }
3591
3592 ut_ad(page_get_max_insert_size(page, 1) == max_size);
3593
3594 reorg = TRUE;
3595
3596 *rec = page_cur_tuple_insert(page_cursor, entry, index,
3597 offsets, heap, n_ext, mtr);
3598
3599 if (UNIV_UNLIKELY(!*rec)) {
3600 ib::fatal() << "Cannot insert tuple " << *entry
3601 << "into index " << index->name
3602 << " of table " << index->table->name
3603 << ". Max size: " << max_size;
3604 }
3605 }
3606
3607 #ifdef BTR_CUR_HASH_ADAPT
3608 if (!leaf) {
3609 # ifdef MYSQL_INDEX_DISABLE_AHI
3610 } else if (index->disable_ahi) {
3611 # endif
3612 } else if (entry->info_bits & REC_INFO_MIN_REC_FLAG) {
3613 ut_ad(entry->is_metadata());
3614 ut_ad(index->is_instant());
3615 ut_ad(flags == BTR_NO_LOCKING_FLAG);
3616 } else {
3617 rw_lock_t* ahi_latch = btr_search_sys.get_latch(*index);
3618 if (!reorg && cursor->flag == BTR_CUR_HASH) {
3619 btr_search_update_hash_node_on_insert(
3620 cursor, ahi_latch);
3621 } else {
3622 btr_search_update_hash_on_insert(cursor, ahi_latch);
3623 }
3624 }
3625 #endif /* BTR_CUR_HASH_ADAPT */
3626
3627 if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) {
3628
3629 lock_update_insert(block, *rec);
3630 }
3631
3632 if (leaf
3633 && !dict_index_is_clust(index)
3634 && !index->table->is_temporary()) {
3635 /* Update the free bits of the B-tree page in the
3636 insert buffer bitmap. */
3637
3638 /* The free bits in the insert buffer bitmap must
3639 never exceed the free space on a page. It is safe to
3640 decrement or reset the bits in the bitmap in a
3641 mini-transaction that is committed before the
3642 mini-transaction that affects the free space. */
3643
3644 /* It is unsafe to increment the bits in a separately
3645 committed mini-transaction, because in crash recovery,
3646 the free bits could momentarily be set too high. */
3647
3648 if (block->page.zip.data) {
3649 /* Update the bits in the same mini-transaction. */
3650 ibuf_update_free_bits_zip(block, mtr);
3651 } else {
3652 /* Decrement the bits in a separate
3653 mini-transaction. */
3654 ibuf_update_free_bits_if_full(
3655 block, max_size,
3656 rec_size + PAGE_DIR_SLOT_SIZE);
3657 }
3658 }
3659
3660 *big_rec = big_rec_vec;
3661
3662 return(DB_SUCCESS);
3663 }
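
/* Illustrative caller sketch (an assumption, following the usual pattern
in row0ins.cc rather than a verbatim quote): the optimistic variant above
is attempted first, and only a DB_FAIL result leads to the pessimistic
variant below, which may split pages:

	err = btr_cur_optimistic_insert(flags, cursor, &offsets, &heap,
					entry, &rec, &big_rec, n_ext,
					thr, &mtr);
	if (err == DB_FAIL) {
		err = btr_cur_pessimistic_insert(flags, cursor, &offsets,
						 &heap, entry, &rec,
						 &big_rec, n_ext, thr,
						 &mtr);
	}
*/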
3664
3665 /*************************************************************//**
3666 Performs an insert on a page of an index tree. It is assumed that mtr
3667 holds an x-latch on the tree and on the cursor page. If the insert is
3668 made on the leaf level, to avoid deadlocks, mtr must also own x-latches
3669 on the siblings of the page, if those siblings exist.
3670 @return DB_SUCCESS or error number */
3671 dberr_t
3672 btr_cur_pessimistic_insert(
3673 /*=======================*/
3674 ulint flags, /*!< in: undo logging and locking flags: if not
3675 zero, the parameter thr should be
3676 specified; if no undo logging is specified,
3677 then the caller must have reserved enough
3678 free extents in the file space so that the
3679 insertion will certainly succeed */
3680 btr_cur_t* cursor, /*!< in: cursor after which to insert;
3681 cursor stays valid */
3682 rec_offs** offsets,/*!< out: offsets on *rec */
3683 mem_heap_t** heap, /*!< in/out: pointer to memory heap
3684 that can be emptied */
3685 dtuple_t* entry, /*!< in/out: entry to insert */
3686 rec_t** rec, /*!< out: pointer to inserted record if
3687 succeed */
3688 big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
3689 be stored externally by the caller */
3690 ulint n_ext, /*!< in: number of externally stored columns */
3691 que_thr_t* thr, /*!< in/out: query thread; can be NULL if
3692 !(~flags
3693 & (BTR_NO_LOCKING_FLAG
3694 | BTR_NO_UNDO_LOG_FLAG)) */
3695 mtr_t* mtr) /*!< in/out: mini-transaction */
3696 {
3697 dict_index_t* index = cursor->index;
3698 big_rec_t* big_rec_vec = NULL;
3699 dberr_t err;
3700 bool inherit = false;
3701 bool success;
3702 uint32_t n_reserved = 0;
3703
3704 ut_ad(dtuple_check_typed(entry));
3705 ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG)));
3706
3707 *big_rec = NULL;
3708
3709 ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
3710 | MTR_MEMO_SX_LOCK));
3711 ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
3712 MTR_MEMO_PAGE_X_FIX));
3713 ut_ad(!dict_index_is_online_ddl(index)
3714 || dict_index_is_clust(index)
3715 || (flags & BTR_CREATE_FLAG));
3716
3717 cursor->flag = BTR_CUR_BINARY;
3718
3719 /* Check locks and write to undo log, if specified */
3720
3721 err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
3722 thr, mtr, &inherit);
3723
3724 if (err != DB_SUCCESS) {
3725
3726 return(err);
3727 }
3728
3729 if (!(flags & BTR_NO_UNDO_LOG_FLAG)) {
3730 /* First reserve enough free space for the file segments
3731 of the index tree, so that the insert will not fail because
3732 of lack of space */
3733
3734 uint32_t n_extents = uint32_t(cursor->tree_height / 16 + 3);
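
		/* Worked example (illustrative): with tree_height = 3 this
		reserves 3/16 + 3 = 3 extents; at a 16KiB page size an
		extent is 64 pages, so 192 pages are reserved for the
		possible page splits. */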
3735
3736 success = fsp_reserve_free_extents(&n_reserved,
3737 index->table->space,
3738 n_extents, FSP_NORMAL, mtr);
3739 if (!success) {
3740 return(DB_OUT_OF_FILE_SPACE);
3741 }
3742 }
3743
3744 if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
3745 index->table->not_redundant(),
3746 dtuple_get_n_fields(entry),
3747 btr_cur_get_block(cursor)->zip_size())
3748 || UNIV_UNLIKELY(entry->is_alter_metadata()
3749 && !dfield_is_ext(
3750 dtuple_get_nth_field(
3751 entry,
3752 index->first_user_field())))) {
3753 /* The record is so big that we have to store some fields
3754 externally on separate database pages */
3755
3756 if (UNIV_LIKELY_NULL(big_rec_vec)) {
3757 /* This should never happen, but we handle
3758 the situation in a robust manner. */
3759 ut_ad(0);
3760 dtuple_convert_back_big_rec(index, entry, big_rec_vec);
3761 }
3762
3763 big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);
3764
3765 if (big_rec_vec == NULL) {
3766
3767 index->table->space->release_free_extents(n_reserved);
3768 return(DB_TOO_BIG_RECORD);
3769 }
3770 }
3771
3772 if (dict_index_get_page(index)
3773 == btr_cur_get_block(cursor)->page.id().page_no()) {
3774
3775 /* The page is the root page */
3776 *rec = btr_root_raise_and_insert(
3777 flags, cursor, offsets, heap, entry, n_ext, mtr);
3778 } else {
3779 *rec = btr_page_split_and_insert(
3780 flags, cursor, offsets, heap, entry, n_ext, mtr);
3781 }
3782
3783 if (*rec == NULL && os_has_said_disk_full) {
3784 return(DB_OUT_OF_FILE_SPACE);
3785 }
3786
3787 ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec
3788 || dict_index_is_spatial(index));
3789
3790 if (!(flags & BTR_NO_LOCKING_FLAG)) {
3791 ut_ad(!index->table->is_temporary());
3792 if (dict_index_is_spatial(index)) {
3793 /* Do nothing */
3794 } else {
3795 /* The cursor might be moved to the other page
3796 and the max trx id field should be updated after
3797 the cursor was fixed. */
3798 if (!dict_index_is_clust(index)) {
3799 page_update_max_trx_id(
3800 btr_cur_get_block(cursor),
3801 btr_cur_get_page_zip(cursor),
3802 thr_get_trx(thr)->id, mtr);
3803 }
3804
3805 if (!page_rec_is_infimum(btr_cur_get_rec(cursor))
3806 || !page_has_prev(btr_cur_get_page(cursor))) {
3807 				/* After a split-and-insert,
3808 				lock_update_insert() must always be called. */
3809 inherit = true;
3810 }
3811 }
3812 }
3813
3814 if (!page_is_leaf(btr_cur_get_page(cursor))) {
3815 ut_ad(!big_rec_vec);
3816 } else {
3817 #ifdef BTR_CUR_HASH_ADAPT
3818 # ifdef MYSQL_INDEX_DISABLE_AHI
3819 if (index->disable_ahi); else
3820 # endif
3821 if (entry->info_bits & REC_INFO_MIN_REC_FLAG) {
3822 ut_ad(entry->is_metadata());
3823 ut_ad(index->is_instant());
3824 ut_ad(flags & BTR_NO_LOCKING_FLAG);
3825 ut_ad(!(flags & BTR_CREATE_FLAG));
3826 } else {
3827 btr_search_update_hash_on_insert(
3828 cursor, btr_search_sys.get_latch(*index));
3829 }
3830 #endif /* BTR_CUR_HASH_ADAPT */
3831 if (inherit && !(flags & BTR_NO_LOCKING_FLAG)) {
3832
3833 lock_update_insert(btr_cur_get_block(cursor), *rec);
3834 }
3835 }
3836
3837 index->table->space->release_free_extents(n_reserved);
3838 *big_rec = big_rec_vec;
3839
3840 return(DB_SUCCESS);
3841 }
3842
3843 /*==================== B-TREE UPDATE =========================*/
3844
3845 /*************************************************************//**
3846 For an update, checks the locks and does the undo logging.
3847 @return DB_SUCCESS, DB_WAIT_LOCK, or error number */
3848 UNIV_INLINE MY_ATTRIBUTE((warn_unused_result))
3849 dberr_t
3850 btr_cur_upd_lock_and_undo(
3851 /*======================*/
3852 ulint flags, /*!< in: undo logging and locking flags */
3853 btr_cur_t* cursor, /*!< in: cursor on record to update */
3854 const rec_offs* offsets,/*!< in: rec_get_offsets() on cursor */
3855 const upd_t* update, /*!< in: update vector */
3856 ulint cmpl_info,/*!< in: compiler info on secondary index
3857 updates */
3858 que_thr_t* thr, /*!< in: query thread
3859 (can be NULL if BTR_NO_LOCKING_FLAG) */
3860 mtr_t* mtr, /*!< in/out: mini-transaction */
3861 roll_ptr_t* roll_ptr)/*!< out: roll pointer */
3862 {
3863 dict_index_t* index;
3864 const rec_t* rec;
3865 dberr_t err;
3866
3867 ut_ad((thr != NULL) || (flags & BTR_NO_LOCKING_FLAG));
3868
3869 rec = btr_cur_get_rec(cursor);
3870 index = cursor->index;
3871
3872 ut_ad(rec_offs_validate(rec, index, offsets));
3873 ut_ad(mtr->is_named_space(index->table->space));
3874
3875 if (!dict_index_is_clust(index)) {
3876 ut_ad(dict_index_is_online_ddl(index)
3877 == !!(flags & BTR_CREATE_FLAG));
3878
3879 /* We do undo logging only when we update a clustered index
3880 record */
3881 return(lock_sec_rec_modify_check_and_lock(
3882 flags, btr_cur_get_block(cursor), rec,
3883 index, thr, mtr));
3884 }
3885
3886 /* Check if we have to wait for a lock: enqueue an explicit lock
3887 request if yes */
3888
3889 if (!(flags & BTR_NO_LOCKING_FLAG)) {
3890 err = lock_clust_rec_modify_check_and_lock(
3891 flags, btr_cur_get_block(cursor), rec, index,
3892 offsets, thr);
3893 if (err != DB_SUCCESS) {
3894 return(err);
3895 }
3896 }
3897
3898 /* Append the info about the update in the undo log */
3899
3900 return((flags & BTR_NO_UNDO_LOG_FLAG)
3901 ? DB_SUCCESS
3902 : trx_undo_report_row_operation(
3903 thr, index, NULL, update,
3904 cmpl_info, rec, offsets, roll_ptr));
3905 }
3906
3907 /** Write DB_TRX_ID,DB_ROLL_PTR to a clustered index entry.
3908 @param[in,out] entry clustered index entry
3909 @param[in] index clustered index
3910 @param[in] trx_id DB_TRX_ID
3911 @param[in] roll_ptr DB_ROLL_PTR */
3912 static void btr_cur_write_sys(
3913 dtuple_t* entry,
3914 const dict_index_t* index,
3915 trx_id_t trx_id,
3916 roll_ptr_t roll_ptr)
3917 {
3918 dfield_t* t = dtuple_get_nth_field(entry, index->db_trx_id());
3919 ut_ad(t->len == DATA_TRX_ID_LEN);
3920 trx_write_trx_id(static_cast<byte*>(t->data), trx_id);
3921 dfield_t* r = dtuple_get_nth_field(entry, index->db_roll_ptr());
3922 ut_ad(r->len == DATA_ROLL_PTR_LEN);
3923 trx_write_roll_ptr(static_cast<byte*>(r->data), roll_ptr);
3924 }
3925
3926 /** Update DB_TRX_ID, DB_ROLL_PTR in a clustered index record.
3927 @param[in,out] block clustered index leaf page
3928 @param[in,out] rec clustered index record
3929 @param[in] index clustered index
3930 @param[in] offsets rec_get_offsets(rec, index)
3931 @param[in] trx transaction
3932 @param[in] roll_ptr DB_ROLL_PTR value
3933 @param[in,out] mtr mini-transaction */
3934 static void btr_cur_upd_rec_sys(buf_block_t *block, rec_t *rec,
3935 dict_index_t *index, const rec_offs *offsets,
3936 const trx_t *trx, roll_ptr_t roll_ptr,
3937 mtr_t *mtr)
3938 {
3939 ut_ad(index->is_primary());
3940 ut_ad(rec_offs_validate(rec, index, offsets));
3941
3942 if (UNIV_LIKELY_NULL(block->page.zip.data))
3943 {
3944 page_zip_write_trx_id_and_roll_ptr(block, rec, offsets, index->db_trx_id(),
3945 trx->id, roll_ptr, mtr);
3946 return;
3947 }
3948
3949 ulint offset= index->trx_id_offset;
3950
3951 if (!offset)
3952 offset= row_get_trx_id_offset(index, offsets);
3953
3954 compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR);
3955
3956 /* During IMPORT the trx id in the record can be in the future, if
3957 the .ibd file is being imported from another instance. During IMPORT
3958 roll_ptr will be 0. */
3959 ut_ad(roll_ptr == 0 ||
3960 lock_check_trx_id_sanity(trx_read_trx_id(rec + offset),
3961 rec, index, offsets));
3962
3963 byte sys[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN];
3964
3965 trx_write_trx_id(sys, trx->id);
3966 trx_write_roll_ptr(sys + DATA_TRX_ID_LEN, roll_ptr);
3967
3968 ulint d= 0;
3969 const byte *src= nullptr;
3970 byte *dest= rec + offset;
3971 ulint len= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
3972
3973 if (UNIV_LIKELY(index->trx_id_offset))
3974 {
3975 const rec_t *prev= page_rec_get_prev_const(rec);
3976 if (UNIV_UNLIKELY(prev == rec))
3977 ut_ad(0);
3978 else if (page_rec_is_infimum(prev));
3979 else
3980 for (src= prev + offset; d < DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; d++)
3981 if (src[d] != sys[d])
3982 break;
3983 if (d > 6 && memcmp(dest, sys, d))
3984 {
3985 /* We save space by replacing a single record
3986
3987 WRITE,page_offset(dest),byte[13]
3988
3989 with two records:
3990
3991 MEMMOVE,page_offset(dest),d(1 byte),offset(1..3 bytes),
3992 WRITE|0x80,0,byte[13-d]
3993
3994 The single WRITE record would be x+13 bytes long, with x>2.
3995 The MEMMOVE record would be up to x+1+3 = x+4 bytes, and the
3996 second WRITE would be 1+1+13-d = 15-d bytes.
3997
3998 The total size is: x+13 versus x+4+15-d = x+19-d bytes.
3999 To save space, we must have d>6, that is, the complete DB_TRX_ID and
4000 the first byte(s) of DB_ROLL_PTR must match the previous record. */
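      /* Worked example (illustrative): with x = 3 and d = 7, the single
      WRITE record would be 3 + 13 = 16 bytes, while the MEMMOVE record
      plus the shortened WRITE would be at most 3 + 19 - 7 = 15 bytes,
      saving one byte. */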
4001 memcpy(dest, src, d);
4002 mtr->memmove(*block, page_offset(dest), page_offset(src), d);
4003 dest+= d;
4004 len-= d;
4005 /* DB_TRX_ID,DB_ROLL_PTR must be unique in each record when
4006 DB_TRX_ID refers to an active transaction. */
4007 ut_ad(len);
4008 }
4009 else
4010 d= 0;
4011 }
4012
4013 if (UNIV_LIKELY(len)) /* extra safety, to avoid corrupting the log */
4014 mtr->memcpy<mtr_t::MAYBE_NOP>(*block, dest, sys + d, len);
4015 }
4016
4017 /*************************************************************//**
4018 See if there is enough space in the page modification log to log
4019 an update-in-place.
4020 
4021 @retval false if out of space; IBUF_BITMAP_FREE will be reset
4022 outside mtr if the page was recompressed
4023 @retval true if there is enough space
4024
4025 IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is
4026 a secondary index leaf page. This has to be done either within the
4027 same mini-transaction, or by invoking ibuf_reset_free_bits() before
4028 mtr_commit(mtr). */
4029 bool
4030 btr_cur_update_alloc_zip_func(
4031 /*==========================*/
4032 page_zip_des_t* page_zip,/*!< in/out: compressed page */
4033 page_cur_t* cursor, /*!< in/out: B-tree page cursor */
4034 dict_index_t* index, /*!< in: the index corresponding to cursor */
4035 #ifdef UNIV_DEBUG
4036 rec_offs* offsets,/*!< in/out: offsets of the cursor record */
4037 #endif /* UNIV_DEBUG */
4038 ulint length, /*!< in: size needed */
4039 bool create, /*!< in: true=delete-and-insert,
4040 false=update-in-place */
4041 mtr_t* mtr) /*!< in/out: mini-transaction */
4042 {
4043
4044 /* Have a local copy of the variables as these can change
4045 dynamically. */
4046 const page_t* page = page_cur_get_page(cursor);
4047
4048 ut_ad(page_zip == page_cur_get_page_zip(cursor));
4049 ut_ad(!dict_index_is_ibuf(index));
4050 ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
4051
4052 if (page_zip_available(page_zip, dict_index_is_clust(index),
4053 length, create)) {
4054 return(true);
4055 }
4056
4057 if (!page_zip->m_nonempty && !page_has_garbage(page)) {
4058 /* The page has been freshly compressed, so
4059 reorganizing it will not help. */
4060 return(false);
4061 }
4062
4063 if (create && page_is_leaf(page)
4064 && (length + page_get_data_size(page)
4065 >= dict_index_zip_pad_optimal_page_size(index))) {
4066 return(false);
4067 }
4068
4069 if (!btr_page_reorganize(cursor, index, mtr)) {
4070 goto out_of_space;
4071 }
4072
4073 rec_offs_make_valid(page_cur_get_rec(cursor), index,
4074 page_is_leaf(page), offsets);
4075
4076 /* After recompressing a page, we must make sure that the free
4077 bits in the insert buffer bitmap will not exceed the free
4078 space on the page. Because this function will not attempt
4079 recompression unless page_zip_available() fails above, it is
4080 safe to reset the free bits if page_zip_available() fails
4081 again, below. The free bits can safely be reset in a separate
4082 mini-transaction. If page_zip_available() succeeds below, we
4083 can be sure that the btr_page_reorganize() above did not reduce
4084 the free space available on the page. */
4085
4086 if (page_zip_available(page_zip, dict_index_is_clust(index),
4087 length, create)) {
4088 return(true);
4089 }
4090
4091 out_of_space:
4092 ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
4093
4094 /* Out of space: reset the free bits. */
4095 if (!dict_index_is_clust(index)
4096 && !index->table->is_temporary()
4097 && page_is_leaf(page)) {
4098 ibuf_reset_free_bits(page_cur_get_block(cursor));
4099 }
4100
4101 return(false);
4102 }
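
/* Note: callers are expected to invoke this function through the
btr_cur_update_alloc_zip() wrapper macro, because the offsets parameter
exists only in debug builds (see the UNIV_DEBUG conditional in the
parameter list above). */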
4103
4104 /** Apply an update vector to a record. No field size changes are allowed.
4105
4106 This is usually invoked on a clustered index. The only use case for a
4107 secondary index is row_ins_sec_index_entry_by_modify() or its
4108 counterpart in ibuf_insert_to_index_page().
4109 @param[in,out] rec index record
4110 @param[in] index the index of the record
4111 @param[in] offsets rec_get_offsets(rec, index)
4112 @param[in] update update vector
4113 @param[in,out] block index page
4114 @param[in,out] mtr mini-transaction */
4115 void btr_cur_upd_rec_in_place(rec_t *rec, const dict_index_t *index,
4116 const rec_offs *offsets, const upd_t *update,
4117 buf_block_t *block, mtr_t *mtr)
4118 {
4119 ut_ad(rec_offs_validate(rec, index, offsets));
4120 ut_ad(!index->table->skip_alter_undo);
4121 ut_ad(!block->page.zip.data || index->table->not_redundant());
4122
4123 #ifdef UNIV_DEBUG
4124 if (rec_offs_comp(offsets)) {
4125 switch (rec_get_status(rec)) {
4126 case REC_STATUS_ORDINARY:
4127 break;
4128 case REC_STATUS_INSTANT:
4129 ut_ad(index->is_instant());
4130 break;
4131 case REC_STATUS_NODE_PTR:
4132 case REC_STATUS_INFIMUM:
4133 case REC_STATUS_SUPREMUM:
4134 ut_ad("wrong record status in update" == 0);
4135 }
4136 }
4137 #endif /* UNIV_DEBUG */
4138
4139 static_assert(REC_INFO_BITS_SHIFT == 0, "compatibility");
4140 if (UNIV_LIKELY_NULL(block->page.zip.data)) {
4141 ut_ad(rec_offs_comp(offsets));
4142 byte* info_bits = &rec[-REC_NEW_INFO_BITS];
4143 const bool flip_del_mark = (*info_bits ^ update->info_bits)
4144 & REC_INFO_DELETED_FLAG;
4145 *info_bits &= byte(~REC_INFO_BITS_MASK);
4146 *info_bits |= update->info_bits;
4147
4148 if (flip_del_mark) {
4149 page_zip_rec_set_deleted(block, rec, update->info_bits
4150 & REC_INFO_DELETED_FLAG, mtr);
4151 }
4152 } else {
4153 byte* info_bits = &rec[rec_offs_comp(offsets)
4154 ? -REC_NEW_INFO_BITS
4155 : -REC_OLD_INFO_BITS];
4156
4157 mtr->write<1,mtr_t::MAYBE_NOP>(*block, info_bits,
4158 (*info_bits
4159 & ~REC_INFO_BITS_MASK)
4160 | update->info_bits);
4161 }
4162
4163 for (ulint i = 0; i < update->n_fields; i++) {
4164 const upd_field_t* uf = upd_get_nth_field(update, i);
4165 if (upd_fld_is_virtual_col(uf) && !index->has_virtual()) {
4166 continue;
4167 }
4168 const ulint n = uf->field_no;
4169
4170 ut_ad(!dfield_is_ext(&uf->new_val)
4171 == !rec_offs_nth_extern(offsets, n));
4172 ut_ad(!rec_offs_nth_default(offsets, n));
4173
4174 if (UNIV_UNLIKELY(dfield_is_null(&uf->new_val))) {
4175 if (rec_offs_nth_sql_null(offsets, n)) {
4176 ut_ad(index->table->is_instant());
4177 ut_ad(n >= index->n_core_fields);
4178 continue;
4179 }
4180
4181 ut_ad(!index->table->not_redundant());
4182 switch (ulint size = rec_get_nth_field_size(rec, n)) {
4183 case 0:
4184 break;
4185 case 1:
4186 mtr->write<1,mtr_t::MAYBE_NOP>(
4187 *block,
4188 rec_get_field_start_offs(rec, n) + rec,
4189 0U);
4190 break;
4191 default:
4192 mtr->memset(
4193 block,
4194 page_offset(rec_get_field_start_offs(
4195 rec, n) + rec),
4196 size, 0);
4197 }
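			/* In the ROW_FORMAT=REDUNDANT record header, the
			field end offsets are stored in reverse order just
			before the record origin, one or two bytes each,
			and the most significant bit of an offset carries
			the SQL NULL flag. Set that flag for field n. */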
4198 ulint l = rec_get_1byte_offs_flag(rec)
4199 ? (n + 1) : (n + 1) * 2;
4200 byte* b = rec - REC_N_OLD_EXTRA_BYTES - l;
4201 compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8
4202 == REC_2BYTE_SQL_NULL_MASK);
4203 mtr->write<1>(*block, b,
4204 byte(*b | REC_1BYTE_SQL_NULL_MASK));
4205 continue;
4206 }
4207
4208 ulint len;
4209 byte* data = rec_get_nth_field(rec, offsets, n, &len);
4210 if (UNIV_LIKELY_NULL(block->page.zip.data)) {
4211 ut_ad(len == uf->new_val.len);
4212 memcpy(data, uf->new_val.data, len);
4213 continue;
4214 }
4215
4216 if (UNIV_UNLIKELY(len != uf->new_val.len)) {
4217 ut_ad(len == UNIV_SQL_NULL);
4218 ut_ad(!rec_offs_comp(offsets));
4219 len = uf->new_val.len;
4220 ut_ad(len == rec_get_nth_field_size(rec, n));
4221 ulint l = rec_get_1byte_offs_flag(rec)
4222 ? (n + 1) : (n + 1) * 2;
4223 byte* b = rec - REC_N_OLD_EXTRA_BYTES - l;
4224 compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8
4225 == REC_2BYTE_SQL_NULL_MASK);
4226 mtr->write<1>(*block, b,
4227 byte(*b & ~REC_1BYTE_SQL_NULL_MASK));
4228 }
4229
4230 if (len) {
4231 mtr->memcpy<mtr_t::MAYBE_NOP>(*block, data,
4232 uf->new_val.data, len);
4233 }
4234 }
4235
4236 if (UNIV_LIKELY_NULL(block->page.zip.data)) {
4237 page_zip_write_rec(block, rec, index, offsets, 0, mtr);
4238 }
4239 }
4240
4241 /*************************************************************//**
4242 Updates a record when the update causes no size changes in its fields.
4243 We assume here that the ordering fields of the record do not change.
4244 @return locking or undo log related error code, or
4245 @retval DB_SUCCESS on success
4246 @retval DB_ZIP_OVERFLOW if there is not enough space left
4247 on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
4248 dberr_t
4249 btr_cur_update_in_place(
4250 /*====================*/
4251 ulint flags, /*!< in: undo logging and locking flags */
4252 btr_cur_t* cursor, /*!< in: cursor on the record to update;
4253 cursor stays valid and positioned on the
4254 same record */
4255 rec_offs* offsets,/*!< in/out: offsets on cursor->page_cur.rec */
4256 const upd_t* update, /*!< in: update vector */
4257 ulint cmpl_info,/*!< in: compiler info on secondary index
4258 updates */
4259 que_thr_t* thr, /*!< in: query thread */
4260 trx_id_t trx_id, /*!< in: transaction id */
4261 mtr_t* mtr) /*!< in/out: mini-transaction; if this
4262 is a secondary index, the caller must
4263 mtr_commit(mtr) before latching any
4264 further pages */
4265 {
4266 dict_index_t* index;
4267 dberr_t err;
4268 rec_t* rec;
4269 roll_ptr_t roll_ptr = 0;
4270 ulint was_delete_marked;
4271
4272 ut_ad(page_is_leaf(cursor->page_cur.block->frame));
4273 rec = btr_cur_get_rec(cursor);
4274 index = cursor->index;
4275 ut_ad(rec_offs_validate(rec, index, offsets));
4276 ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
4277 ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG)
4278 || index->table->is_temporary());
4279 /* The insert buffer tree should never be updated in place. */
4280 ut_ad(!dict_index_is_ibuf(index));
4281 ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
4282 || dict_index_is_clust(index));
4283 ut_ad(thr_get_trx(thr)->id == trx_id
4284 || (flags & ulint(~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP)))
4285 == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
4286 | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
4287 ut_ad(fil_page_index_page_check(btr_cur_get_page(cursor)));
4288 ut_ad(btr_page_get_index_id(btr_cur_get_page(cursor)) == index->id);
4289 ut_ad(!(update->info_bits & REC_INFO_MIN_REC_FLAG));
4290
4291 DBUG_LOG("ib_cur",
4292 "update-in-place " << index->name << " (" << index->id
4293 << ") by " << ib::hex(trx_id) << ": "
4294 << rec_printer(rec, offsets).str());
4295
4296 buf_block_t* block = btr_cur_get_block(cursor);
4297 page_zip_des_t* page_zip = buf_block_get_page_zip(block);
4298
4299 /* Check that enough space is available on the compressed page. */
4300 if (UNIV_LIKELY_NULL(page_zip)) {
4301 ut_ad(!index->table->is_temporary());
4302
4303 if (!btr_cur_update_alloc_zip(
4304 page_zip, btr_cur_get_page_cur(cursor),
4305 index, offsets, rec_offs_size(offsets),
4306 false, mtr)) {
4307 return(DB_ZIP_OVERFLOW);
4308 }
4309
4310 rec = btr_cur_get_rec(cursor);
4311 }
4312
4313 /* Do lock checking and undo logging */
4314 err = btr_cur_upd_lock_and_undo(flags, cursor, offsets,
4315 update, cmpl_info,
4316 thr, mtr, &roll_ptr);
4317 if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
4318 /* We may need to update the IBUF_BITMAP_FREE
4319 bits after a reorganize that was done in
4320 btr_cur_update_alloc_zip(). */
4321 goto func_exit;
4322 }
4323
4324 if (!(flags & BTR_KEEP_SYS_FLAG)) {
4325 btr_cur_upd_rec_sys(block, rec, index, offsets,
4326 thr_get_trx(thr), roll_ptr, mtr);
4327 }
4328
4329 was_delete_marked = rec_get_deleted_flag(
4330 rec, page_is_comp(buf_block_get_frame(block)));
4331 /* In delete-marked records, DB_TRX_ID must always refer to an
4332 existing undo log record. */
4333 ut_ad(!was_delete_marked
4334 || !dict_index_is_clust(index)
4335 || row_get_rec_trx_id(rec, index, offsets));
4336
4337 #ifdef BTR_CUR_HASH_ADAPT
4338 {
4339 rw_lock_t* ahi_latch = block->index
4340 ? btr_search_sys.get_latch(*index) : NULL;
4341 if (ahi_latch) {
4342 		/* TODO: Can we skip this if none of the first
4343 		index->search_info->curr_n_fields
4344 		fields are being updated? */
4345
4346 /* The function row_upd_changes_ord_field_binary
4347 does not work on a secondary index. */
4348
4349 if (!dict_index_is_clust(index)
4350 || row_upd_changes_ord_field_binary(
4351 index, update, thr, NULL, NULL)) {
4352 ut_ad(!(update->info_bits
4353 & REC_INFO_MIN_REC_FLAG));
4354 /* Remove possible hash index pointer
4355 to this record */
4356 btr_search_update_hash_on_delete(cursor);
4357 }
4358
4359 rw_lock_x_lock(ahi_latch);
4360 }
4361
4362 assert_block_ahi_valid(block);
4363 #endif /* BTR_CUR_HASH_ADAPT */
4364
4365 btr_cur_upd_rec_in_place(rec, index, offsets, update, block,
4366 mtr);
4367
4368 #ifdef BTR_CUR_HASH_ADAPT
4369 if (ahi_latch) {
4370 rw_lock_x_unlock(ahi_latch);
4371 }
4372 }
4373 #endif /* BTR_CUR_HASH_ADAPT */
4374
4375 if (was_delete_marked
4376 && !rec_get_deleted_flag(
4377 rec, page_is_comp(buf_block_get_frame(block)))) {
4378 /* The new updated record owns its possible externally
4379 stored fields */
4380
4381 btr_cur_unmark_extern_fields(block, rec, index, offsets, mtr);
4382 }
4383
4384 ut_ad(err == DB_SUCCESS);
4385
4386 func_exit:
4387 if (page_zip
4388 && !(flags & BTR_KEEP_IBUF_BITMAP)
4389 && !dict_index_is_clust(index)
4390 && page_is_leaf(buf_block_get_frame(block))) {
4391 /* Update the free bits in the insert buffer. */
4392 ut_ad(!index->table->is_temporary());
4393 ibuf_update_free_bits_zip(block, mtr);
4394 }
4395
4396 return(err);
4397 }
4398
4399 /** Trim a metadata record during the rollback of instant ALTER TABLE.
4400 @param[in] entry metadata tuple
4401 @param[in] index primary key
4402 @param[in] update update vector for the rollback */
4403 ATTRIBUTE_COLD
4404 static void btr_cur_trim_alter_metadata(dtuple_t* entry,
4405 const dict_index_t* index,
4406 const upd_t* update)
4407 {
4408 ut_ad(index->is_instant());
4409 ut_ad(update->is_alter_metadata());
4410 ut_ad(entry->is_alter_metadata());
4411
4412 ut_ad(update->fields[0].field_no == index->first_user_field());
4413 ut_ad(update->fields[0].new_val.ext);
4414 ut_ad(update->fields[0].new_val.len == FIELD_REF_SIZE);
4415 ut_ad(entry->n_fields - 1 == index->n_fields);
4416
4417 const byte* ptr = static_cast<const byte*>(
4418 update->fields[0].new_val.data);
4419 ut_ad(!mach_read_from_4(ptr + BTR_EXTERN_LEN));
4420 ut_ad(mach_read_from_4(ptr + BTR_EXTERN_LEN + 4) > 4);
4421 ut_ad(mach_read_from_4(ptr + BTR_EXTERN_OFFSET) == FIL_PAGE_DATA);
4422 ut_ad(mach_read_from_4(ptr + BTR_EXTERN_SPACE_ID)
4423 == index->table->space->id);
4424
4425 ulint n_fields = update->fields[1].field_no;
4426 ut_ad(n_fields <= index->n_fields);
4427 if (n_fields != index->n_uniq) {
4428 ut_ad(n_fields
4429 >= index->n_core_fields);
4430 entry->n_fields = n_fields;
4431 return;
4432 }
4433
4434 /* This is based on dict_table_t::deserialise_columns()
4435 and btr_cur_instant_init_low(). */
4436 mtr_t mtr;
4437 mtr.start();
4438 buf_block_t* block = buf_page_get(
4439 page_id_t(index->table->space->id,
4440 mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)),
4441 0, RW_S_LATCH, &mtr);
4442 buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
4443 ut_ad(fil_page_get_type(block->frame) == FIL_PAGE_TYPE_BLOB);
4444 ut_ad(mach_read_from_4(&block->frame[FIL_PAGE_DATA
4445 + BTR_BLOB_HDR_NEXT_PAGE_NO])
4446 == FIL_NULL);
4447 ut_ad(mach_read_from_4(&block->frame[FIL_PAGE_DATA
4448 + BTR_BLOB_HDR_PART_LEN])
4449 == mach_read_from_4(ptr + BTR_EXTERN_LEN + 4));
4450 n_fields = mach_read_from_4(
4451 &block->frame[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE])
4452 + index->first_user_field();
4453 /* Rollback should not increase the number of fields. */
4454 ut_ad(n_fields <= index->n_fields);
4455 ut_ad(n_fields + 1 <= entry->n_fields);
4456 /* dict_index_t::clear_instant_alter() cannot be invoked while
4457 rollback of an instant ALTER TABLE transaction is in progress
4458 for an is_alter_metadata() record. */
4459 ut_ad(n_fields >= index->n_core_fields);
4460
4461 mtr.commit();
4462 entry->n_fields = n_fields + 1;
4463 }
4464
4465 /** Trim an update tuple due to instant ADD COLUMN, if needed.
4466 For normal records, the trailing instantly added fields that match
4467 the initial default values are omitted.
4468
4469 For the special metadata record on a table on which instant
4470 ADD COLUMN has already been executed, both ADD COLUMN and the
4471 rollback of ADD COLUMN need to be handled specially.
4472
4473 @param[in,out] entry index entry
4474 @param[in] index index
4475 @param[in] update update vector
4476 @param[in] thr execution thread */
4477 static inline
4478 void
4479 btr_cur_trim(
4480 dtuple_t* entry,
4481 const dict_index_t* index,
4482 const upd_t* update,
4483 const que_thr_t* thr)
4484 {
4485 if (!index->is_instant()) {
4486 } else if (UNIV_UNLIKELY(update->is_metadata())) {
4487 /* We are either updating a metadata record
4488 (instant ALTER TABLE on a table where instant ALTER was
4489 already executed) or rolling back such an operation. */
4490 ut_ad(!upd_get_nth_field(update, 0)->orig_len);
4491 ut_ad(entry->is_metadata());
4492
4493 if (thr->graph->trx->in_rollback) {
4494 /* This rollback can occur either as part of
4495 ha_innobase::commit_inplace_alter_table() rolling
4496 back after a failed innobase_add_instant_try(),
4497 or as part of crash recovery. Either way, the
4498 table will be in the data dictionary cache, with
4499 the instantly added columns going to be removed
4500 later in the rollback. */
4501 ut_ad(index->table->cached);
4502 /* The DB_TRX_ID,DB_ROLL_PTR are always last,
4503 and there should be some change to roll back.
4504 The first field in the update vector is the
4505 first instantly added column logged by
4506 innobase_add_instant_try(). */
4507 ut_ad(update->n_fields > 2);
4508 if (update->is_alter_metadata()) {
4509 btr_cur_trim_alter_metadata(
4510 entry, index, update);
4511 return;
4512 }
4513 ut_ad(!entry->is_alter_metadata());
4514
4515 ulint n_fields = upd_get_nth_field(update, 0)
4516 ->field_no;
4517 ut_ad(n_fields + 1 >= entry->n_fields);
4518 entry->n_fields = n_fields;
4519 }
4520 } else {
4521 entry->trim(*index);
4522 }
4523 }
4524
4525 /*************************************************************//**
4526 Tries to update a record on a page in an index tree. It is assumed that mtr
4527 holds an x-latch on the page. The operation does not succeed if there is too
4528 little space on the page or if the update would result in too empty a page,
4529 so that tree compression is recommended. We assume here that the ordering
4530 fields of the record do not change.
4531 @return error code, including
4532 @retval DB_SUCCESS on success
4533 @retval DB_OVERFLOW if the updated record does not fit
4534 @retval DB_UNDERFLOW if the page would become too empty
4535 @retval DB_ZIP_OVERFLOW if there is not enough space left
4536 on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
4537 dberr_t
4538 btr_cur_optimistic_update(
4539 /*======================*/
4540 ulint flags, /*!< in: undo logging and locking flags */
4541 btr_cur_t* cursor, /*!< in: cursor on the record to update;
4542 cursor stays valid and positioned on the
4543 same record */
4544 rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */
4545 mem_heap_t** heap, /*!< in/out: pointer to NULL or memory heap */
4546 const upd_t* update, /*!< in: update vector; this must also
4547 contain trx id and roll ptr fields */
4548 ulint cmpl_info,/*!< in: compiler info on secondary index
4549 updates */
4550 que_thr_t* thr, /*!< in: query thread */
4551 trx_id_t trx_id, /*!< in: transaction id */
4552 mtr_t* mtr) /*!< in/out: mini-transaction; if this
4553 is a secondary index, the caller must
4554 mtr_commit(mtr) before latching any
4555 further pages */
4556 {
4557 dict_index_t* index;
4558 page_cur_t* page_cursor;
4559 dberr_t err;
4560 buf_block_t* block;
4561 page_t* page;
4562 page_zip_des_t* page_zip;
4563 rec_t* rec;
4564 ulint max_size;
4565 ulint new_rec_size;
4566 ulint old_rec_size;
4567 ulint max_ins_size = 0;
4568 dtuple_t* new_entry;
4569 roll_ptr_t roll_ptr;
4570 ulint i;
4571
4572 block = btr_cur_get_block(cursor);
4573 page = buf_block_get_frame(block);
4574 rec = btr_cur_get_rec(cursor);
4575 index = cursor->index;
4576 ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG)
4577 || index->table->is_temporary());
4578 ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
4579 ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
4580 /* This is intended only for leaf page updates */
4581 ut_ad(page_is_leaf(page));
4582 /* The insert buffer tree should never be updated in place. */
4583 ut_ad(!dict_index_is_ibuf(index));
4584 ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
4585 || dict_index_is_clust(index));
4586 ut_ad(thr_get_trx(thr)->id == trx_id
4587 || (flags & ulint(~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP)))
4588 == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
4589 | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
4590 ut_ad(fil_page_index_page_check(page));
4591 ut_ad(btr_page_get_index_id(page) == index->id);
4592
4593 *offsets = rec_get_offsets(rec, index, *offsets, index->n_core_fields,
4594 ULINT_UNDEFINED, heap);
4595 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
4596 ut_a(!rec_offs_any_null_extern(rec, *offsets)
4597 || thr_get_trx(thr) == trx_roll_crash_recv_trx);
4598 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
4599
4600 if (UNIV_LIKELY(!update->is_metadata())
4601 && !row_upd_changes_field_size_or_external(index, *offsets,
4602 update)) {
4603
4604 /* The simplest and the most common case: the update does not
4605 change the size of any field and none of the updated fields is
4606 externally stored in rec or update, and there is enough space
4607 on the compressed page to log the update. */
4608
4609 return(btr_cur_update_in_place(
4610 flags, cursor, *offsets, update,
4611 cmpl_info, thr, trx_id, mtr));
4612 }
4613
4614 if (rec_offs_any_extern(*offsets)) {
4615 any_extern:
4616 ut_ad(!index->is_ibuf());
4617 /* Externally stored fields are treated in pessimistic
4618 update */
4619
4620 /* prefetch siblings of the leaf for the pessimistic
4621 operation. */
4622 btr_cur_prefetch_siblings(block, index);
4623
4624 return(DB_OVERFLOW);
4625 }
4626
4627 if (rec_is_metadata(rec, *index) && index->table->instant) {
4628 goto any_extern;
4629 }
4630
4631 for (i = 0; i < upd_get_n_fields(update); i++) {
4632 if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) {
4633
4634 goto any_extern;
4635 }
4636 }
4637
4638 DBUG_LOG("ib_cur",
4639 "update " << index->name << " (" << index->id << ") by "
4640 << ib::hex(trx_id) << ": "
4641 << rec_printer(rec, *offsets).str());
4642
4643 page_cursor = btr_cur_get_page_cur(cursor);
4644
4645 if (!*heap) {
4646 *heap = mem_heap_create(
4647 rec_offs_size(*offsets)
4648 + DTUPLE_EST_ALLOC(rec_offs_n_fields(*offsets)));
4649 }
4650
4651 new_entry = row_rec_to_index_entry(rec, index, *offsets, *heap);
4652 ut_ad(!dtuple_get_n_ext(new_entry));
4653
4654 /* The page containing the clustered index record
4655 corresponding to new_entry is latched in mtr.
4656 Thus the following call is safe. */
4657 row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
4658 *heap);
4659 btr_cur_trim(new_entry, index, update, thr);
4660 old_rec_size = rec_offs_size(*offsets);
4661 new_rec_size = rec_get_converted_size(index, new_entry, 0);
4662
4663 page_zip = buf_block_get_page_zip(block);
4664 #ifdef UNIV_ZIP_DEBUG
4665 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4666 #endif /* UNIV_ZIP_DEBUG */
4667
4668 if (page_zip) {
4669 ut_ad(!index->table->is_temporary());
4670
4671 if (page_zip_rec_needs_ext(new_rec_size, page_is_comp(page),
4672 dict_index_get_n_fields(index),
4673 block->zip_size())) {
4674 goto any_extern;
4675 }
4676
4677 if (!btr_cur_update_alloc_zip(
4678 page_zip, page_cursor, index, *offsets,
4679 new_rec_size, true, mtr)) {
4680 return(DB_ZIP_OVERFLOW);
4681 }
4682
4683 rec = page_cur_get_rec(page_cursor);
4684 }
4685
4686 	/* We limit the maximum record size to 16KiB even for a 64KiB page size. */
4687 if (new_rec_size >= COMPRESSED_REC_MAX_DATA_SIZE ||
4688 (!dict_table_is_comp(index->table)
4689 && new_rec_size >= REDUNDANT_REC_MAX_DATA_SIZE)) {
4690 err = DB_OVERFLOW;
4691
4692 goto func_exit;
4693 }
4694
4695 if (UNIV_UNLIKELY(new_rec_size
4696 >= (page_get_free_space_of_empty(page_is_comp(page))
4697 / 2))) {
4698 /* We may need to update the IBUF_BITMAP_FREE
4699 bits after a reorganize that was done in
4700 btr_cur_update_alloc_zip(). */
4701 err = DB_OVERFLOW;
4702 goto func_exit;
4703 }
4704
4705 if (UNIV_UNLIKELY(page_get_data_size(page)
4706 - old_rec_size + new_rec_size
4707 < BTR_CUR_PAGE_COMPRESS_LIMIT(index))) {
4708 /* We may need to update the IBUF_BITMAP_FREE
4709 bits after a reorganize that was done in
4710 btr_cur_update_alloc_zip(). */
4711
4712 /* The page would become too empty */
4713 err = DB_UNDERFLOW;
4714 goto func_exit;
4715 }
4716
4717 /* We do not attempt to reorganize if the page is compressed.
4718 This is because the page may fail to compress after reorganization. */
4719 max_size = page_zip
4720 ? page_get_max_insert_size(page, 1)
4721 : (old_rec_size
4722 + page_get_max_insert_size_after_reorganize(page, 1));
4723
4724 if (!page_zip) {
4725 max_ins_size = page_get_max_insert_size_after_reorganize(
4726 page, 1);
4727 }
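
	/* Remember the maximum insert size before the update is applied:
	ibuf_update_free_bits_low() at func_exit expects the value from
	before the latest modification of the page. */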
4728
4729 if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT)
4730 && (max_size >= new_rec_size))
4731 || (page_get_n_recs(page) <= 1))) {
4732
4733 /* We may need to update the IBUF_BITMAP_FREE
4734 bits after a reorganize that was done in
4735 btr_cur_update_alloc_zip(). */
4736
4737 /* There was not enough space, or it did not pay to
4738 reorganize: for simplicity, we decide what to do assuming a
4739 reorganization is needed, though it might not be necessary */
4740
4741 err = DB_OVERFLOW;
4742 goto func_exit;
4743 }
4744
4745 /* Do lock checking and undo logging */
4746 err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
4747 update, cmpl_info,
4748 thr, mtr, &roll_ptr);
4749 if (err != DB_SUCCESS) {
4750 /* We may need to update the IBUF_BITMAP_FREE
4751 bits after a reorganize that was done in
4752 btr_cur_update_alloc_zip(). */
4753 goto func_exit;
4754 }
4755
4756 /* Ok, we may do the replacement. Store on the page infimum the
4757 explicit locks on rec, before deleting rec (see the comment in
4758 btr_cur_pessimistic_update). */
4759 if (!dict_table_is_locking_disabled(index->table)) {
4760 lock_rec_store_on_page_infimum(block, rec);
4761 }
4762
4763 if (UNIV_UNLIKELY(update->is_metadata())) {
4764 ut_ad(new_entry->is_metadata());
4765 ut_ad(index->is_instant());
4766 /* This can be innobase_add_instant_try() performing a
4767 subsequent instant ADD COLUMN, or its rollback by
4768 row_undo_mod_clust_low(). */
4769 ut_ad(flags & BTR_NO_LOCKING_FLAG);
4770 } else {
4771 btr_search_update_hash_on_delete(cursor);
4772 }
4773
4774 page_cur_delete_rec(page_cursor, index, *offsets, mtr);
4775
4776 page_cur_move_to_prev(page_cursor);
4777
4778 if (!(flags & BTR_KEEP_SYS_FLAG)) {
4779 btr_cur_write_sys(new_entry, index, trx_id, roll_ptr);
4780 }
4781
4782 /* There are no externally stored columns in new_entry */
4783 rec = btr_cur_insert_if_possible(
4784 cursor, new_entry, offsets, heap, 0/*n_ext*/, mtr);
4785 ut_a(rec); /* <- We calculated above the insert would fit */
4786
4787 if (UNIV_UNLIKELY(update->is_metadata())) {
4788 /* We must empty the PAGE_FREE list, because if this
4789 was a rollback, the shortened metadata record
4790 would have too many fields, and we would be unable to
4791 know the size of the freed record. */
4792 btr_page_reorganize(page_cursor, index, mtr);
4793 } else if (!dict_table_is_locking_disabled(index->table)) {
4794 /* Restore the old explicit lock state on the record */
4795 lock_rec_restore_from_page_infimum(block, rec, block);
4796 }
4797
4798 page_cur_move_to_next(page_cursor);
4799 ut_ad(err == DB_SUCCESS);
4800
4801 func_exit:
4802 if (!(flags & BTR_KEEP_IBUF_BITMAP)
4803 && !dict_index_is_clust(index)) {
4804 /* Update the free bits in the insert buffer. */
4805 if (page_zip) {
4806 ut_ad(!index->table->is_temporary());
4807 ibuf_update_free_bits_zip(block, mtr);
4808 } else if (!index->table->is_temporary()) {
4809 ibuf_update_free_bits_low(block, max_ins_size, mtr);
4810 }
4811 }
4812
4813 if (err != DB_SUCCESS) {
4814 /* prefetch siblings of the leaf for the pessimistic
4815 operation. */
4816 btr_cur_prefetch_siblings(block, index);
4817 }
4818
4819 return(err);
4820 }
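
/* Illustrative caller sketch (an assumption, mirroring the retry logic
visible at the start of btr_cur_pessimistic_update() below): callers such
as row_upd_clust_rec() try this optimistic variant first and fall back to
btr_cur_pessimistic_update() on DB_OVERFLOW, DB_UNDERFLOW or
DB_ZIP_OVERFLOW. */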
4821
4822 /*************************************************************//**
4823 If, in a split, a new supremum record was created as the predecessor of the
4824 updated record, the supremum record must inherit exactly the locks on the
4825 updated record. In the split it may have inherited locks from the successor
4826 of the updated record, which is not correct. This function restores the
4827 right locks for the new supremum. */
4828 static
4829 void
4830 btr_cur_pess_upd_restore_supremum(
4831 /*==============================*/
4832 buf_block_t* block, /*!< in: buffer block of rec */
4833 const rec_t* rec, /*!< in: updated record */
4834 mtr_t* mtr) /*!< in: mtr */
4835 {
4836 page_t* page;
4837 buf_block_t* prev_block;
4838
4839 page = buf_block_get_frame(block);
4840
4841 if (page_rec_get_next(page_get_infimum_rec(page)) != rec) {
4842 /* Updated record is not the first user record on its page */
4843
4844 return;
4845 }
4846
4847 const uint32_t prev_page_no = btr_page_get_prev(page);
4848
4849 const page_id_t page_id(block->page.id().space(), prev_page_no);
4850
4851 ut_ad(prev_page_no != FIL_NULL);
4852 prev_block = buf_page_get_with_no_latch(page_id, block->zip_size(),
4853 mtr);
4854 #ifdef UNIV_BTR_DEBUG
4855 ut_a(btr_page_get_next(prev_block->frame)
4856 == block->page.id().page_no());
4857 #endif /* UNIV_BTR_DEBUG */
4858
4859 /* We must already have an x-latch on prev_block! */
4860 ut_ad(mtr->memo_contains_flagged(prev_block, MTR_MEMO_PAGE_X_FIX));
4861
4862 lock_rec_reset_and_inherit_gap_locks(prev_block, block,
4863 PAGE_HEAP_NO_SUPREMUM,
4864 page_rec_get_heap_no(rec));
4865 }
4866
4867 /*************************************************************//**
4868 Performs an update of a record on a page of a tree. It is assumed
4869 that mtr holds an x-latch on the tree and on the cursor page. If the
4870 update is made on the leaf level, to avoid deadlocks, mtr must also
4871 own x-latches on the siblings of the page, if those siblings exist. We assume
4872 here that the ordering fields of the record do not change.
4873 @return DB_SUCCESS or error code */
4874 dberr_t
4875 btr_cur_pessimistic_update(
4876 /*=======================*/
4877 ulint flags, /*!< in: undo logging, locking, and rollback
4878 flags */
4879 btr_cur_t* cursor, /*!< in/out: cursor on the record to update;
4880 cursor may become invalid if *big_rec == NULL
4881 || !(flags & BTR_KEEP_POS_FLAG) */
4882 rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */
4883 mem_heap_t** offsets_heap,
4884 /*!< in/out: pointer to memory heap
4885 that can be emptied */
4886 mem_heap_t* entry_heap,
4887 /*!< in/out: memory heap for allocating
4888 big_rec and the index tuple */
4889 big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
4890 be stored externally by the caller */
4891 upd_t* update, /*!< in/out: update vector; this is allowed to
4892 also contain trx id and roll ptr fields.
4893 Non-updated columns that are moved offpage will
4894 be appended to this. */
4895 ulint cmpl_info,/*!< in: compiler info on secondary index
4896 updates */
4897 que_thr_t* thr, /*!< in: query thread */
4898 trx_id_t trx_id, /*!< in: transaction id */
4899 mtr_t* mtr) /*!< in/out: mini-transaction; must be
4900 committed before latching any further pages */
4901 {
4902 big_rec_t* big_rec_vec = NULL;
4903 big_rec_t* dummy_big_rec;
4904 dict_index_t* index;
4905 buf_block_t* block;
4906 page_zip_des_t* page_zip;
4907 rec_t* rec;
4908 page_cur_t* page_cursor;
4909 dberr_t err;
4910 dberr_t optim_err;
4911 roll_ptr_t roll_ptr;
4912 bool was_first;
4913 uint32_t n_reserved = 0;
4914
4915 *offsets = NULL;
4916 *big_rec = NULL;
4917
4918 block = btr_cur_get_block(cursor);
4919 page_zip = buf_block_get_page_zip(block);
4920 index = cursor->index;
4921
4922 ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK |
4923 MTR_MEMO_SX_LOCK));
4924 ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
4925 #ifdef UNIV_ZIP_DEBUG
4926 ut_a(!page_zip || page_zip_validate(page_zip, block->frame, index));
4927 #endif /* UNIV_ZIP_DEBUG */
4928 ut_ad(!page_zip || !index->table->is_temporary());
4929 /* The insert buffer tree should never be updated in place. */
4930 ut_ad(!dict_index_is_ibuf(index));
4931 ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG)
4932 || index->table->is_temporary());
4933 ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
4934 || dict_index_is_clust(index));
4935 ut_ad(thr_get_trx(thr)->id == trx_id
4936 || (flags & ulint(~BTR_KEEP_POS_FLAG))
4937 == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
4938 | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
4939
4940 err = optim_err = btr_cur_optimistic_update(
4941 flags | BTR_KEEP_IBUF_BITMAP,
4942 cursor, offsets, offsets_heap, update,
4943 cmpl_info, thr, trx_id, mtr);
4944
4945 switch (err) {
4946 case DB_ZIP_OVERFLOW:
4947 case DB_UNDERFLOW:
4948 case DB_OVERFLOW:
4949 break;
4950 default:
4951 err_exit:
4952 /* We suppressed this with BTR_KEEP_IBUF_BITMAP.
4953 For DB_ZIP_OVERFLOW, the IBUF_BITMAP_FREE bits were
4954 already reset by btr_cur_update_alloc_zip() if the
4955 page was recompressed. */
4956 if (page_zip
4957 && optim_err != DB_ZIP_OVERFLOW
4958 && !dict_index_is_clust(index)
4959 && page_is_leaf(block->frame)) {
4960 ut_ad(!index->table->is_temporary());
4961 ibuf_update_free_bits_zip(block, mtr);
4962 }
4963
4964 if (big_rec_vec != NULL) {
4965 dtuple_big_rec_free(big_rec_vec);
4966 }
4967
4968 return(err);
4969 }
4970
4971 rec = btr_cur_get_rec(cursor);
4972 ut_ad(rec_offs_validate(rec, index, *offsets));
4973
4974 dtuple_t* new_entry;
4975
4976 const bool is_metadata = rec_is_metadata(rec, *index);
4977
4978 if (UNIV_UNLIKELY(is_metadata)) {
4979 ut_ad(update->is_metadata());
4980 ut_ad(flags & BTR_NO_LOCKING_FLAG);
4981 ut_ad(index->is_instant());
4982 new_entry = row_metadata_to_tuple(
4983 rec, index, *offsets, entry_heap,
4984 update->info_bits, !thr_get_trx(thr)->in_rollback);
4985 ut_ad(new_entry->n_fields
4986 == ulint(index->n_fields)
4987 + update->is_alter_metadata());
4988 } else {
4989 new_entry = row_rec_to_index_entry(rec, index, *offsets,
4990 entry_heap);
4991 }
4992
4993 /* The page containing the clustered index record
4994 corresponding to new_entry is latched in mtr. If the
4995 clustered index record is delete-marked, then its externally
4996 stored fields cannot have been purged yet, because then the
4997 purge would also have removed the clustered index record
4998 itself. Thus the following call is safe. */
4999 row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
5000 entry_heap);
5001 btr_cur_trim(new_entry, index, update, thr);
5002
5003 /* We have to set appropriate extern storage bits in the new
5004 record to be inserted: we have to remember which fields were such */
5005
5006 ut_ad(!page_is_comp(block->frame) || !rec_get_node_ptr_flag(rec));
5007 ut_ad(rec_offs_validate(rec, index, *offsets));
5008
5009 if ((flags & BTR_NO_UNDO_LOG_FLAG)
5010 && rec_offs_any_extern(*offsets)) {
5011 /* We are in a transaction rollback undoing a row
5012 update: we must free possible externally stored fields
5013 which got new values in the update, if they are not
5014 inherited values. They can be inherited if we have
5015 updated the primary key to another value, and then
5016 update it back again. */
5017
5018 ut_ad(big_rec_vec == NULL);
5019 ut_ad(dict_index_is_clust(index));
5020 ut_ad(thr_get_trx(thr)->in_rollback);
5021
5022 DEBUG_SYNC_C("blob_rollback_middle");
5023
5024 btr_rec_free_updated_extern_fields(
5025 index, rec, block, *offsets, update, true, mtr);
5026 }
5027
5028 ulint n_ext = index->is_primary() ? dtuple_get_n_ext(new_entry) : 0;
5029
5030 if (page_zip_rec_needs_ext(
5031 rec_get_converted_size(index, new_entry, n_ext),
5032 page_is_comp(block->frame),
5033 dict_index_get_n_fields(index),
5034 block->zip_size())
5035 || (UNIV_UNLIKELY(update->is_alter_metadata())
5036 && !dfield_is_ext(dtuple_get_nth_field(
5037 new_entry,
5038 index->first_user_field())))) {
5039 big_rec_vec = dtuple_convert_big_rec(index, update, new_entry, &n_ext);
5040 if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
5041
5042 /* We cannot goto return_after_reservations,
5043 because we may need to update the
5044 IBUF_BITMAP_FREE bits, which was suppressed by
5045 BTR_KEEP_IBUF_BITMAP. */
5046 #ifdef UNIV_ZIP_DEBUG
5047 ut_a(!page_zip
5048 || page_zip_validate(page_zip, block->frame,
5049 index));
5050 #endif /* UNIV_ZIP_DEBUG */
5051 index->table->space->release_free_extents(n_reserved);
5052 err = DB_TOO_BIG_RECORD;
5053 goto err_exit;
5054 }
5055
5056 ut_ad(page_is_leaf(block->frame));
5057 ut_ad(dict_index_is_clust(index));
5058 ut_ad(flags & BTR_KEEP_POS_FLAG);
5059 }
5060
5061 /* Do lock checking and undo logging */
5062 err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
5063 update, cmpl_info,
5064 thr, mtr, &roll_ptr);
5065 if (err != DB_SUCCESS) {
5066 goto err_exit;
5067 }
5068
5069 if (optim_err == DB_OVERFLOW) {
5070
5071 /* First reserve enough free space for the file segments
5072 of the index tree, so that the update will not fail because
5073 of lack of space */
5074
5075 uint32_t n_extents = uint32_t(cursor->tree_height / 16 + 3);
5076
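		/* During rollback (BTR_NO_UNDO_LOG_FLAG), reserve in
		FSP_CLEANING mode, which may use the reserved free extents,
		so that cleanup operations such as rollback and purge are
		not starved of space by ordinary operations. */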
5077 if (!fsp_reserve_free_extents(
5078 &n_reserved, index->table->space, n_extents,
5079 flags & BTR_NO_UNDO_LOG_FLAG
5080 ? FSP_CLEANING : FSP_NORMAL,
5081 mtr)) {
5082 err = DB_OUT_OF_FILE_SPACE;
5083 goto err_exit;
5084 }
5085 }
5086
5087 if (!(flags & BTR_KEEP_SYS_FLAG)) {
5088 btr_cur_write_sys(new_entry, index, trx_id, roll_ptr);
5089 }
5090
5091 const ulint max_ins_size = page_zip
5092 ? 0 : page_get_max_insert_size_after_reorganize(block->frame,
5093 1);
5094
5095 if (UNIV_UNLIKELY(is_metadata)) {
5096 ut_ad(new_entry->is_metadata());
5097 ut_ad(index->is_instant());
5098 /* This can be innobase_add_instant_try() performing a
5099 subsequent instant ALTER TABLE, or its rollback by
5100 row_undo_mod_clust_low(). */
5101 ut_ad(flags & BTR_NO_LOCKING_FLAG);
5102 } else {
5103 btr_search_update_hash_on_delete(cursor);
5104
5105 /* Store state of explicit locks on rec on the page
5106 infimum record, before deleting rec. The page infimum
5107 acts as a dummy carrier of the locks, taking care also
5108 of lock releases, before we can move the locks back on
5109 		the actual record. There is a special case: the insert
5110 		may take place on the root page and cause a call to
5111 		btr_root_raise_and_insert. Therefore the lock system
5112 		must not delete the lock structs set on the root page,
5113 		even if the root page carries just node
5114 		pointers. */
5115 if (!dict_table_is_locking_disabled(index->table)) {
5116 lock_rec_store_on_page_infimum(block, rec);
5117 }
5118 }
5119
5120 #ifdef UNIV_ZIP_DEBUG
5121 ut_a(!page_zip || page_zip_validate(page_zip, block->frame, index));
5122 #endif /* UNIV_ZIP_DEBUG */
5123 page_cursor = btr_cur_get_page_cur(cursor);
5124
5125 page_cur_delete_rec(page_cursor, index, *offsets, mtr);
5126
5127 page_cur_move_to_prev(page_cursor);
5128
5129 rec = btr_cur_insert_if_possible(cursor, new_entry,
5130 offsets, offsets_heap, n_ext, mtr);
5131
5132 if (rec) {
5133 page_cursor->rec = rec;
5134
5135 if (UNIV_UNLIKELY(is_metadata)) {
5136 /* We must empty the PAGE_FREE list, because if this
5137 was a rollback, the shortened metadata record
5138 would have too many fields, and we would be unable to
5139 know the size of the freed record. */
5140 btr_page_reorganize(page_cursor, index, mtr);
5141 rec = page_cursor->rec;
5142 rec_offs_make_valid(rec, index, true, *offsets);
5143 if (page_cursor->block->page.id().page_no()
5144 == index->page) {
5145 btr_set_instant(page_cursor->block, *index,
5146 mtr);
5147 }
5148 } else if (!dict_table_is_locking_disabled(index->table)) {
5149 lock_rec_restore_from_page_infimum(
5150 btr_cur_get_block(cursor), rec, block);
5151 }
5152
5153 if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))
5154 || rec_is_alter_metadata(rec, *index)) {
5155 /* The new inserted record owns its possible externally
5156 stored fields */
5157 btr_cur_unmark_extern_fields(btr_cur_get_block(cursor),
5158 rec, index, *offsets, mtr);
5159 } else {
5160 /* In delete-marked records, DB_TRX_ID must
5161 always refer to an existing undo log record. */
5162 ut_ad(row_get_rec_trx_id(rec, index, *offsets));
5163 }
5164
5165 bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG);
5166 ut_ad(!adjust || page_is_leaf(block->frame));
5167
5168 if (btr_cur_compress_if_useful(cursor, adjust, mtr)) {
5169 if (adjust) {
5170 rec_offs_make_valid(page_cursor->rec, index,
5171 true, *offsets);
5172 }
5173 } else if (!dict_index_is_clust(index)
5174 && page_is_leaf(block->frame)) {
5175 /* Update the free bits in the insert buffer.
5176 This is the same block which was skipped by
5177 BTR_KEEP_IBUF_BITMAP. */
5178 if (page_zip) {
5179 ut_ad(!index->table->is_temporary());
5180 ibuf_update_free_bits_zip(block, mtr);
5181 } else if (!index->table->is_temporary()) {
5182 ibuf_update_free_bits_low(block, max_ins_size,
5183 mtr);
5184 }
5185 }
5186
5187 if (!srv_read_only_mode
5188 && !big_rec_vec
5189 && page_is_leaf(block->frame)
5190 && !dict_index_is_online_ddl(index)) {
5191
5192 mtr_memo_release(mtr, dict_index_get_lock(index),
5193 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);
5194
5195 /* NOTE: We cannot release root block latch here, because it
5196 has segment header and already modified in most of cases.*/
5197 }
5198
5199 err = DB_SUCCESS;
5200 goto return_after_reservations;
5201 } else {
5202 /* If the page is compressed and it initially
5203 compresses very well, and there is a subsequent insert
5204 of a badly-compressing record, it is possible for
5205 btr_cur_optimistic_update() to return DB_UNDERFLOW and
5206 btr_cur_insert_if_possible() to return FALSE. */
5207 ut_a(page_zip || optim_err != DB_UNDERFLOW);
5208
5209 /* Out of space: reset the free bits.
5210 This is the same block which was skipped by
5211 BTR_KEEP_IBUF_BITMAP. */
5212 if (!dict_index_is_clust(index)
5213 && !index->table->is_temporary()
5214 && page_is_leaf(block->frame)) {
5215 ibuf_reset_free_bits(block);
5216 }
5217 }
5218
5219 if (big_rec_vec != NULL) {
5220 ut_ad(page_is_leaf(block->frame));
5221 ut_ad(dict_index_is_clust(index));
5222 ut_ad(flags & BTR_KEEP_POS_FLAG);
5223
5224 /* btr_page_split_and_insert() in
5225 btr_cur_pessimistic_insert() invokes
5226 mtr_memo_release(mtr, index->lock, MTR_MEMO_SX_LOCK).
5227 We must keep the index->lock when we created a
5228 big_rec, so that row_upd_clust_rec() can store the
5229 big_rec in the same mini-transaction. */
5230
5231 ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
5232 | MTR_MEMO_SX_LOCK));
5233 mtr_sx_lock_index(index, mtr);
5234 }
5235
5236 /* Was the record to be updated positioned as the first user
5237 record on its page? */
5238 was_first = page_cur_is_before_first(page_cursor);
5239
5240 /* Lock checks and undo logging were already performed by
5241 btr_cur_upd_lock_and_undo(). We do not try
5242 btr_cur_optimistic_insert() because
5243 btr_cur_insert_if_possible() already failed above. */
5244
5245 err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
5246 | BTR_NO_LOCKING_FLAG
5247 | BTR_KEEP_SYS_FLAG,
5248 cursor, offsets, offsets_heap,
5249 new_entry, &rec,
5250 &dummy_big_rec, n_ext, NULL, mtr);
5251 ut_a(rec);
5252 ut_a(err == DB_SUCCESS);
5253 ut_a(dummy_big_rec == NULL);
5254 ut_ad(rec_offs_validate(rec, cursor->index, *offsets));
5255 page_cursor->rec = rec;
5256
5257 /* Multiple transactions cannot simultaneously operate on the
5258 same temp-table in parallel.
5259 max_trx_id is ignored for temp tables because it not required
5260 for MVCC. */
5261 if (dict_index_is_sec_or_ibuf(index)
5262 && !index->table->is_temporary()) {
5263 /* Update PAGE_MAX_TRX_ID in the index page header.
5264 It was not updated by btr_cur_pessimistic_insert()
5265 because of BTR_NO_LOCKING_FLAG. */
5266 page_update_max_trx_id(btr_cur_get_block(cursor),
5267 btr_cur_get_page_zip(cursor),
5268 trx_id, mtr);
5269 }
5270
5271 if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
5272 /* The new inserted record owns its possible externally
5273 stored fields */
5274 #ifdef UNIV_ZIP_DEBUG
5275 ut_a(!page_zip || page_zip_validate(page_zip, block->frame,
5276 index));
5277 #endif /* UNIV_ZIP_DEBUG */
5278 btr_cur_unmark_extern_fields(btr_cur_get_block(cursor), rec,
5279 index, *offsets, mtr);
5280 } else {
5281 /* In delete-marked records, DB_TRX_ID must
5282 always refer to an existing undo log record. */
5283 ut_ad(row_get_rec_trx_id(rec, index, *offsets));
5284 }
5285
5286 if (UNIV_UNLIKELY(is_metadata)) {
5287 /* We must empty the PAGE_FREE list, because if this
5288 was a rollback, the shortened metadata record
5289 would have too many fields, and we would be unable to
5290 know the size of the freed record. */
5291 btr_page_reorganize(page_cursor, index, mtr);
5292 rec = page_cursor->rec;
5293 } else if (!dict_table_is_locking_disabled(index->table)) {
5294 lock_rec_restore_from_page_infimum(
5295 btr_cur_get_block(cursor), rec, block);
5296 }
5297
5298 /* If necessary, restore also the correct lock state for a new,
5299 preceding supremum record created in a page split. While the old
5300 record was nonexistent, the supremum might have inherited its locks
5301 from a wrong record. */
5302
5303 if (!was_first && !dict_table_is_locking_disabled(index->table)) {
5304 btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor),
5305 rec, mtr);
5306 }
5307
5308 return_after_reservations:
5309 #ifdef UNIV_ZIP_DEBUG
5310 ut_a(!page_zip || page_zip_validate(btr_cur_get_page_zip(cursor),
5311 btr_cur_get_page(cursor), index));
5312 #endif /* UNIV_ZIP_DEBUG */
5313
5314 index->table->space->release_free_extents(n_reserved);
5315 *big_rec = big_rec_vec;
5316 return(err);
5317 }
5318
5319 /*==================== B-TREE DELETE MARK AND UNMARK ===============*/
5320
5321 /** Modify the delete-mark flag of a record.
5322 @tparam flag the value of the delete-mark flag
5323 @param[in,out] block buffer block
5324 @param[in,out] rec record on a physical index page
5325 @param[in,out] mtr mini-transaction */
5326 template<bool flag>
btr_rec_set_deleted(buf_block_t * block,rec_t * rec,mtr_t * mtr)5327 void btr_rec_set_deleted(buf_block_t *block, rec_t *rec, mtr_t *mtr)
5328 {
5329 if (page_rec_is_comp(rec))
5330 {
5331 byte *b= &rec[-REC_NEW_INFO_BITS];
5332 const byte v= flag
5333 ? (*b | REC_INFO_DELETED_FLAG)
5334 : (*b & byte(~REC_INFO_DELETED_FLAG));
5335 if (*b == v);
5336 else if (UNIV_LIKELY_NULL(block->page.zip.data))
5337 {
5338 *b= v;
5339 page_zip_rec_set_deleted(block, rec, flag, mtr);
5340 }
5341 else
5342 mtr->write<1>(*block, b, v);
5343 }
5344 else
5345 {
5346 ut_ad(!block->page.zip.data);
5347 byte *b= &rec[-REC_OLD_INFO_BITS];
5348 const byte v = flag
5349 ? (*b | REC_INFO_DELETED_FLAG)
5350 : (*b & byte(~REC_INFO_DELETED_FLAG));
5351 mtr->write<1,mtr_t::MAYBE_NOP>(*block, b, v);
5352 }
5353 }
5354
5355 template void btr_rec_set_deleted<false>(buf_block_t *, rec_t *, mtr_t *);
5356 template void btr_rec_set_deleted<true>(buf_block_t *, rec_t *, mtr_t *);
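
/* A minimal usage sketch of the template above; e.g.
btr_cur_del_mark_set_clust_rec() below uses the <true> instantiation,
while the <false> instantiation removes the mark again (for example
when a delete-mark is rolled back):

	btr_rec_set_deleted<true>(block, rec, mtr);	// delete-mark rec
	btr_rec_set_deleted<false>(block, rec, mtr);	// clear the mark
*/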

/***********************************************************//**
Marks a clustered index record deleted. Writes an undo log record for
the delete marking. Writes the id of the deleting transaction in the
trx id field, and a pointer to the created undo log record in the
roll ptr field.
@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
dberr_t
btr_cur_del_mark_set_clust_rec(
/*===========================*/
	buf_block_t*	block,	/*!< in/out: buffer block of the record */
	rec_t*		rec,	/*!< in/out: record */
	dict_index_t*	index,	/*!< in: clustered index of the record */
	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec) */
	que_thr_t*	thr,	/*!< in: query thread */
	const dtuple_t*	entry,	/*!< in: dtuple for the deleting record, also
				contains the virtual cols if there are any */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	roll_ptr_t	roll_ptr;
	dberr_t		err;
	trx_t*		trx;

	ut_ad(dict_index_is_clust(index));
	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
	ut_ad(buf_block_get_frame(block) == page_align(rec));
	ut_ad(page_rec_is_leaf(rec));
	ut_ad(mtr->is_named_space(index->table->space));

	if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
		/* We may already have delete-marked this record
		when executing an ON DELETE CASCADE operation. */
		ut_ad(row_get_rec_trx_id(rec, index, offsets)
		      == thr_get_trx(thr)->id);
		return(DB_SUCCESS);
	}

	err = lock_clust_rec_modify_check_and_lock(BTR_NO_LOCKING_FLAG, block,
						   rec, index, offsets, thr);

	if (err != DB_SUCCESS) {

		return(err);
	}

	err = trx_undo_report_row_operation(thr, index,
					    entry, NULL, 0, rec, offsets,
					    &roll_ptr);
	if (err != DB_SUCCESS) {

		return(err);
	}

	/* The search latch is not needed here, because
	the adaptive hash index does not depend on the delete-mark
	and the delete-mark is being updated in place. */

	btr_rec_set_deleted<true>(block, rec, mtr);

	trx = thr_get_trx(thr);

	DBUG_LOG("ib_cur",
		 "delete-mark clust " << index->table->name
		 << " (" << index->id << ") by "
		 << ib::hex(trx_get_id_for_print(trx)) << ": "
		 << rec_printer(rec, offsets).str());

	if (dict_index_is_online_ddl(index)) {
		row_log_table_delete(rec, index, offsets, NULL);
	}

	btr_cur_upd_rec_sys(block, rec, index, offsets, trx, roll_ptr, mtr);
	return(err);
}

/*==================== B-TREE RECORD REMOVE =========================*/

/*************************************************************//**
Tries to compress a page of the tree if it seems useful. It is assumed
that mtr holds an x-latch on the tree and on the cursor page. To avoid
deadlocks, mtr must also own x-latches on the brothers of the page, if
those brothers exist. NOTE: it is assumed that the caller has reserved
enough free extents so that the compression will always succeed if done!
@return TRUE if compression occurred */
ibool
btr_cur_compress_if_useful(
/*=======================*/
	btr_cur_t*	cursor,	/*!< in/out: cursor on the page to compress;
				cursor does not stay valid if !adjust and
				compression occurs */
	ibool		adjust,	/*!< in: TRUE if should adjust the
				cursor position even if compression occurs */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	ut_ad(mtr->memo_contains_flagged(&cursor->index->lock,
					 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
	ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
					 MTR_MEMO_PAGE_X_FIX));

	if (cursor->index->is_spatial()) {
		const trx_t*	trx = cursor->rtr_info->thr
			? thr_get_trx(cursor->rtr_info->thr)
			: NULL;
		const buf_block_t* block = btr_cur_get_block(cursor);

		/* Check whether a page lock prevents the compression */
		if (!lock_test_prdt_page_lock(trx, block->page.id())) {
			return(false);
		}
	}

	return(btr_cur_compress_recommendation(cursor, mtr)
	       && btr_compress(cursor, adjust, mtr));
}

/*******************************************************//**
Removes the record on which the tree cursor is positioned on a leaf page.
It is assumed that the mtr has an x-latch on the page where the cursor is
positioned, but no latch on the whole tree.
@return TRUE if success, i.e., the page did not become too empty */
ibool
btr_cur_optimistic_delete_func(
/*===========================*/
	btr_cur_t*	cursor,	/*!< in: cursor on leaf page, on the record to
				delete; cursor stays valid: if deletion
				succeeds, on function exit it points to the
				successor of the deleted record */
#ifdef UNIV_DEBUG
	ulint		flags,	/*!< in: BTR_CREATE_FLAG or 0 */
#endif /* UNIV_DEBUG */
	mtr_t*		mtr)	/*!< in: mtr; if this function returns
				TRUE on a leaf page of a secondary
				index, the mtr must be committed
				before latching any further pages */
{
	buf_block_t*	block;
	rec_t*		rec;
	mem_heap_t*	heap		= NULL;
	rec_offs	offsets_[REC_OFFS_NORMAL_SIZE];
	rec_offs*	offsets		= offsets_;
	rec_offs_init(offsets_);

	ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
	ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
					 MTR_MEMO_PAGE_X_FIX));
	ut_ad(mtr->is_named_space(cursor->index->table->space));
	ut_ad(!cursor->index->is_dummy);

	/* This is intended only for leaf page deletions */

	block = btr_cur_get_block(cursor);

	ut_ad(block->page.id().space() == cursor->index->table->space->id);
	ut_ad(page_is_leaf(buf_block_get_frame(block)));
	ut_ad(!dict_index_is_online_ddl(cursor->index)
	      || dict_index_is_clust(cursor->index)
	      || (flags & BTR_CREATE_FLAG));

	rec = btr_cur_get_rec(cursor);

	offsets = rec_get_offsets(rec, cursor->index, offsets,
				  cursor->index->n_core_fields,
				  ULINT_UNDEFINED, &heap);

	const ibool no_compress_needed = !rec_offs_any_extern(offsets)
		&& btr_cur_can_delete_without_compress(
			cursor, rec_offs_size(offsets), mtr);

	if (!no_compress_needed) {
		/* Prefetch the siblings of the leaf for the pessimistic
		operation. */
		btr_cur_prefetch_siblings(block, cursor->index);
		goto func_exit;
	}

	if (UNIV_UNLIKELY(block->page.id().page_no() == cursor->index->page
			  && page_get_n_recs(block->frame) == 1
			  + (cursor->index->is_instant()
			     && !rec_is_metadata(rec, *cursor->index))
			  && !cursor->index
			  ->must_avoid_clear_instant_add())) {
		/* The whole index (and table) becomes logically empty.
		Empty the whole page. That is, if we are deleting the
		only user record, also delete the metadata record
		if one exists for instant ADD COLUMN (not generic ALTER TABLE).
		If we are deleting the metadata record and the
		table becomes empty, clean up the whole page. */
		dict_index_t* index = cursor->index;
		const rec_t* first_rec = page_rec_get_next_const(
			page_get_infimum_rec(block->frame));
		ut_ad(!index->is_instant()
		      || rec_is_metadata(first_rec, *index));
		const bool is_metadata = rec_is_metadata(rec, *index);
		/* We can remove the metadata when rolling back an
		instant ALTER TABLE operation, or when deleting the
		last user record on the page such that only metadata for
		instant ADD COLUMN (not generic ALTER TABLE) remains. */
		const bool empty_table = is_metadata
			|| !index->is_instant()
			|| (first_rec != rec
			    && rec_is_add_metadata(first_rec, *index));
		if (UNIV_LIKELY(empty_table)) {
			if (UNIV_LIKELY(!is_metadata)) {
				lock_update_delete(block, rec);
			}
			btr_page_empty(block, buf_block_get_page_zip(block),
				       index, 0, mtr);
			if (index->is_instant()) {
				/* MDEV-17383: free metadata BLOBs! */
				index->clear_instant_alter();
			}
			page_cur_set_after_last(block,
						btr_cur_get_page_cur(cursor));
			goto func_exit;
		}
	}

	{
		page_t*		page = buf_block_get_frame(block);
		page_zip_des_t*	page_zip = buf_block_get_page_zip(block);

		if (UNIV_UNLIKELY(rec_get_info_bits(rec, page_rec_is_comp(rec))
				  & REC_INFO_MIN_REC_FLAG)) {
			/* This should be rolling back instant ADD COLUMN.
			If this is a recovered transaction, then
			index->is_instant() will hold until the
			insert into SYS_COLUMNS is rolled back. */
			ut_ad(cursor->index->table->supports_instant());
			ut_ad(cursor->index->is_primary());
			ut_ad(!page_zip);
			page_cur_delete_rec(btr_cur_get_page_cur(cursor),
					    cursor->index, offsets, mtr);
			/* We must empty the PAGE_FREE list, because
			after rollback, this deleted metadata record
			would have too many fields, and we would be
			unable to know the size of the freed record. */
			btr_page_reorganize(btr_cur_get_page_cur(cursor),
					    cursor->index, mtr);
			goto func_exit;
		} else {
			lock_update_delete(block, rec);

			btr_search_update_hash_on_delete(cursor);
		}

		if (page_zip) {
#ifdef UNIV_ZIP_DEBUG
			ut_a(page_zip_validate(page_zip, page, cursor->index));
#endif /* UNIV_ZIP_DEBUG */
			page_cur_delete_rec(btr_cur_get_page_cur(cursor),
					    cursor->index, offsets, mtr);
#ifdef UNIV_ZIP_DEBUG
			ut_a(page_zip_validate(page_zip, page, cursor->index));
#endif /* UNIV_ZIP_DEBUG */

			/* On compressed pages, the IBUF_BITMAP_FREE
			space is not affected by deleting (purging)
			records, because it is defined as the minimum
			of space available *without* reorganize, and
			space available in the modification log. */
		} else {
			const ulint	max_ins
				= page_get_max_insert_size_after_reorganize(
					page, 1);

			page_cur_delete_rec(btr_cur_get_page_cur(cursor),
					    cursor->index, offsets, mtr);

			/* The change buffer does not handle inserts
			into non-leaf pages, into clustered indexes,
			or into the change buffer. */
			if (!dict_index_is_clust(cursor->index)
			    && !cursor->index->table->is_temporary()
			    && !dict_index_is_ibuf(cursor->index)) {
				ibuf_update_free_bits_low(block, max_ins, mtr);
			}
		}
	}

func_exit:
	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}

	return(no_compress_needed);
}

/*************************************************************//**
Removes the record on which the tree cursor is positioned. Tries
to compress the page if its fillfactor drops below a threshold
or if it is the only page on the level. It is assumed that mtr holds
an x-latch on the tree and on the cursor page. To avoid deadlocks,
mtr must also own x-latches on the brothers of the page, if those
brothers exist.
@return TRUE if compression occurred, FALSE if not or if something
went wrong */
ibool
btr_cur_pessimistic_delete(
/*=======================*/
	dberr_t*	err,	/*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
				the latter may occur because we may have
				to update node pointers on upper levels,
				and in the case of variable length keys
				these may actually grow in size */
	ibool		has_reserved_extents, /*!< in: TRUE if the
				caller has already reserved enough free
				extents so that the operation is known
				to succeed */
	btr_cur_t*	cursor,	/*!< in: cursor on the record to delete;
				if compression does not occur, the cursor
				stays valid: it points to the successor of
				the deleted record on function exit */
	ulint		flags,	/*!< in: BTR_CREATE_FLAG or 0 */
	bool		rollback,/*!< in: performing rollback? */
	mtr_t*		mtr)	/*!< in: mtr */
{
	buf_block_t*	block;
	page_t*		page;
	page_zip_des_t*	page_zip;
	dict_index_t*	index;
	rec_t*		rec;
	uint32_t	n_reserved	= 0;
	bool		success;
	ibool		ret		= FALSE;
	mem_heap_t*	heap;
	rec_offs*	offsets;
#ifdef UNIV_DEBUG
	bool		parent_latched	= false;
#endif /* UNIV_DEBUG */

	block = btr_cur_get_block(cursor);
	page = buf_block_get_frame(block);
	index = btr_cur_get_index(cursor);

	ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
	ut_ad(!dict_index_is_online_ddl(index)
	      || dict_index_is_clust(index)
	      || (flags & BTR_CREATE_FLAG));
	ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
					 | MTR_MEMO_SX_LOCK));
	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
	ut_ad(mtr->is_named_space(index->table->space));
	ut_ad(!index->is_dummy);
	ut_ad(block->page.id().space() == index->table->space->id);

	if (!has_reserved_extents) {
		/* First reserve enough free space for the file segments
		of the index tree, so that the node pointer updates will
		not fail because of lack of space */

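		/* A hedged example of the heuristic below: with
		cursor->tree_height == 5, uint32_t(5 / 32 + 1) == 1
		extent is reserved; again the constants are tuning
		heuristics rather than derived bounds. */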
		uint32_t n_extents = uint32_t(cursor->tree_height / 32 + 1);

		success = fsp_reserve_free_extents(&n_reserved,
						   index->table->space,
						   n_extents,
						   FSP_CLEANING, mtr);
		if (!success) {
			*err = DB_OUT_OF_FILE_SPACE;

			return(FALSE);
		}
	}

	heap = mem_heap_create(1024);
	rec = btr_cur_get_rec(cursor);
	page_zip = buf_block_get_page_zip(block);
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */

	offsets = rec_get_offsets(rec, index, NULL, page_is_leaf(page)
				  ? index->n_core_fields : 0,
				  ULINT_UNDEFINED, &heap);

	if (rec_offs_any_extern(offsets)) {
		btr_rec_free_externally_stored_fields(index,
						      rec, offsets, block,
						      rollback, mtr);
#ifdef UNIV_ZIP_DEBUG
		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
	}

	rec_t* next_rec = NULL;
	bool min_mark_next_rec = false;

	if (page_is_leaf(page)) {
		const bool is_metadata = rec_is_metadata(
			rec, page_rec_is_comp(rec));
		if (UNIV_UNLIKELY(is_metadata)) {
			/* This should be rolling back instant ALTER TABLE.
			If this is a recovered transaction, then
			index->is_instant() will hold until the
			insert into SYS_COLUMNS is rolled back. */
			ut_ad(rollback);
			ut_ad(index->table->supports_instant());
			ut_ad(index->is_primary());
		} else if (flags == 0) {
			lock_update_delete(block, rec);
		}

		if (block->page.id().page_no() != index->page) {
			if (page_get_n_recs(page) < 2) {
				goto discard_page;
			}
		} else if (page_get_n_recs(page) == 1
			   + (index->is_instant() && !is_metadata)
			   && !index->must_avoid_clear_instant_add()) {
			/* The whole index (and table) becomes logically empty.
			Empty the whole page. That is, if we are deleting the
			only user record, also delete the metadata record
			if one exists for instant ADD COLUMN
			(not generic ALTER TABLE).
			If we are deleting the metadata record
			(in the rollback of instant ALTER TABLE) and the
			table becomes empty, clean up the whole page. */

			const rec_t* first_rec = page_rec_get_next_const(
				page_get_infimum_rec(page));
			ut_ad(!index->is_instant()
			      || rec_is_metadata(first_rec, *index));
			if (is_metadata || !index->is_instant()
			    || (first_rec != rec
				&& rec_is_add_metadata(first_rec, *index))) {
				btr_page_empty(block, page_zip, index, 0, mtr);
				if (index->is_instant()) {
					/* MDEV-17383: free metadata BLOBs! */
					index->clear_instant_alter();
				}
				page_cur_set_after_last(
					block,
					btr_cur_get_page_cur(cursor));
				ret = TRUE;
				goto return_after_reservations;
			}
		}

		if (UNIV_LIKELY(!is_metadata)) {
			btr_search_update_hash_on_delete(cursor);
		} else {
			page_cur_delete_rec(btr_cur_get_page_cur(cursor),
					    index, offsets, mtr);
			/* We must empty the PAGE_FREE list, because
			after rollback, this deleted metadata record
			would carry too many fields, and we would be
			unable to know the size of the freed record. */
			btr_page_reorganize(btr_cur_get_page_cur(cursor),
					    index, mtr);
			ut_ad(!ret);
			goto return_after_reservations;
		}
	} else if (UNIV_UNLIKELY(page_rec_is_first(rec, page))) {
		if (page_rec_is_last(rec, page)) {
discard_page:
			ut_ad(page_get_n_recs(page) == 1);
			/* If there is only one record, drop
			the whole page. */

			btr_discard_page(cursor, mtr);

			ret = TRUE;
			goto return_after_reservations;
		}

		next_rec = page_rec_get_next(rec);

		if (!page_has_prev(page)) {
			/* If we delete the leftmost node pointer on a
			non-leaf level, we must mark the new leftmost node
			pointer as the predefined minimum record */

			min_mark_next_rec = true;
		} else if (index->is_spatial()) {
			/* For an R-tree, if we delete the leftmost node
			pointer, we need to update the parent page. */
			rtr_mbr_t	father_mbr;
			rec_t*		father_rec;
			btr_cur_t	father_cursor;
			rec_offs*	offsets;
			bool		upd_ret;
			ulint		len;

			rtr_page_get_father_block(NULL, heap, index,
						  block, mtr, NULL,
						  &father_cursor);
			offsets = rec_get_offsets(
				btr_cur_get_rec(&father_cursor), index, NULL,
				0, ULINT_UNDEFINED, &heap);

			father_rec = btr_cur_get_rec(&father_cursor);
			rtr_read_mbr(rec_get_nth_field(
				father_rec, offsets, 0, &len), &father_mbr);

			upd_ret = rtr_update_mbr_field(&father_cursor, offsets,
						       NULL, page, &father_mbr,
						       next_rec, mtr);

			if (!upd_ret) {
				*err = DB_ERROR;

				mem_heap_free(heap);
				return(FALSE);
			}

			ut_d(parent_latched = true);
		} else {
			/* Otherwise, if we delete the leftmost node pointer
			on a page, we have to change the parent node pointer
			so that it is equal to the new leftmost node pointer
			on the page */
			btr_cur_t	cursor;
			btr_page_get_father(index, block, mtr, &cursor);
			btr_cur_node_ptr_delete(&cursor, mtr);
			const ulint	level = btr_page_get_level(page);
			// FIXME: reuse the node_ptr from above
			dtuple_t*	node_ptr = dict_index_build_node_ptr(
				index, next_rec, block->page.id().page_no(),
				heap, level);

			btr_insert_on_non_leaf_level(
				flags, index, level + 1, node_ptr, mtr);

			ut_d(parent_latched = true);
		}
	}

	/* A SPATIAL INDEX never uses SX locks; we can allow page merges
	while holding an X lock on the spatial index tree.
	Do not allow merges of non-leaf B-tree pages unless it is
	safe to do so. */
	{
		const bool allow_merge = page_is_leaf(page)
			|| dict_index_is_spatial(index)
			|| btr_cur_will_modify_tree(
				index, page, BTR_INTENTION_DELETE, rec,
				btr_node_ptr_max_size(index),
				block->zip_size(), mtr);
		page_cur_delete_rec(btr_cur_get_page_cur(cursor), index,
				    offsets, mtr);

		if (min_mark_next_rec) {
			btr_set_min_rec_mark(next_rec, *block, mtr);
		}

#ifdef UNIV_ZIP_DEBUG
		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */

		ut_ad(!parent_latched
		      || btr_check_node_ptr(index, block, mtr));

		if (!ret && btr_cur_compress_recommendation(cursor, mtr)) {
			if (UNIV_LIKELY(allow_merge)) {
				ret = btr_cur_compress_if_useful(
					cursor, FALSE, mtr);
			} else {
				ib::warn() << "Not merging page "
					   << block->page.id()
					   << " in index " << index->name
					   << " of " << index->table->name;
				ut_ad("MDEV-14637" == 0);
			}
		}
	}

return_after_reservations:
	*err = DB_SUCCESS;

	mem_heap_free(heap);

	if (!srv_read_only_mode
	    && page_is_leaf(page)
	    && !dict_index_is_online_ddl(index)) {

		mtr_memo_release(mtr, dict_index_get_lock(index),
				 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);

		/* NOTE: We cannot release the root block latch here,
		because it contains the segment header and has already
		been modified in most cases. */
	}

	index->table->space->release_free_extents(n_reserved);
	return(ret);
}

/** Delete the node pointer in a parent page.
@param[in,out]	parent	cursor pointing to parent record
@param[in,out]	mtr	mini-transaction */
void btr_cur_node_ptr_delete(btr_cur_t* parent, mtr_t* mtr)
{
	ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(parent),
					 MTR_MEMO_PAGE_X_FIX));
	dberr_t err;
	ibool compressed = btr_cur_pessimistic_delete(&err, TRUE, parent,
						      BTR_CREATE_FLAG, false,
						      mtr);
	ut_a(err == DB_SUCCESS);
	if (!compressed) {
		btr_cur_compress_if_useful(parent, FALSE, mtr);
	}
}

/*******************************************************************//**
Adds path information to the cursor for the current page, for which
the binary search has been performed. */
static
void
btr_cur_add_path_info(
/*==================*/
	btr_cur_t*	cursor,		/*!< in: cursor positioned on a page */
	ulint		height,		/*!< in: height of the page in tree;
					0 means leaf node */
	ulint		root_height)	/*!< in: root node height in tree */
{
	btr_path_t*	slot;

	ut_a(cursor->path_arr);

	if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) {
		/* Do nothing; return empty path */

		slot = cursor->path_arr;
		slot->nth_rec = ULINT_UNDEFINED;

		return;
	}

	if (height == 0) {
		/* Mark end of slots for path */
		slot = cursor->path_arr + root_height + 1;
		slot->nth_rec = ULINT_UNDEFINED;
	}

	slot = cursor->path_arr + (root_height - height);

	const buf_block_t* block = btr_cur_get_block(cursor);

	slot->nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor));
	slot->n_recs = page_get_n_recs(block->frame);
	slot->page_no = block->page.id().page_no();
	slot->page_level = btr_page_get_level(block->frame);
}

/*******************************************************************//**
Estimate the number of rows between slot1 and slot2 for any level on a
B-tree. This function starts from slot1->page and reads a few pages to
the right, counting their records. If we reach slot2->page quickly then
we know exactly how many records there are between slot1 and slot2 and
we set is_n_rows_exact to TRUE. If we cannot reach slot2->page quickly
then we calculate the average number of records in the pages scanned
so far and assume that all pages that we did not scan up to slot2->page
contain the same number of records, then we multiply that average by
the number of pages between slot1->page and slot2->page (which is
n_rows_on_prev_level). In this case we set is_n_rows_exact to FALSE.
@return number of rows, not including the borders (exact or estimated) */
static
ha_rows
btr_estimate_n_rows_in_range_on_level(
/*==================================*/
	dict_index_t*	index,			/*!< in: index */
	btr_path_t*	slot1,			/*!< in: left border */
	btr_path_t*	slot2,			/*!< in: right border */
	ha_rows		n_rows_on_prev_level,	/*!< in: number of rows
						on the previous level for the
						same descent paths; used to
						determine the number of pages
						on this level */
	bool*		is_n_rows_exact)	/*!< out: TRUE if the returned
						value is exact i.e. not an
						estimation */
{
	ha_rows		n_rows = 0;
	uint		n_pages_read = 0;
	ulint		level;

	/* Assume by default that we will scan all pages between
	slot1->page_no and slot2->page_no. */
	*is_n_rows_exact = true;

	/* Add records from slot1->page_no which are to the right of
	the record which serves as a left border of the range, if any
	(we don't include the record itself in this count). */
	if (slot1->nth_rec <= slot1->n_recs) {
		n_rows += slot1->n_recs - slot1->nth_rec;
	}

	/* Add records from slot2->page_no which are to the left of
	the record which serves as a right border of the range, if any
	(we don't include the record itself in this count). */
	if (slot2->nth_rec > 1) {
		n_rows += slot2->nth_rec - 1;
	}

	/* Count the records in the pages between slot1->page_no and
	slot2->page_no (non inclusive), if any. */

	/* Do not read more than this number of pages in order not to hurt
	performance with this code which is just an estimation. If we read
	this many pages before reaching slot2->page_no then we estimate the
	average from the pages scanned so far. */
# define N_PAGES_READ_LIMIT	10
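
	/* For example, with the limit of 10 above, a range spanning
	25 pages on this level is scanned only for its first 10 pages;
	the rest is extrapolated below from the average number of
	records per page seen so far. */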

	const fil_space_t*	space = index->table->space;
	page_id_t		page_id(space->id, slot1->page_no);
	const ulint		zip_size = space->zip_size();

	level = slot1->page_level;

	do {
		mtr_t		mtr;
		page_t*		page;
		buf_block_t*	block;
		dberr_t		err = DB_SUCCESS;

		mtr_start(&mtr);

		/* Fetch the page. Because we are not holding the
		index->lock, the tree may have changed and we may be
		attempting to read a page that is no longer part of
		the B-tree. We pass BUF_GET_POSSIBLY_FREED in order to
		silence a debug assertion about this. */
		block = buf_page_get_gen(page_id, zip_size, RW_S_LATCH,
					 NULL, BUF_GET_POSSIBLY_FREED,
					 __FILE__, __LINE__, &mtr, &err);

		ut_ad((block != NULL) == (err == DB_SUCCESS));

		if (!block) {
			if (err == DB_DECRYPTION_FAILED) {
				ib_push_warning((void *)NULL,
						DB_DECRYPTION_FAILED,
						"Table %s is encrypted but encryption service or"
						" used key_id is not available. "
						" Can't continue reading table.",
						index->table->name.m_name);
				index->table->file_unreadable = true;
			}

			mtr_commit(&mtr);
			goto inexact;
		}

		page = buf_block_get_frame(block);

		/* It is possible that the tree has been reorganized in the
		meantime and this is a different page. If this happens the
		calculated estimate will be bogus, which is not fatal as
		this is only an estimate. We are sure that a page with
		page_no exists because InnoDB never frees pages, only
		reuses them. */
		if (!fil_page_index_page_check(page)
		    || btr_page_get_index_id(page) != index->id
		    || btr_page_get_level(page) != level) {

			/* The page got reused for something else */
			mtr_commit(&mtr);
			goto inexact;
		}

		/* It is possible but highly unlikely that the page was
		originally written by an old version of InnoDB that did
		not initialize FIL_PAGE_TYPE on other than B-tree pages.
		For example, this could be an almost-empty BLOB page
		that happens to contain the magic values in the fields
		that we checked above. */

		n_pages_read++;

		if (page_id.page_no() != slot1->page_no) {
			/* Do not count the records on slot1->page_no,
			we already counted them before this loop. */
			n_rows += page_get_n_recs(page);
		}

		page_id.set_page_no(btr_page_get_next(page));

		mtr_commit(&mtr);

		if (n_pages_read == N_PAGES_READ_LIMIT
		    || page_id.page_no() == FIL_NULL) {
			/* Either we read too many pages or
			we reached the end of the level without passing
			through slot2->page_no; the tree must have changed
			in the meantime. */
			goto inexact;
		}

	} while (page_id.page_no() != slot2->page_no);

	return(n_rows);

inexact:

	*is_n_rows_exact = false;

	/* We interrupted the scan before reaching slot2->page_no */

	if (n_pages_read > 0) {
		/* The number of pages on this level is
		n_rows_on_prev_level; multiply it by the
		average number of recs per page so far */
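		/* For example, if 3 pages were read containing 300
		records in total, and this level is believed to hold
		n_rows_on_prev_level == 12 pages, the estimate becomes
		12 * 300 / 3 = 1200 rows. */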
		n_rows = n_rows_on_prev_level * n_rows / n_pages_read;
	} else {
		/* The tree changed before we could even
		start with slot1->page_no */
		n_rows = 10;
	}

	return(n_rows);
}

/** If the tree gets changed too much between the two dives for the left
and right boundary then btr_estimate_n_rows_in_range_low() will retry
that many times before giving up and returning the value stored in
rows_in_range_arbitrary_ret_val. */
static const unsigned	rows_in_range_max_retries = 4;

/** We pretend that a range has that many records if the tree keeps changing
for rows_in_range_max_retries retries while we try to estimate the records
in a given range. */
static const ha_rows	rows_in_range_arbitrary_ret_val = 10;

/** Estimates the number of rows in a given index range.
@param[in]	index		index
@param[in]	tuple1		range start
@param[in]	tuple2		range end
@param[in]	nth_attempt	if the tree gets modified too much while
we are trying to analyze it, then we will retry (this function will call
itself, incrementing this parameter)
@return estimated number of rows; if after rows_in_range_max_retries
retries the tree keeps changing, then we will just return
rows_in_range_arbitrary_ret_val as a result (if
nth_attempt >= rows_in_range_max_retries and the tree is modified between
the two dives). */
static
ha_rows
btr_estimate_n_rows_in_range_low(
	dict_index_t*	index,
	btr_pos_t*	tuple1,
	btr_pos_t*	tuple2,
	unsigned	nth_attempt)
{
	btr_path_t	path1[BTR_PATH_ARRAY_N_SLOTS];
	btr_path_t	path2[BTR_PATH_ARRAY_N_SLOTS];
	btr_cur_t	cursor;
	btr_path_t*	slot1;
	btr_path_t*	slot2;
	bool		diverged;
	bool		diverged_lot;
	ulint		divergence_level;
	ha_rows		n_rows;
	bool		is_n_rows_exact;
	ulint		i;
	mtr_t		mtr;
	ha_rows		table_n_rows;
	page_cur_mode_t	mode2 = tuple2->mode;

	table_n_rows = dict_table_get_n_rows(index->table);

	/* Below we dive to the two records specified by tuple1 and tuple2 and
	we remember the entire dive paths from the tree root. The place where
	the tuple1 path ends on the leaf level we call the "left border" of our
	interval and the place where the tuple2 path ends on the leaf level -
	the "right border". We take care to either include or exclude the
	interval boundaries depending on whether <, <=, > or >= was specified.
	For example if "5 < x AND x <= 10" then we should not include the left
	boundary, but should include the right one. */

	mtr_start(&mtr);

	cursor.path_arr = path1;

	bool	should_count_the_left_border;

	if (dtuple_get_n_fields(tuple1->tuple) > 0) {

		btr_cur_search_to_nth_level(index, 0, tuple1->tuple,
					    tuple1->mode,
					    BTR_SEARCH_LEAF | BTR_ESTIMATE,
					    &cursor, 0,
					    __FILE__, __LINE__, &mtr);

		ut_ad(!page_rec_is_infimum(btr_cur_get_rec(&cursor)));

		/* We should count the border if there are any records to
		match the criteria, i.e. if the maximum record on the tree is
		5 and x > 3 is specified then the cursor will be positioned at
		5 and we should count the border, but if x > 7 is specified,
		then the cursor will be positioned at 'sup' on the rightmost
		leaf page in the tree and we should not count the border. */
		should_count_the_left_border
			= !page_rec_is_supremum(btr_cur_get_rec(&cursor));
	} else {
		dberr_t err = DB_SUCCESS;

		err = btr_cur_open_at_index_side(true, index,
						 BTR_SEARCH_LEAF | BTR_ESTIMATE,
						 &cursor, 0, &mtr);

		if (err != DB_SUCCESS) {
			ib::warn() << " Error code: " << err
				   << " btr_estimate_n_rows_in_range_low "
				   << " called from file: "
				   << __FILE__ << " line: " << __LINE__
				   << " table: " << index->table->name
				   << " index: " << index->name;
		}

		ut_ad(page_rec_is_infimum(btr_cur_get_rec(&cursor)));

		/* The range specified is without a left border, just
		'x < 123' or 'x <= 123' and btr_cur_open_at_index_side()
		positioned the cursor on the infimum record on the leftmost
		page, which must not be counted. */
		should_count_the_left_border = false;
	}

	tuple1->page_id = cursor.page_cur.block->page.id();

	mtr_commit(&mtr);

	if (!index->is_readable()) {
		return 0;
	}

	mtr_start(&mtr);

	cursor.path_arr = path2;

	bool	should_count_the_right_border;

	if (dtuple_get_n_fields(tuple2->tuple) > 0) {

		btr_cur_search_to_nth_level(index, 0, tuple2->tuple,
					    mode2,
					    BTR_SEARCH_LEAF | BTR_ESTIMATE,
					    &cursor, 0,
					    __FILE__, __LINE__, &mtr);

		const rec_t*	rec = btr_cur_get_rec(&cursor);

		ut_ad(!(mode2 == PAGE_CUR_L && page_rec_is_supremum(rec)));

		should_count_the_right_border
			= (mode2 == PAGE_CUR_LE /* if the range is '<=' */
			   /* and the record was found */
			   && cursor.low_match
			   >= dtuple_get_n_fields(tuple2->tuple))
			|| (mode2 == PAGE_CUR_L /* or if the range is '<' */
			    /* and there are any records to match the criteria,
			    i.e. if the minimum record on the tree is 5 and
			    x < 7 is specified then the cursor will be
			    positioned at 5 and we should count the border, but
			    if x < 2 is specified, then the cursor will be
			    positioned at 'inf' and we should not count the
			    border */
			    && !page_rec_is_infimum(rec));
		/* Notice that for "WHERE col <= 'foo'" MySQL passes to
		ha_innobase::records_in_range():
		min_key=NULL (left-unbounded) which is expected
		max_key='foo' flag=HA_READ_AFTER_KEY (PAGE_CUR_G), which is
		unexpected - one would expect
		flag=HA_READ_KEY_OR_PREV (PAGE_CUR_LE). In this case the
		cursor will be positioned on the first record to the right of
		the requested one (can also be positioned on the 'sup') and
		we should not count the right border. */
	} else {
		dberr_t err = DB_SUCCESS;

		err = btr_cur_open_at_index_side(false, index,
						 BTR_SEARCH_LEAF | BTR_ESTIMATE,
						 &cursor, 0, &mtr);

		if (err != DB_SUCCESS) {
			ib::warn() << " Error code: " << err
				   << " btr_estimate_n_rows_in_range_low "
				   << " called from file: "
				   << __FILE__ << " line: " << __LINE__
				   << " table: " << index->table->name
				   << " index: " << index->name;
		}

		ut_ad(page_rec_is_supremum(btr_cur_get_rec(&cursor)));

		/* The range specified is without a right border, just
		'x > 123' or 'x >= 123' and btr_cur_open_at_index_side()
		positioned the cursor on the supremum record on the rightmost
		page, which must not be counted. */
		should_count_the_right_border = false;
	}

	tuple2->page_id = cursor.page_cur.block->page.id();

	mtr_commit(&mtr);

	/* We have the path information for the range in path1 and path2 */

	n_rows = 0;
	is_n_rows_exact = true;

	/* This becomes true when the two paths no longer pass through
	the same pages. */
	diverged = false;

	/* This becomes true when the paths are not the same or adjacent
	any more. Until then, they pass through the same or
	neighboring-on-the-same-level pages only. */
	diverged_lot = false;

	/* This is the level where the paths diverged a lot. */
	divergence_level = 1000000;

	for (i = 0; ; i++) {
		ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);

		slot1 = path1 + i;
		slot2 = path2 + i;

		if (slot1->nth_rec == ULINT_UNDEFINED
		    || slot2->nth_rec == ULINT_UNDEFINED) {

			/* Here none of the borders were counted. For example,
			if on the leaf level we descended to:
			(inf, a, b, c, d, e, f, sup)
			         ^           ^
			       path1       path2
			then n_rows will be 2 (c and d). */

			if (is_n_rows_exact) {
				/* Only fiddle to adjust this off-by-one
				if the number is exact, otherwise we do
				much grosser adjustments below. */

				btr_path_t*	last1 = &path1[i - 1];
				btr_path_t*	last2 = &path2[i - 1];

				/* If both paths end up on the same record on
				the leaf level. */
				if (last1->page_no == last2->page_no
				    && last1->nth_rec == last2->nth_rec) {

					/* n_rows can be > 0 here if the paths
					were first different and then converged
					to the same record on the leaf level.
					For example:
					SELECT ... LIKE 'wait/synch/rwlock%'
					mode1=PAGE_CUR_GE,
					tuple1="wait/synch/rwlock"
					path1[0]={nth_rec=58, n_recs=58,
						  page_no=3, page_level=1}
					path1[1]={nth_rec=56, n_recs=55,
						  page_no=119, page_level=0}

					mode2=PAGE_CUR_G
					tuple2="wait/synch/rwlock"
					path2[0]={nth_rec=57, n_recs=57,
						  page_no=3, page_level=1}
					path2[1]={nth_rec=56, n_recs=55,
						  page_no=119, page_level=0} */

					/* If the range is such that we should
					count both borders, then avoid
					counting that record twice - once as a
					left border and once as a right
					border. */
					if (should_count_the_left_border
					    && should_count_the_right_border) {

						n_rows = 1;
					} else {
						/* Some of the borders should
						not be counted, e.g. [3,3). */
						n_rows = 0;
					}
				} else {
					if (should_count_the_left_border) {
						n_rows++;
					}

					if (should_count_the_right_border) {
						n_rows++;
					}
				}
			}

			if (i > divergence_level + 1 && !is_n_rows_exact) {
				/* In trees whose height is > 1 our algorithm
				tends to underestimate: multiply the estimate
				by 2: */

				n_rows = n_rows * 2;
			}

			DBUG_EXECUTE_IF("bug14007649", return(n_rows););

			/* Do not estimate the number of rows in the range
			to be more than 1 / 2 of the estimated rows in the
			whole table */

			if (n_rows > table_n_rows / 2 && !is_n_rows_exact) {

				n_rows = table_n_rows / 2;

				/* If there are just 0 or 1 rows in the table,
				then we estimate all rows are in the range */

				if (n_rows == 0) {
					n_rows = table_n_rows;
				}
			}

			return(n_rows);
		}

		if (!diverged && slot1->nth_rec != slot2->nth_rec) {

			/* If both slots do not point to the same page,
			this means that the tree must have changed between
			the dive for slot1 and the dive for slot2 at the
			beginning of this function. */
			if (slot1->page_no != slot2->page_no
			    || slot1->page_level != slot2->page_level) {

				/* If the tree keeps changing even after a
				few attempts, then just return some arbitrary
				number. */
				if (nth_attempt >= rows_in_range_max_retries) {
					return(rows_in_range_arbitrary_ret_val);
				}

				return btr_estimate_n_rows_in_range_low(
					index, tuple1, tuple2,
					nth_attempt + 1);
			}

			diverged = true;

			if (slot1->nth_rec < slot2->nth_rec) {
				/* We do not count the borders (neither the
				left nor the right one), thus "- 1". */
				n_rows = slot2->nth_rec - slot1->nth_rec - 1;

				if (n_rows > 0) {
					/* There is at least one row between
					the two borders pointed to by slot1
					and slot2, so on the level below the
					slots will point to non-adjacent
					pages. */
					diverged_lot = true;
					divergence_level = i;
				}
			} else {
				/* It is possible that
				slot1->nth_rec >= slot2->nth_rec
				if, for example, we have a single page
				tree which contains (inf, 5, 6, supr)
				and we select where x > 20 and x < 30;
				in this case slot1->nth_rec will point
				to the supr record and slot2->nth_rec
				will point to 6. */
				n_rows = 0;
				should_count_the_left_border = false;
				should_count_the_right_border = false;
			}

		} else if (diverged && !diverged_lot) {

			if (slot1->nth_rec < slot1->n_recs
			    || slot2->nth_rec > 1) {

				diverged_lot = true;
				divergence_level = i;

				n_rows = 0;

				if (slot1->nth_rec < slot1->n_recs) {
					n_rows += slot1->n_recs
						- slot1->nth_rec;
				}

				if (slot2->nth_rec > 1) {
					n_rows += slot2->nth_rec - 1;
				}
			}
		} else if (diverged_lot) {

			n_rows = btr_estimate_n_rows_in_range_on_level(
				index, slot1, slot2, n_rows,
				&is_n_rows_exact);
		}
	}
}

/** Estimates the number of rows in a given index range.
@param[in]	index	index
@param[in]	tuple1	range start, may also be an empty tuple
@param[in]	tuple2	range end, may also be an empty tuple
@return estimated number of rows */
ha_rows
btr_estimate_n_rows_in_range(
	dict_index_t*	index,
	btr_pos_t*	tuple1,
	btr_pos_t*	tuple2)
{
	return btr_estimate_n_rows_in_range_low(
		index, tuple1, tuple2, 1);
}

/*******************************************************************//**
Record the number of non_null key values in a given index for
each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
The estimates are eventually stored in the array:
index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */
static
void
btr_record_not_null_field_in_rec(
/*=============================*/
	ulint		n_unique,	/*!< in: dict_index_get_n_unique(index),
					number of columns that uniquely
					determine an index entry */
	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec, index),
					its size could be for all fields or
					that of "n_unique" */
	ib_uint64_t*	n_not_null)	/*!< in/out: array to record number of
					not null rows for n-column prefix */
{
	ulint	i;

	ut_ad(rec_offs_n_fields(offsets) >= n_unique);

	if (n_not_null == NULL) {
		return;
	}

	for (i = 0; i < n_unique; i++) {
		if (rec_offs_nth_sql_null(offsets, i)) {
			break;
		}

		n_not_null[i]++;
	}
}

/** Estimates the number of different key values in a given index, for
each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
0..n_uniq-1) and the number of pages that were sampled is saved in
result.n_sample_sizes[].
If innodb_stats_method is nulls_ignored, we also record the number of
non-null values for each prefix and store the estimates in the
array result.n_non_null_key_vals.
@param[in]	index	index
@return vector with statistics information;
empty vector if the index is unavailable. */
std::vector<index_field_stats_t>
btr_estimate_number_of_different_key_vals(dict_index_t* index)
{
	btr_cur_t	cursor;
	page_t*		page;
	rec_t*		rec;
	ulint		n_cols;
	ib_uint64_t*	n_diff;
	ib_uint64_t*	n_not_null;
	ibool		stats_null_not_equal;
	uintmax_t	n_sample_pages = 1; /* number of pages to sample */
	ulint		not_empty_flag	= 0;
	ulint		total_external_size = 0;
	ulint		i;
	ulint		j;
	uintmax_t	add_on;
	mtr_t		mtr;
	mem_heap_t*	heap		= NULL;
	rec_offs*	offsets_rec	= NULL;
	rec_offs*	offsets_next_rec = NULL;

	std::vector<index_field_stats_t> result;

	/* For a spatial index, no such statistics can be fetched. */
	ut_ad(!dict_index_is_spatial(index));

	n_cols = dict_index_get_n_unique(index);

	heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
			       * n_cols
			       + dict_index_get_n_fields(index)
			       * (sizeof *offsets_rec
				  + sizeof *offsets_next_rec));

	n_diff = (ib_uint64_t*) mem_heap_zalloc(
		heap, n_cols * sizeof(n_diff[0]));

	n_not_null = NULL;

	/* Check the srv_innodb_stats_method setting, and decide whether we
	need to record non-null values, and also decide whether NULLs are
	considered equal (by setting the stats_null_not_equal value) */
	switch (srv_innodb_stats_method) {
	case SRV_STATS_NULLS_IGNORED:
		n_not_null = (ib_uint64_t*) mem_heap_zalloc(
			heap, n_cols * sizeof *n_not_null);
		/* fall through */

	case SRV_STATS_NULLS_UNEQUAL:
		/* for both the SRV_STATS_NULLS_IGNORED and the
		SRV_STATS_NULLS_UNEQUAL case, we will treat NULLs as
		unequal values */
		stats_null_not_equal = TRUE;
		break;

	case SRV_STATS_NULLS_EQUAL:
		stats_null_not_equal = FALSE;
		break;

	default:
		ut_error;
	}

	if (srv_stats_sample_traditional) {
		/* It makes no sense to test more pages than are contained
		in the index, thus we lower the number if it is too high */
		if (srv_stats_transient_sample_pages
		    > index->stat_index_size) {
			if (index->stat_index_size > 0) {
				n_sample_pages = index->stat_index_size;
			}
		} else {
			n_sample_pages = srv_stats_transient_sample_pages;
		}
	} else {
		/* New logarithmic number of pages that are estimated.
		The number of pages estimated should be between 1 and
		index->stat_index_size.

		If we have only 0 or 1 index pages then we can only take 1
		sample. We have already initialized n_sample_pages to 1.

		So taking the index size as I, the sample as S and log(I)*S
		as L:

		requirement 1) we want the output of the expression to not
		exceed I;
		requirement 2) we want the ideal number of pages to be at
		least S;
		so the current expression is min(I, max(min(S, I), L))

		looking for simplifications:

		case 1: assume S < I
		min(I, max(min(S, I), L)) -> min(I, max(S, L))

		but since L = log2(I)*S and log2(I) >= 1, L > S always,
		so max(S, L) = L.

		so we have: min(I, L)

		case 2: assume I < S
		min(I, max(min(S, I), L)) -> min(I, max(I, L))

		case 2a: L > I
		min(I, max(I, L)) -> min(I, L) -> I

		case 2b: L < I
		min(I, max(I, L)) -> min(I, I) -> I

		so all case 2 paths yield I, and our expression is:
		n_pages = S < I ? min(I, L) : I
		*/
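		/* A worked example of the formula above: with
		I = index->stat_index_size = 1024 pages and
		S = srv_stats_transient_sample_pages = 8,
		L = log2(1024) * 8 = 80, so min(I, L) = 80 pages are
		sampled instead of all 1024. */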
6720 if (index->stat_index_size > 1) {
6721 n_sample_pages = (srv_stats_transient_sample_pages < index->stat_index_size)
6722 ? ut_min(index->stat_index_size,
6723 static_cast<ulint>(
6724 log2(double(index->stat_index_size))
6725 * double(srv_stats_transient_sample_pages)))
6726 : index->stat_index_size;
6727 }
6728 }

        /* Sanity check */
        ut_ad(n_sample_pages > 0
              && n_sample_pages
              <= (index->stat_index_size <= 1 ? 1 : index->stat_index_size));

        /* We sample some pages in the index to get an estimate */

        for (i = 0; i < n_sample_pages; i++) {
                mtr_start(&mtr);

                bool    available;

                available = btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF,
                                                    &cursor, &mtr);

                if (!available) {
                        mtr_commit(&mtr);
                        mem_heap_free(heap);

                        return result;
                }

                /* Count the number of different key values for each prefix of
                the key on this index page. If the prefix does not determine
                the index record uniquely in the B-tree, then we subtract one
                because otherwise our algorithm would give a wrong estimate
                for an index where there is just one key value. */

                if (!index->is_readable()) {
                        mtr_commit(&mtr);
                        goto exit_loop;
                }

                page = btr_cur_get_page(&cursor);

                rec = page_rec_get_next(page_get_infimum_rec(page));
                const ulint     n_core = page_is_leaf(page)
                        ? index->n_core_fields : 0;

                if (!page_rec_is_supremum(rec)) {
                        not_empty_flag = 1;
                        offsets_rec = rec_get_offsets(rec, index, offsets_rec,
                                                      n_core,
                                                      ULINT_UNDEFINED, &heap);

                        if (n_not_null != NULL) {
                                btr_record_not_null_field_in_rec(
                                        n_cols, offsets_rec, n_not_null);
                        }
                }

                while (!page_rec_is_supremum(rec)) {
                        ulint   matched_fields;
                        rec_t*  next_rec = page_rec_get_next(rec);
                        if (page_rec_is_supremum(next_rec)) {
                                total_external_size +=
                                        btr_rec_get_externally_stored_len(
                                                rec, offsets_rec);
                                break;
                        }

                        offsets_next_rec = rec_get_offsets(next_rec, index,
                                                           offsets_next_rec,
                                                           n_core,
                                                           ULINT_UNDEFINED,
                                                           &heap);

                        cmp_rec_rec(rec, next_rec,
                                    offsets_rec, offsets_next_rec,
                                    index, stats_null_not_equal,
                                    &matched_fields);

                        for (j = matched_fields; j < n_cols; j++) {
                                /* We add one if this index record has
                                a different prefix from the previous */

                                n_diff[j]++;
                        }

                        if (n_not_null != NULL) {
                                btr_record_not_null_field_in_rec(
                                        n_cols, offsets_next_rec, n_not_null);
                        }

                        total_external_size
                                += btr_rec_get_externally_stored_len(
                                        rec, offsets_rec);

                        rec = next_rec;
                        /* Initialize offsets_rec for the next round
                        and assign the old offsets_rec buffer to
                        offsets_next_rec. */
                        {
                                rec_offs*       offsets_tmp = offsets_rec;
                                offsets_rec = offsets_next_rec;
                                offsets_next_rec = offsets_tmp;
                        }
                }

                if (n_cols == dict_index_get_n_unique_in_tree(index)
                    && page_has_siblings(page)) {

                        /* If there is more than one leaf page in the tree,
                        we add one because we know that the first record
                        on the page certainly had a different prefix than the
                        last record on the previous index page in the
                        alphabetical order. Before this fix, if there was
                        just one big record on each clustered index page, the
                        algorithm grossly underestimated the number of rows
                        in the table. */

                        n_diff[n_cols - 1]++;
                }

                mtr_commit(&mtr);
        }

exit_loop:
        /* If we saw k borders between different key values on
        n_sample_pages leaf pages, we can estimate how many
        there will be in index->stat_n_leaf_pages */

        /* We must take into account that our sample actually represents
        also the pages used for external storage of fields (those pages are
        included in index->stat_n_leaf_pages) */

        result.reserve(n_cols);

        for (j = 0; j < n_cols; j++) {
                index_field_stats_t     stat;

                stat.n_diff_key_vals
                        = BTR_TABLE_STATS_FROM_SAMPLE(
                                n_diff[j], index, n_sample_pages,
                                total_external_size, not_empty_flag);

                /* If the tree is small, smaller than
                10 * n_sample_pages + total_external_size, then
                the above estimate is ok. For bigger trees it is common that we
                do not see any borders between key values in the few pages
                we pick. But still there may be n_sample_pages
                different key values, or even more. Let us try to approximate
                that: */

                add_on = index->stat_n_leaf_pages
                        / (10 * (n_sample_pages
                                 + total_external_size));

                if (add_on > n_sample_pages) {
                        add_on = n_sample_pages;
                }

                stat.n_diff_key_vals += add_on;

                stat.n_sample_sizes = n_sample_pages;

                if (n_not_null != NULL) {
                        stat.n_non_null_key_vals =
                                BTR_TABLE_STATS_FROM_SAMPLE(
                                        n_not_null[j], index, n_sample_pages,
                                        total_external_size, not_empty_flag);
                }

                result.push_back(stat);
        }

        mem_heap_free(heap);

        return result;
}
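
/* Illustrative arithmetic for the estimate above (a sketch; the exact
BTR_TABLE_STATS_FROM_SAMPLE() scaling is defined earlier in this file).
Roughly, if we saw k borders between distinct key prefixes while sampling
n_sample_pages leaf pages, the estimate scales k by the total number of
leaf pages. For example, sampling 80 pages of an index with
stat_n_leaf_pages = 10000 and seeing k = 400 borders suggests on the
order of 400 * 10000 / 80 = 50000 distinct values; the add_on term,
here 10000 / (10 * 80) = 12 (capped at n_sample_pages), compensates for
borders that fall between sampled pages and so cannot be observed. */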

/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/

/***********************************************************//**
Gets the offset of the pointer to the externally stored part of a field.
@return offset of the pointer to the externally stored part */
static
ulint
btr_rec_get_field_ref_offs(
/*=======================*/
        const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
        ulint           n)      /*!< in: index of the external field */
{
        ulint   field_ref_offs;
        ulint   local_len;

        ut_a(rec_offs_nth_extern(offsets, n));
        field_ref_offs = rec_get_nth_field_offs(offsets, n, &local_len);
        ut_a(len_is_stored(local_len));
        ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

        return(field_ref_offs + local_len - BTR_EXTERN_FIELD_REF_SIZE);
}

/** Gets a pointer to the externally stored part of a field.
@param rec      record
@param offsets  rec_get_offsets(rec)
@param n        index of the externally stored field
@return pointer to the externally stored part */
#define btr_rec_get_field_ref(rec, offsets, n)  \
        ((rec) + btr_rec_get_field_ref_offs(offsets, n))
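
/* Usage sketch (illustrative only): reading the stored BLOB length out
of the 20-byte external field reference of column n, exactly as
btr_rec_get_externally_stored_len() does below:

        const byte*     field_ref = btr_rec_get_field_ref(rec, offsets, n);
        ulint           blob_len = mach_read_from_4(
                field_ref + BTR_EXTERN_LEN + 4);
*/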

/** Gets the externally stored size of a record, in units of a database page.
@param[in]      rec     record
@param[in]      offsets array returned by rec_get_offsets()
@return externally stored part, in units of a database page */
ulint
btr_rec_get_externally_stored_len(
        const rec_t*    rec,
        const rec_offs* offsets)
{
        ulint   n_fields;
        ulint   total_extern_len = 0;
        ulint   i;

        ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));

        if (!rec_offs_any_extern(offsets)) {
                return(0);
        }

        n_fields = rec_offs_n_fields(offsets);

        for (i = 0; i < n_fields; i++) {
                if (rec_offs_nth_extern(offsets, i)) {

                        ulint   extern_len = mach_read_from_4(
                                btr_rec_get_field_ref(rec, offsets, i)
                                + BTR_EXTERN_LEN + 4);

                        total_extern_len += ut_calc_align(
                                extern_len, ulint(srv_page_size));
                }
        }

        return total_extern_len >> srv_page_size_shift;
}
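
/* Worked example (illustrative): with the default srv_page_size = 16384
(srv_page_size_shift = 14), a single externally stored column of
100000 bytes yields ut_calc_align(100000, 16384) = 114688, and
114688 >> 14 = 7, i.e. the column is counted as 7 BLOB pages. */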

/*******************************************************************//**
Sets the ownership bit of an externally stored field in a record. */
static
void
btr_cur_set_ownership_of_extern_field(
/*==================================*/
        buf_block_t*    block,  /*!< in/out: index page */
        rec_t*          rec,    /*!< in/out: clustered index record */
        dict_index_t*   index,  /*!< in: index of the page */
        const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
        ulint           i,      /*!< in: field number */
        bool            val,    /*!< in: value to set */
        mtr_t*          mtr)    /*!< in: mtr, or NULL if not logged */
{
        byte*   data;
        ulint   local_len;
        ulint   byte_val;

        data = rec_get_nth_field(rec, offsets, i, &local_len);
        ut_ad(rec_offs_nth_extern(offsets, i));
        ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

        local_len -= BTR_EXTERN_FIELD_REF_SIZE;

        byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN);

        if (val) {
                byte_val &= ~BTR_EXTERN_OWNER_FLAG;
        } else {
#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
                ut_a(!(byte_val & BTR_EXTERN_OWNER_FLAG));
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
                byte_val |= BTR_EXTERN_OWNER_FLAG;
        }

        if (UNIV_LIKELY_NULL(block->page.zip.data)) {
                mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
                page_zip_write_blob_ptr(block, rec, index, offsets, i, mtr);
        } else {
                mtr->write<1,mtr_t::MAYBE_NOP>(*block, data + local_len
                                               + BTR_EXTERN_LEN, byte_val);
        }
}
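
/* Flag semantics (a sketch): the ownership bit lives in the most
significant byte of the 8-byte BTR_EXTERN_LEN field of the 20-byte
field reference. Note the inverted sense: a set BTR_EXTERN_OWNER_FLAG
means "not owned", so val == true above clears the flag (this record
owns the BLOB and may free it), while val == false sets the flag (the
record has disowned the BLOB, e.g. because an updated copy inserted
elsewhere in the tree now owns it). */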

/*******************************************************************//**
Marks non-updated off-page fields as disowned by this record. The ownership
must be transferred to the updated record, which is inserted elsewhere in the
index tree. During purge, only the owner of an externally stored field is
allowed to free the field. */
void
btr_cur_disown_inherited_fields(
/*============================*/
        buf_block_t*    block,  /*!< in/out: index page */
        rec_t*          rec,    /*!< in/out: record in a clustered index */
        dict_index_t*   index,  /*!< in: index of the page */
        const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
        const upd_t*    update, /*!< in: update vector */
        mtr_t*          mtr)    /*!< in/out: mini-transaction */
{
        ut_ad(rec_offs_validate(rec, index, offsets));
        ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
        ut_ad(rec_offs_any_extern(offsets));

        for (uint16_t i = 0; i < rec_offs_n_fields(offsets); i++) {
                if (rec_offs_nth_extern(offsets, i)
                    && !upd_get_field_by_field_no(update, i, false)) {
                        btr_cur_set_ownership_of_extern_field(
                                block, rec, index, offsets, i, false, mtr);
                }
        }
}

/*******************************************************************//**
Marks all extern fields in a record as owned by the record. This function
should be called if the delete mark of a record is removed: a record that
is not delete-marked always owns all its extern fields. */
static
void
btr_cur_unmark_extern_fields(
/*=========================*/
        buf_block_t*    block,  /*!< in/out: index page */
        rec_t*          rec,    /*!< in/out: record in a clustered index */
        dict_index_t*   index,  /*!< in: index of the page */
        const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */
        mtr_t*          mtr)    /*!< in: mtr, or NULL if not logged */
{
        ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
        if (!rec_offs_any_extern(offsets)) {
                return;
        }

        const ulint     n = rec_offs_n_fields(offsets);

        for (ulint i = 0; i < n; i++) {
                if (rec_offs_nth_extern(offsets, i)) {
                        btr_cur_set_ownership_of_extern_field(
                                block, rec, index, offsets, i, true, mtr);
                }
        }
}

/*******************************************************************//**
Returns the length of a BLOB part stored on the header page.
@return part length */
static
uint32_t
btr_blob_get_part_len(
/*==================*/
        const byte*     blob_header)    /*!< in: blob header */
{
        return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN));
}

/*******************************************************************//**
Returns the page number where the next BLOB part is stored.
@return page number or FIL_NULL if no more pages */
static
uint32_t
btr_blob_get_next_page_no(
/*======================*/
        const byte*     blob_header)    /*!< in: blob header */
{
        return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO));
}

/** Deallocate a buffer block that was reserved for a BLOB part.
@param block    buffer block
@param all      flag whether to remove a ROW_FORMAT=COMPRESSED page
@param mtr      mini-transaction to commit */
static void btr_blob_free(buf_block_t *block, bool all, mtr_t *mtr)
{
  const page_id_t page_id(block->page.id());
  ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
  mtr->commit();

  const ulint fold= page_id.fold();

  mysql_mutex_lock(&buf_pool.mutex);

  if (buf_page_t *bpage= buf_pool.page_hash_get_low(page_id, fold))
    if (!buf_LRU_free_page(bpage, all) && all && bpage->zip.data)
      /* Attempt to deallocate the redundant copy of the uncompressed page
      if the whole ROW_FORMAT=COMPRESSED block cannot be deallocated. */
      buf_LRU_free_page(bpage, false);

  mysql_mutex_unlock(&buf_pool.mutex);
}
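
/* Usage note (a sketch): the mini-transaction is committed inside
btr_blob_free(), so the caller must not touch the block afterwards.
btr_store_big_rec_extern_fields() below uses this to release each
completed BLOB page frame early and save buffer pool memory:

        btr_blob_free(block, FALSE, &mtr);      // commits mtr
*/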

/** Helper class used while writing blob pages, during insert or update. */
struct btr_blob_log_check_t {
        /** Persistent cursor on a clustered index record with blobs. */
        btr_pcur_t*     m_pcur;
        /** Mini-transaction holding the latches for m_pcur */
        mtr_t*          m_mtr;
        /** rec_get_offsets(rec, index); offset of clust_rec */
        const rec_offs* m_offsets;
        /** The block containing clustered record */
        buf_block_t**   m_block;
        /** The clustered record pointer */
        rec_t**         m_rec;
        /** The blob operation code */
        enum blob_op    m_op;

        /** Constructor
        @param[in]      pcur    persistent cursor on a clustered
                                index record with blobs.
        @param[in]      mtr     mini-transaction holding latches for
                                pcur.
        @param[in]      offsets offsets of the clust_rec
        @param[in,out]  block   record block containing pcur record
        @param[in,out]  rec     the clustered record pointer
        @param[in]      op      the blob operation code */
        btr_blob_log_check_t(
                btr_pcur_t*     pcur,
                mtr_t*          mtr,
                const rec_offs* offsets,
                buf_block_t**   block,
                rec_t**         rec,
                enum blob_op    op)
                : m_pcur(pcur),
                  m_mtr(mtr),
                  m_offsets(offsets),
                  m_block(block),
                  m_rec(rec),
                  m_op(op)
        {
                ut_ad(rec_offs_validate(*m_rec, m_pcur->index(), m_offsets));
                ut_ad((*m_block)->frame == page_align(*m_rec));
                ut_ad(*m_rec == btr_pcur_get_rec(m_pcur));
        }

        /** Check if there is enough space in the log file; commit and
        re-start the mini-transaction. */
        void check()
        {
                dict_index_t*   index = m_pcur->index();
                ulint           offs = 0;
                uint32_t        page_no = FIL_NULL;

                if (UNIV_UNLIKELY(m_op == BTR_STORE_INSERT_BULK)) {
                        offs = page_offset(*m_rec);
                        page_no = (*m_block)->page.id().page_no();
                        buf_block_buf_fix_inc(*m_block, __FILE__, __LINE__);
                        ut_ad(page_no != FIL_NULL);
                } else {
                        btr_pcur_store_position(m_pcur, m_mtr);
                }
                m_mtr->commit();

                DEBUG_SYNC_C("blob_write_middle");

                log_free_check();

                DEBUG_SYNC_C("blob_write_middle_after_check");

                const mtr_log_t log_mode = m_mtr->get_log_mode();
                m_mtr->start();
                m_mtr->set_log_mode(log_mode);
                index->set_modified(*m_mtr);

                if (UNIV_UNLIKELY(page_no != FIL_NULL)) {
                        m_pcur->btr_cur.page_cur.block = btr_block_get(
                                *index, page_no, RW_X_LATCH, false, m_mtr);
                        m_pcur->btr_cur.page_cur.rec
                                = m_pcur->btr_cur.page_cur.block->frame
                                + offs;

                        buf_block_buf_fix_dec(m_pcur->btr_cur.page_cur.block);
                } else {
                        ut_ad(m_pcur->rel_pos == BTR_PCUR_ON);
                        bool ret = btr_pcur_restore_position(
                                BTR_MODIFY_LEAF | BTR_MODIFY_EXTERNAL,
                                m_pcur, m_mtr);

                        ut_a(ret);
                }

                *m_block        = btr_pcur_get_block(m_pcur);
                *m_rec          = btr_pcur_get_rec(m_pcur);

                rec_offs_make_valid(*m_rec, index, true,
                                    const_cast<rec_offs*>(m_offsets));

                ut_ad(m_mtr->memo_contains_page_flagged(
                              *m_rec,
                              MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX));

                ut_ad((m_op == BTR_STORE_INSERT_BULK)
                      == !m_mtr->memo_contains_flagged(&index->lock,
                                                       MTR_MEMO_SX_LOCK
                                                       | MTR_MEMO_X_LOCK));
        }
};
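
/* Usage sketch (mirrors btr_store_big_rec_extern_fields() below;
illustrative only): the helper is constructed once with the caller's
mini-transaction, and check() is invoked periodically while writing
BLOB pages, so that the redo log cannot fill up while latches are held:

        btr_blob_log_check_t redo_log(pcur, btr_mtr, offsets,
                                      &rec_block, &rec, op);
        for (each BLOB page to write) {
                if (!(blob_npages % commit_freq)) {
                        redo_log.check();
                        // rec and rec_block may have moved; re-read
                        // the field reference after the restart
                        field_ref = btr_rec_get_field_ref(
                                rec, offsets, field_no);
                }
                ...
        }
*/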

/*******************************************************************//**
Stores the fields in big_rec_vec to the tablespace and puts pointers to
them in rec. The extern flags in rec will have to be set beforehand.
The fields are stored on pages allocated from the leaf node
file segment of the index tree.

TODO: If the allocation extends the tablespace, it will not be redo-logged
in any mini-transaction. Tablespace extension should be redo-logged, so that
recovery will not fail when the big_rec was written to the extended portion of
the file, in case the file was somehow truncated in the crash.

@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
dberr_t
btr_store_big_rec_extern_fields(
/*============================*/
        btr_pcur_t*     pcur,           /*!< in/out: a persistent cursor. if
                                        btr_mtr is restarted, then this can
                                        be repositioned. */
        rec_offs*       offsets,        /*!< in/out: rec_get_offsets() on
                                        pcur. the "external storage" flags
                                        in offsets will correctly correspond
                                        to rec when this function returns */
        const big_rec_t*big_rec_vec,    /*!< in: vector containing fields
                                        to be stored externally */
        mtr_t*          btr_mtr,        /*!< in/out: mtr containing the
                                        latches to the clustered index. can be
                                        committed and restarted. */
        enum blob_op    op)             /*!< in: operation code */
{
        byte*           field_ref;
        ulint           extern_len;
        ulint           store_len;
        ulint           space_id;
        ulint           i;
        mtr_t           mtr;
        mem_heap_t*     heap = NULL;
        page_zip_des_t* page_zip;
        z_stream        c_stream;
        dberr_t         error = DB_SUCCESS;
        dict_index_t*   index = pcur->index();
        buf_block_t*    rec_block = btr_pcur_get_block(pcur);
        rec_t*          rec = btr_pcur_get_rec(pcur);

        ut_ad(rec_offs_validate(rec, index, offsets));
        ut_ad(rec_offs_any_extern(offsets));
        ut_ad(op == BTR_STORE_INSERT_BULK
              || btr_mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
                                                | MTR_MEMO_SX_LOCK));
        ut_ad(btr_mtr->memo_contains_flagged(rec_block, MTR_MEMO_PAGE_X_FIX));
        ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
        ut_a(dict_index_is_clust(index));

        btr_blob_log_check_t redo_log(pcur, btr_mtr, offsets, &rec_block,
                                      &rec, op);
        page_zip = buf_block_get_page_zip(rec_block);
        space_id = rec_block->page.id().space();
        ut_a(fil_page_index_page_check(page_align(rec))
             || op == BTR_STORE_INSERT_BULK);

        if (page_zip) {
                int     err;

                /* Zlib deflate needs 128 kilobytes for the default
                window size, plus 512 << memLevel, plus a few
                kilobytes for small objects. We use reduced memLevel
                to limit the memory consumption, and preallocate the
                heap, hoping to avoid memory fragmentation. */
                heap = mem_heap_create(250000);
                page_zip_set_alloc(&c_stream, heap);

                err = deflateInit2(&c_stream, int(page_zip_level),
                                   Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY);
                ut_a(err == Z_OK);
        }

#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
        /* All pointers to externally stored columns in the record
        must either be zero or they must be pointers to inherited
        columns, owned by this record or an earlier record version. */
        for (i = 0; i < big_rec_vec->n_fields; i++) {
                field_ref = btr_rec_get_field_ref(
                        rec, offsets, big_rec_vec->fields[i].field_no);

                ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
                /* Either this must be an update in place,
                or the BLOB must be inherited, or the BLOB pointer
                must be zero (will be written in this function). */
                ut_a(op == BTR_STORE_UPDATE
                     || (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
                     || !memcmp(field_ref, field_ref_zero,
                                BTR_EXTERN_FIELD_REF_SIZE));
        }
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */

        /* Space available in compressed page to carry blob data */
        const ulint     payload_size_zip = rec_block->physical_size()
                - FIL_PAGE_DATA;

        /* Space available in uncompressed page to carry blob data */
        const ulint     payload_size = payload_size_zip
                - (BTR_BLOB_HDR_SIZE + FIL_PAGE_DATA_END);
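
        /* Illustrative layout of an uncompressed BLOB page (a sketch):

        0 .. FIL_PAGE_DATA - 1          page header
        FIL_PAGE_DATA                   BTR_BLOB_HDR_PART_LEN (4 bytes)
        FIL_PAGE_DATA + 4               BTR_BLOB_HDR_NEXT_PAGE_NO (4 bytes)
        FIL_PAGE_DATA + 8 ...           BLOB data (up to payload_size bytes)
        end - FIL_PAGE_DATA_END ...     page trailer

        With the usual 16KiB page (FIL_PAGE_DATA = 38, BTR_BLOB_HDR_SIZE = 8,
        FIL_PAGE_DATA_END = 8) this gives 16384 - 38 - 8 - 8 = 16330 bytes
        of BLOB payload per page. */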

        /* We have to create a file segment to the tablespace
        for each field and put the pointer to the field in rec */

        for (i = 0; i < big_rec_vec->n_fields; i++) {
                const ulint field_no = big_rec_vec->fields[i].field_no;

                field_ref = btr_rec_get_field_ref(rec, offsets, field_no);
#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
                /* A zero BLOB pointer should have been initially inserted. */
                ut_a(!memcmp(field_ref, field_ref_zero,
                             BTR_EXTERN_FIELD_REF_SIZE));
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
                extern_len = big_rec_vec->fields[i].len;
                MEM_CHECK_DEFINED(big_rec_vec->fields[i].data, extern_len);
                ut_a(extern_len > 0);

                uint32_t prev_page_no = FIL_NULL;

                if (page_zip) {
                        int     err = deflateReset(&c_stream);
                        ut_a(err == Z_OK);

                        c_stream.next_in = (Bytef*)
                                big_rec_vec->fields[i].data;
                        c_stream.avail_in = static_cast<uInt>(extern_len);
                }

                for (ulint blob_npages = 0;; ++blob_npages) {
                        buf_block_t*    block;
                        const ulint     commit_freq = 4;
                        uint32_t        r_extents;

                        ut_ad(page_align(field_ref) == page_align(rec));

                        if (!(blob_npages % commit_freq)) {

                                redo_log.check();

                                field_ref = btr_rec_get_field_ref(
                                        rec, offsets, field_no);

                                page_zip = buf_block_get_page_zip(rec_block);
                        }

                        mtr.start();
                        index->set_modified(mtr);
                        mtr.set_log_mode(btr_mtr->get_log_mode());

                        buf_page_get(rec_block->page.id(),
                                     rec_block->zip_size(), RW_X_LATCH, &mtr);

                        uint32_t        hint_prev = prev_page_no;
                        if (hint_prev == FIL_NULL) {
                                hint_prev = rec_block->page.id().page_no();
                        }

                        if (!fsp_reserve_free_extents(&r_extents,
                                                      index->table->space, 1,
                                                      FSP_BLOB, &mtr, 1)) {
                                mtr.commit();
                                error = DB_OUT_OF_FILE_SPACE;
                                goto func_exit;
                        }

                        block = btr_page_alloc(index, hint_prev + 1,
                                               FSP_NO_DIR, 0, &mtr, &mtr);

                        index->table->space->release_free_extents(r_extents);

                        ut_a(block != NULL);

                        const uint32_t page_no = block->page.id().page_no();

                        if (prev_page_no != FIL_NULL) {
                                buf_block_t*    prev_block;

                                prev_block = buf_page_get(
                                        page_id_t(space_id, prev_page_no),
                                        rec_block->zip_size(),
                                        RW_X_LATCH, &mtr);

                                buf_block_dbg_add_level(prev_block,
                                                        SYNC_EXTERN_STORAGE);

                                if (page_zip) {
                                        mtr.write<4>(*prev_block,
                                                     prev_block->frame
                                                     + FIL_PAGE_NEXT,
                                                     page_no);
                                        memcpy_aligned<4>(
                                                buf_block_get_page_zip(
                                                        prev_block)
                                                ->data + FIL_PAGE_NEXT,
                                                prev_block->frame
                                                + FIL_PAGE_NEXT, 4);
                                } else {
                                        mtr.write<4>(*prev_block,
                                                     BTR_BLOB_HDR_NEXT_PAGE_NO
                                                     + FIL_PAGE_DATA
                                                     + prev_block->frame,
                                                     page_no);
                                }
                        } else if (dict_index_is_online_ddl(index)) {
                                row_log_table_blob_alloc(index, page_no);
                        }

                        ut_ad(!page_has_siblings(block->frame));
                        ut_ad(!fil_page_get_type(block->frame));

                        if (page_zip) {
                                int             err;
                                page_zip_des_t* blob_page_zip;

                                mtr.write<1>(*block,
                                             FIL_PAGE_TYPE + 1 + block->frame,
                                             prev_page_no == FIL_NULL
                                             ? FIL_PAGE_TYPE_ZBLOB
                                             : FIL_PAGE_TYPE_ZBLOB2);
                                block->page.zip.data[FIL_PAGE_TYPE + 1]
                                        = block->frame[FIL_PAGE_TYPE + 1];

                                c_stream.next_out = block->frame
                                        + FIL_PAGE_DATA;
                                c_stream.avail_out = static_cast<uInt>(
                                        payload_size_zip);

                                err = deflate(&c_stream, Z_FINISH);
                                ut_a(err == Z_OK || err == Z_STREAM_END);
                                ut_a(err == Z_STREAM_END
                                     || c_stream.avail_out == 0);

                                mtr.memcpy(*block,
                                           FIL_PAGE_DATA,
                                           page_zip_get_size(page_zip)
                                           - FIL_PAGE_DATA
                                           - c_stream.avail_out);
                                /* Copy the page to compressed storage,
                                because it will be flushed to disk
                                from there. */
                                blob_page_zip = buf_block_get_page_zip(block);
                                ut_ad(blob_page_zip);
                                ut_ad(page_zip_get_size(blob_page_zip)
                                      == page_zip_get_size(page_zip));
                                memcpy(blob_page_zip->data, block->frame,
                                       page_zip_get_size(page_zip));

                                if (err == Z_OK && prev_page_no != FIL_NULL) {

                                        goto next_zip_page;
                                }

                                if (err == Z_STREAM_END) {
                                        mach_write_to_4(field_ref
                                                        + BTR_EXTERN_LEN, 0);
                                        mach_write_to_4(field_ref
                                                        + BTR_EXTERN_LEN + 4,
                                                        c_stream.total_in);
                                } else {
                                        memset(field_ref + BTR_EXTERN_LEN,
                                               0, 8);
                                }

                                if (prev_page_no == FIL_NULL) {
                                        ut_ad(blob_npages == 0);
                                        mach_write_to_4(field_ref
                                                        + BTR_EXTERN_SPACE_ID,
                                                        space_id);

                                        mach_write_to_4(field_ref
                                                        + BTR_EXTERN_PAGE_NO,
                                                        page_no);

                                        mach_write_to_4(field_ref
                                                        + BTR_EXTERN_OFFSET,
                                                        FIL_PAGE_NEXT);
                                }

                                /* We compress the page when the bulk
                                insert is finished. */
                                if (UNIV_LIKELY(op != BTR_STORE_INSERT_BULK)) {
                                        page_zip_write_blob_ptr(
                                                rec_block, rec, index, offsets,
                                                field_no, &mtr);
                                }

next_zip_page:
                                prev_page_no = page_no;

                                /* Commit mtr and release the
                                uncompressed page frame to save memory. */
                                btr_blob_free(block, FALSE, &mtr);

                                if (err == Z_STREAM_END) {
                                        break;
                                }
                        } else {
                                mtr.write<1>(*block, FIL_PAGE_TYPE + 1
                                             + block->frame,
                                             FIL_PAGE_TYPE_BLOB);

                                if (extern_len > payload_size) {
                                        store_len = payload_size;
                                } else {
                                        store_len = extern_len;
                                }

                                mtr.memcpy<mtr_t::MAYBE_NOP>(
                                        *block,
                                        FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE
                                        + block->frame,
                                        static_cast<const byte*>
                                        (big_rec_vec->fields[i].data)
                                        + big_rec_vec->fields[i].len
                                        - extern_len, store_len);
                                mtr.write<4>(*block, BTR_BLOB_HDR_PART_LEN
                                             + FIL_PAGE_DATA + block->frame,
                                             store_len);
                                compile_time_assert(FIL_NULL == 0xffffffff);
                                mtr.memset(block, BTR_BLOB_HDR_NEXT_PAGE_NO
                                           + FIL_PAGE_DATA, 4, 0xff);

                                extern_len -= store_len;

                                ut_ad(!mach_read_from_4(BTR_EXTERN_LEN
                                                        + field_ref));
                                mtr.write<4>(*rec_block,
                                             BTR_EXTERN_LEN + 4 + field_ref,
                                             big_rec_vec->fields[i].len
                                             - extern_len);

                                if (prev_page_no == FIL_NULL) {
                                        ut_ad(blob_npages == 0);
                                        mtr.write<4,mtr_t::MAYBE_NOP>(
                                                *rec_block,
                                                field_ref + BTR_EXTERN_SPACE_ID,
                                                space_id);

                                        mtr.write<4>(*rec_block, field_ref
                                                     + BTR_EXTERN_PAGE_NO,
                                                     page_no);

                                        mtr.write<4>(*rec_block, field_ref
                                                     + BTR_EXTERN_OFFSET,
                                                     FIL_PAGE_DATA);
                                }

                                prev_page_no = page_no;

                                mtr.commit();

                                if (extern_len == 0) {
                                        break;
                                }
                        }
                }

                DBUG_EXECUTE_IF("btr_store_big_rec_extern",
                                error = DB_OUT_OF_FILE_SPACE;
                                goto func_exit;);

                rec_offs_make_nth_extern(offsets, field_no);
        }

func_exit:
        if (page_zip) {
                deflateEnd(&c_stream);
        }

        if (heap != NULL) {
                mem_heap_free(heap);
        }

#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
        /* All pointers to externally stored columns in the record
        must be valid. */
        for (i = 0; i < rec_offs_n_fields(offsets); i++) {
                if (!rec_offs_nth_extern(offsets, i)) {
                        continue;
                }

                field_ref = btr_rec_get_field_ref(rec, offsets, i);

                /* The pointer must not be zero if the operation
                succeeded. */
                ut_a(0 != memcmp(field_ref, field_ref_zero,
                                 BTR_EXTERN_FIELD_REF_SIZE)
                     || error != DB_SUCCESS);
                /* The column must not be disowned by this record. */
                ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
        }
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
        return(error);
}
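
/* Layout of the 20-byte external field reference written above (a
sketch based on the BTR_EXTERN_* offsets used in this file):

        0       BTR_EXTERN_SPACE_ID     (4 bytes)
        4       BTR_EXTERN_PAGE_NO      (4 bytes; first BLOB page)
        8       BTR_EXTERN_OFFSET       (4 bytes; offset on that page)
        12      BTR_EXTERN_LEN          (8 bytes; the first byte carries
                                        BTR_EXTERN_OWNER_FLAG and
                                        BTR_EXTERN_INHERITED_FLAG, and the
                                        last 4 bytes the data length)
*/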

/** Check the FIL_PAGE_TYPE on an uncompressed BLOB page.
@param[in]      block   uncompressed BLOB page
@param[in]      read    true=read, false=purge */
static void btr_check_blob_fil_page_type(const buf_block_t& block, bool read)
{
  uint16_t type= fil_page_get_type(block.frame);

  if (UNIV_LIKELY(type == FIL_PAGE_TYPE_BLOB))
    return;
  /* FIXME: take the tablespace as a parameter */
  if (fil_space_t *space= fil_space_t::get(block.page.id().space()))
  {
    /* Old versions of InnoDB did not initialize FIL_PAGE_TYPE on BLOB
    pages. Do not print anything about the type mismatch when reading
    a BLOB page that may be from old versions. */
    if (space->full_crc32() || DICT_TF_HAS_ATOMIC_BLOBS(space->flags))
    {
      ib::fatal() << "FIL_PAGE_TYPE=" << type
                  << (read ? " on BLOB read file " : " on BLOB purge file ")
                  << space->chain.start->name
                  << " page " << block.page.id().page_no();
    }
    space->release();
  }
}

/*******************************************************************//**
Frees the space in an externally stored field to the file space
management if the field in data is owned by the externally stored field;
in a rollback we may have the additional condition that the field must
not be inherited. */
void
btr_free_externally_stored_field(
/*=============================*/
        dict_index_t*   index,          /*!< in: index of the data, the index
                                        tree MUST be X-latched; if the tree
                                        height is 1, then also the root page
                                        must be X-latched! (this is relevant
                                        in the case this function is called
                                        from purge where 'data' is located on
                                        an undo log page, not an index
                                        page) */
        byte*           field_ref,      /*!< in/out: field reference */
        const rec_t*    rec,            /*!< in: record containing field_ref, for
                                        page_zip_write_blob_ptr(), or NULL */
        const rec_offs* offsets,        /*!< in: rec_get_offsets(rec, index),
                                        or NULL */
        buf_block_t*    block,          /*!< in/out: page of field_ref */
        ulint           i,              /*!< in: field number of field_ref;
                                        ignored if rec == NULL */
        bool            rollback,       /*!< in: performing rollback? */
        mtr_t*          local_mtr)      /*!< in: mtr containing the latch to
                                        data and an X-latch to the index
                                        tree */
{
        page_t*         page;
        const uint32_t  space_id = mach_read_from_4(
                field_ref + BTR_EXTERN_SPACE_ID);
        const uint32_t  start_page = mach_read_from_4(
                field_ref + BTR_EXTERN_PAGE_NO);
        uint32_t        page_no;
        uint32_t        next_page_no;
        mtr_t           mtr;

        ut_ad(index->is_primary());
        ut_ad(local_mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
                                               | MTR_MEMO_SX_LOCK));
        ut_ad(local_mtr->memo_contains_page_flagged(field_ref,
                                                    MTR_MEMO_PAGE_X_FIX));
        ut_ad(!rec || rec_offs_validate(rec, index, offsets));
        ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i));
        ut_ad(local_mtr->is_named_space(
                      page_get_space_id(page_align(field_ref))));

        if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
                                  BTR_EXTERN_FIELD_REF_SIZE))) {
                /* In the rollback, we may encounter a clustered index
                record with some unwritten off-page columns. There is
                nothing to free then. */
                ut_a(rollback);
                return;
        }

        ut_ad(!(mach_read_from_4(field_ref + BTR_EXTERN_LEN)
                & ~((BTR_EXTERN_OWNER_FLAG
                     | BTR_EXTERN_INHERITED_FLAG) << 24)));
        ut_ad(space_id == index->table->space->id);
        ut_ad(space_id == index->table->space_id);

        const ulint ext_zip_size = index->table->space->zip_size();
        const ulint rec_zip_size = rec ? ext_zip_size : 0;

        /* !rec holds in a call from purge when field_ref is in an undo page */
        ut_ad(rec || !block->page.zip.data);

        for (;;) {
#ifdef UNIV_DEBUG
                buf_block_t*    rec_block;
#endif /* UNIV_DEBUG */
                buf_block_t*    ext_block;

                mtr_start(&mtr);
                mtr.set_spaces(*local_mtr);
                mtr.set_log_mode(local_mtr->get_log_mode());

                ut_ad(!index->table->is_temporary()
                      || local_mtr->get_log_mode() == MTR_LOG_NO_REDO);

                const page_t*   p = page_align(field_ref);

                const page_id_t page_id(page_get_space_id(p),
                                        page_get_page_no(p));

#ifdef UNIV_DEBUG
                rec_block =
#endif /* UNIV_DEBUG */
                buf_page_get(page_id, rec_zip_size, RW_X_LATCH, &mtr);

                buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK);
                page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);

                if (/* There is no external storage data */
                    page_no == FIL_NULL
                    /* This field does not own the externally stored field */
                    || (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
                        & BTR_EXTERN_OWNER_FLAG)
                    /* Rollback and inherited field */
                    || (rollback
                        && (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
                            & BTR_EXTERN_INHERITED_FLAG))) {

                        /* Do not free */
                        mtr_commit(&mtr);

                        return;
                }

                if (page_no == start_page && dict_index_is_online_ddl(index)) {
                        row_log_table_blob_free(index, start_page);
                }

                ext_block = buf_page_get(
                        page_id_t(space_id, page_no), ext_zip_size,
                        RW_X_LATCH, &mtr);

                buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
                page = buf_block_get_frame(ext_block);

                if (ext_zip_size) {
                        /* Note that page_zip will be NULL
                        in row_purge_upd_exist_or_extern(). */
                        switch (fil_page_get_type(page)) {
                        case FIL_PAGE_TYPE_ZBLOB:
                        case FIL_PAGE_TYPE_ZBLOB2:
                                break;
                        default:
                                ut_error;
                        }
                        next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);

                        btr_page_free(index, ext_block, &mtr, true);

                        if (UNIV_LIKELY_NULL(block->page.zip.data)) {
                                mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
                                                next_page_no);
                                memset(field_ref + BTR_EXTERN_LEN + 4, 0, 4);
                                page_zip_write_blob_ptr(block, rec, index,
                                                        offsets, i, &mtr);
                        } else {
                                mtr.write<4>(*block,
                                             BTR_EXTERN_PAGE_NO + field_ref,
                                             next_page_no);
                                mtr.write<4,mtr_t::MAYBE_NOP>(*block,
                                                              BTR_EXTERN_LEN
                                                              + 4 + field_ref,
                                                              0U);
                        }
                } else {
                        ut_ad(!block->page.zip.data);
                        btr_check_blob_fil_page_type(*ext_block, false);

                        next_page_no = mach_read_from_4(
                                page + FIL_PAGE_DATA
                                + BTR_BLOB_HDR_NEXT_PAGE_NO);
                        btr_page_free(index, ext_block, &mtr, true);

                        mtr.write<4>(*block, BTR_EXTERN_PAGE_NO + field_ref,
                                     next_page_no);
                        /* Zero out the BLOB length. If the server
                        crashes during the execution of this function,
                        trx_rollback_all_recovered() could
                        dereference the half-deleted BLOB, fetching a
                        wrong prefix for the BLOB. */
                        mtr.write<4,mtr_t::MAYBE_NOP>(*block,
                                                      BTR_EXTERN_LEN + 4
                                                      + field_ref, 0U);
                }

                /* Commit mtr and release the BLOB block to save memory. */
                btr_blob_free(ext_block, TRUE, &mtr);
        }
}
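
/* Sketch of the freeing loop above (illustrative pseudo-code): the BLOB
pages form a singly linked list whose head pointer lives in the field
reference itself, so each iteration unlinks and frees the head page and
persists the new head. A crash between iterations therefore leaves a
consistent, shorter chain:

        while ((page_no = ref.BTR_EXTERN_PAGE_NO) != FIL_NULL) {
                next = next-page pointer read from page page_no;
                btr_page_free(page_no);
                ref.BTR_EXTERN_PAGE_NO = next;  // redo-logged write
        }
*/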

/***********************************************************//**
Frees the externally stored fields for a record. */
static
void
btr_rec_free_externally_stored_fields(
/*==================================*/
        dict_index_t*   index,  /*!< in: index of the data, the index
                                tree MUST be X-latched */
        rec_t*          rec,    /*!< in/out: record */
        const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
        buf_block_t*    block,  /*!< in: index page of rec */
        bool            rollback,/*!< in: performing rollback? */
        mtr_t*          mtr)    /*!< in: mini-transaction handle which contains
                                an X-latch to record page and to the index
                                tree */
{
        ulint   n_fields;
        ulint   i;

        ut_ad(rec_offs_validate(rec, index, offsets));
        ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX));
        ut_ad(index->is_primary());
        ut_ad(page_rec_is_leaf(rec));
        /* Free possible externally stored fields in the record */

        ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets));
        n_fields = rec_offs_n_fields(offsets);

        for (i = 0; i < n_fields; i++) {
                if (rec_offs_nth_extern(offsets, i)) {
                        btr_free_externally_stored_field(
                                index, btr_rec_get_field_ref(rec, offsets, i),
                                rec, offsets, block, i, rollback, mtr);
                }
        }
}

/***********************************************************//**
Frees the externally stored fields for a record, if the field is mentioned
in the update vector. */
static
void
btr_rec_free_updated_extern_fields(
/*===============================*/
        dict_index_t*   index,  /*!< in: index of rec; the index tree MUST be
                                X-latched */
        rec_t*          rec,    /*!< in/out: record */
        buf_block_t*    block,  /*!< in: index page of rec */
        const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
        const upd_t*    update, /*!< in: update vector */
        bool            rollback,/*!< in: performing rollback? */
        mtr_t*          mtr)    /*!< in: mini-transaction handle which contains
                                an X-latch to record page and to the tree */
{
        ulint   n_fields;
        ulint   i;

        ut_ad(rec_offs_validate(rec, index, offsets));
        ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX));

        /* Free possible externally stored fields in the record */

        n_fields = upd_get_n_fields(update);

        for (i = 0; i < n_fields; i++) {
                const upd_field_t* ufield = upd_get_nth_field(update, i);

                if (rec_offs_nth_extern(offsets, ufield->field_no)) {
                        ulint   len;
                        byte*   data = rec_get_nth_field(
                                rec, offsets, ufield->field_no, &len);
                        ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);

                        btr_free_externally_stored_field(
                                index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
                                rec, offsets, block,
                                ufield->field_no, rollback, mtr);
                }
        }
}

/*******************************************************************//**
Copies the prefix of an uncompressed BLOB. The clustered index record
that points to this BLOB must be protected by a lock or a page latch.
@return number of bytes written to buf */
static
ulint
btr_copy_blob_prefix(
/*=================*/
        byte*           buf,    /*!< out: the externally stored part of
                                the field, or a prefix of it */
        uint32_t        len,    /*!< in: length of buf, in bytes */
        page_id_t       id,     /*!< in: page identifier of the first
                                BLOB page */
        uint32_t        offset) /*!< in: offset on the first BLOB page */
{
        ulint   copied_len = 0;

        for (;;) {
                mtr_t           mtr;
                buf_block_t*    block;
                const page_t*   page;
                const byte*     blob_header;
                ulint           part_len;
                ulint           copy_len;

                mtr_start(&mtr);

                block = buf_page_get(id, 0, RW_S_LATCH, &mtr);
                buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
                page = buf_block_get_frame(block);

                btr_check_blob_fil_page_type(*block, true);

                blob_header = page + offset;
                part_len = btr_blob_get_part_len(blob_header);
                copy_len = ut_min(part_len, len - copied_len);

                memcpy(buf + copied_len,
                       blob_header + BTR_BLOB_HDR_SIZE, copy_len);
                copied_len += copy_len;

                id.set_page_no(btr_blob_get_next_page_no(blob_header));

                mtr_commit(&mtr);

                if (id.page_no() == FIL_NULL || copy_len != part_len) {
                        MEM_CHECK_DEFINED(buf, copied_len);
                        return(copied_len);
                }

                /* On all BLOB pages except the first, the BLOB header
                is at the start of the page data: */

                offset = FIL_PAGE_DATA;

                ut_ad(copied_len <= len);
        }
}

/** Copies the prefix of a compressed BLOB.
The clustered index record that points to this BLOB must be protected
by a lock or a page latch.
@param[out]     buf             the externally stored part of the field,
or a prefix of it
@param[in]      len             length of buf, in bytes
@param[in]      zip_size        ROW_FORMAT=COMPRESSED page size
@param[in]      id              page identifier of the BLOB pages
@return number of bytes written to buf */
static
ulint
btr_copy_zblob_prefix(
        byte*           buf,
        uint32_t        len,
        ulint           zip_size,
        page_id_t       id,
        uint32_t        offset)
{
        ulint           page_type = FIL_PAGE_TYPE_ZBLOB;
        mem_heap_t*     heap;
        int             err;
        z_stream        d_stream;

        d_stream.next_out = buf;
        d_stream.avail_out = static_cast<uInt>(len);
        d_stream.next_in = Z_NULL;
        d_stream.avail_in = 0;

        /* Zlib inflate needs 32 kilobytes for the default
        window size, plus a few kilobytes for small objects. */
        heap = mem_heap_create(40000);
        page_zip_set_alloc(&d_stream, heap);

        ut_ad(zip_size);
        ut_ad(ut_is_2pow(zip_size));
        ut_ad(id.space());

        err = inflateInit(&d_stream);
        ut_a(err == Z_OK);

        for (;;) {
                buf_page_t*     bpage;
                uint32_t        next_page_no;

                /* There is no latch on bpage directly. Instead,
                bpage is protected by the B-tree page latch that
                is being held on the clustered index record, or,
                in row_merge_copy_blobs(), by an exclusive table lock. */
                bpage = buf_page_get_zip(id, zip_size);

                if (UNIV_UNLIKELY(!bpage)) {
                        ib::error() << "Cannot load compressed BLOB " << id;
                        goto func_exit;
                }

                if (UNIV_UNLIKELY
                    (fil_page_get_type(bpage->zip.data) != page_type)) {

                        ib::error() << "Unexpected type "
                                << fil_page_get_type(bpage->zip.data)
                                << " of compressed BLOB page " << id;

                        ut_ad(0);
                        goto end_of_blob;
                }

                next_page_no = mach_read_from_4(bpage->zip.data + offset);

                if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
                        /* When the BLOB begins at the page header,
                        the compressed data payload does not
                        immediately follow the next page pointer. */
                        offset = FIL_PAGE_DATA;
                } else {
                        offset += 4;
                }

                d_stream.next_in = bpage->zip.data + offset;
                d_stream.avail_in = uInt(zip_size - offset);

                err = inflate(&d_stream, Z_NO_FLUSH);
                switch (err) {
                case Z_OK:
                        if (!d_stream.avail_out) {
                                goto end_of_blob;
                        }
                        break;
                case Z_STREAM_END:
                        if (next_page_no == FIL_NULL) {
                                goto end_of_blob;
                        }
                        /* fall through */
                default:
inflate_error:
                        ib::error() << "inflate() of compressed BLOB page "
                                << id
                                << " returned " << err
                                << " (" << d_stream.msg << ")";

                case Z_BUF_ERROR:
                        goto end_of_blob;
                }

                if (next_page_no == FIL_NULL) {
                        if (!d_stream.avail_in) {
                                ib::error()
                                        << "Unexpected end of compressed "
                                        << "BLOB page " << id;
                        } else {
                                err = inflate(&d_stream, Z_FINISH);
                                switch (err) {
                                case Z_STREAM_END:
                                case Z_BUF_ERROR:
                                        break;
                                default:
                                        goto inflate_error;
                                }
                        }

end_of_blob:
                        buf_page_release_zip(bpage);
                        goto func_exit;
                }

                buf_page_release_zip(bpage);

                /* On all BLOB pages except the first,
                the BLOB header is at the page header: */

                id.set_page_no(next_page_no);
                offset = FIL_PAGE_NEXT;
                page_type = FIL_PAGE_TYPE_ZBLOB2;
        }

func_exit:
        inflateEnd(&d_stream);
        mem_heap_free(heap);
        MEM_CHECK_DEFINED(buf, d_stream.total_out);
        return(d_stream.total_out);
}

/** Copies the prefix of an externally stored field of a record.
The clustered index record that points to this BLOB must be protected
by a lock or a page latch.
@param[out]     buf             the externally stored part of the
field, or a prefix of it
@param[in]      len             length of buf, in bytes
@param[in]      zip_size        ROW_FORMAT=COMPRESSED page size, or 0
@param[in]      id              page identifier of the first BLOB page
@param[in]      offset          offset on the first BLOB page
@return number of bytes written to buf */
static
ulint
btr_copy_externally_stored_field_prefix_low(
        byte*           buf,
        uint32_t        len,
        ulint           zip_size,
        page_id_t       id,
        uint32_t        offset)
{
        if (len == 0)
                return 0;

        return zip_size
                ? btr_copy_zblob_prefix(buf, len, zip_size, id, offset)
                : btr_copy_blob_prefix(buf, len, id, offset);
}

/** Copies the prefix of an externally stored field of a record.
The clustered index record must be protected by a lock or a page latch.
@param[out]     buf             the field, or a prefix of it
@param[in]      len             length of buf, in bytes
@param[in]      zip_size        ROW_FORMAT=COMPRESSED page size, or 0
@param[in]      data            'internally' stored part of the field
containing also the reference to the external part; must be protected by
a lock or a page latch
@param[in]      local_len       length of data, in bytes
@return the length of the copied field, or 0 if the column was being
or has been deleted */
ulint
btr_copy_externally_stored_field_prefix(
        byte*           buf,
        ulint           len,
        ulint           zip_size,
        const byte*     data,
        ulint           local_len)
{
        ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

        local_len -= BTR_EXTERN_FIELD_REF_SIZE;

        if (UNIV_UNLIKELY(local_len >= len)) {
                memcpy(buf, data, len);
                return(len);
        }

        memcpy(buf, data, local_len);
        data += local_len;

        ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));

        if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) {
                /* The externally stored part of the column has been
                (partially) deleted. Signal the half-deleted BLOB
                to the caller. */

                return(0);
        }

        uint32_t space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID);
        uint32_t page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO);
        uint32_t offset = mach_read_from_4(data + BTR_EXTERN_OFFSET);
        len -= local_len;

        return(local_len
               + btr_copy_externally_stored_field_prefix_low(buf + local_len,
                                                             uint32_t(len),
                                                             zip_size,
                                                             page_id_t(
                                                                     space_id,
                                                                     page_no),
                                                             offset));
}
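
/* Worked example (illustrative): suppose a column stores a 700-byte
local prefix plus the 20-byte field reference, so local_len = 720 on
entry. After subtracting BTR_EXTERN_FIELD_REF_SIZE, 700 local bytes
remain. A caller asking for len = 1000 bytes gets the 700 local bytes
copied first, and the remaining 300 bytes are fetched from the first
BLOB page via btr_copy_externally_stored_field_prefix_low(). A caller
asking for len <= 700 is served from the local prefix alone. */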

/** Copies an externally stored field of a record to mem heap.
The clustered index record must be protected by a lock or a page latch.
@param[out]     len             length of the whole field
@param[in]      data            'internally' stored part of the field
containing also the reference to the external part; must be protected by
a lock or a page latch
@param[in]      zip_size        ROW_FORMAT=COMPRESSED page size, or 0
@param[in]      local_len       length of data
@param[in,out]  heap            mem heap
@return the whole field copied to heap */
byte*
btr_copy_externally_stored_field(
        ulint*          len,
        const byte*     data,
        ulint           zip_size,
        ulint           local_len,
        mem_heap_t*     heap)
{
        byte*   buf;

        ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

        local_len -= BTR_EXTERN_FIELD_REF_SIZE;

        uint32_t space_id = mach_read_from_4(data + local_len
                                             + BTR_EXTERN_SPACE_ID);
        uint32_t page_no = mach_read_from_4(data + local_len
                                            + BTR_EXTERN_PAGE_NO);
        uint32_t offset = mach_read_from_4(data + local_len
                                           + BTR_EXTERN_OFFSET);

        /* Currently a BLOB cannot be bigger than 4 GB; we
        leave the 4 upper bytes in the length field unused */

        uint32_t extern_len = mach_read_from_4(data + local_len
                                               + BTR_EXTERN_LEN + 4);

        buf = (byte*) mem_heap_alloc(heap, local_len + extern_len);

        memcpy(buf, data, local_len);
        *len = local_len
                + btr_copy_externally_stored_field_prefix_low(buf + local_len,
                                                              extern_len,
                                                              zip_size,
                                                              page_id_t(
                                                                      space_id,
                                                                      page_no),
                                                              offset);

        return(buf);
}

/** Copies an externally stored field of a record to mem heap.
@param[in]      rec             record in a clustered index; must be
protected by a lock or a page latch
@param[in]      offsets         array returned by rec_get_offsets()
@param[in]      zip_size        ROW_FORMAT=COMPRESSED page size, or 0
@param[in]      no              field number
@param[out]     len             length of the field
@param[in,out]  heap            mem heap
@return the field copied to heap, or NULL if the field is incomplete */
byte*
btr_rec_copy_externally_stored_field(
        const rec_t*    rec,
        const rec_offs* offsets,
        ulint           zip_size,
        ulint           no,
        ulint*          len,
        mem_heap_t*     heap)
{
        ulint           local_len;
        const byte*     data;

        ut_a(rec_offs_nth_extern(offsets, no));

        /* An externally stored field can contain some initial
        data from the field, and in the last 20 bytes it has the
        space id, page number, and offset where the rest of the
        field data is stored, and the data length in addition to
        the data stored locally. We may need to store some data
        locally to get the local record length above the 128 byte
        limit so that field offsets are stored in two bytes, and
        the extern bit is available in those two bytes. */

        data = rec_get_nth_field(rec, offsets, no, &local_len);

        ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

        if (UNIV_UNLIKELY
            (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE,
                     field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) {
                /* The externally stored field was not written yet.
                This record should only be seen by
                trx_rollback_recovered() or any
                TRX_ISO_READ_UNCOMMITTED transactions. */
                return(NULL);
        }

        return(btr_copy_externally_stored_field(len, data,
                                                zip_size, local_len, heap));
}

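/* Usage sketch (illustrative; 'zip_size' and 'field_no' are assumed to
come from the caller's context, e.g. the table's ROW_FORMAT=COMPRESSED
page size, or 0, and the position of the BLOB column in the clustered
index record):

        mem_heap_t*     heap = mem_heap_create(1024);
        ulint           len;
        const byte*     field = btr_rec_copy_externally_stored_field(
                rec, offsets, zip_size, field_no, &len, heap);

        if (field == NULL) {
                // incomplete BLOB: not yet written, or already freed
        } else {
                // use field[0 .. len - 1]
        }

        mem_heap_free(heap);
*/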