1 /*****************************************************************************
2
3 Copyright (c) 1997, 2020, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2008, Google Inc.
5
6 Portions of this file contain modifications contributed and copyrighted by
7 Google, Inc. Those modifications are gratefully acknowledged and are described
8 briefly in the InnoDB documentation. The contributions by Google are
9 incorporated with their permission, and subject to the conditions contained in
10 the file COPYING.Google.
11
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License, version 2.0,
14 as published by the Free Software Foundation.
15
16 This program is also distributed with certain software (including
17 but not limited to OpenSSL) that is licensed under separate terms,
18 as designated in a particular file or component or in included license
19 documentation. The authors of MySQL hereby grant you an additional
20 permission to link the program and your derivative works with the
21 separately licensed software that they have included with MySQL.
22
23 This program is distributed in the hope that it will be useful,
24 but WITHOUT ANY WARRANTY; without even the implied warranty of
25 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26 GNU General Public License, version 2.0, for more details.
27
28 You should have received a copy of the GNU General Public License along with
29 this program; if not, write to the Free Software Foundation, Inc.,
30 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
31
32 *****************************************************************************/
33
34 /***************************************************//**
35 @file row/row0sel.cc
36 Select
37
38 Created 12/19/1997 Heikki Tuuri
39 *******************************************************/
40
41 #include "row0sel.h"
42
43 #ifdef UNIV_NONINL
44 #include "row0sel.ic"
45 #endif
46
47 #include "dict0dict.h"
48 #include "dict0boot.h"
49 #include "trx0undo.h"
50 #include "trx0trx.h"
51 #include "btr0btr.h"
52 #include "btr0cur.h"
53 #include "btr0sea.h"
54 #include "mach0data.h"
55 #include "que0que.h"
56 #include "row0upd.h"
57 #include "row0row.h"
58 #include "row0vers.h"
59 #include "rem0cmp.h"
60 #include "lock0lock.h"
61 #include "eval0eval.h"
62 #include "pars0sym.h"
63 #include "pars0pars.h"
64 #include "row0mysql.h"
65 #include "read0read.h"
66 #include "buf0lru.h"
67 #include "ha_prototypes.h"
68 #include "m_string.h" /* for my_sys.h */
69 #include "my_sys.h" /* DEBUG_SYNC_C */
70
71 #include "my_compare.h" /* enum icp_result */
72 #include "thr_lock.h"
73 #include "handler.h"
74 #include "ha_innodb.h"
75
76 /* Maximum number of rows to prefetch; MySQL interface has another parameter */
77 #define SEL_MAX_N_PREFETCH 16
78
79 /* Number of rows fetched, after which to start prefetching; MySQL interface
80 has another parameter */
81 #define SEL_PREFETCH_LIMIT 1
82
83 /* When a select has accessed about this many pages, it returns control back
84 to que_run_threads: this is to allow canceling runaway queries */
85
86 #define SEL_COST_LIMIT 100
87
88 /* Flags for search shortcut */
89 #define SEL_FOUND 0
90 #define SEL_EXHAUSTED 1
91 #define SEL_RETRY 2
92
93 /********************************************************************//**
94 Returns TRUE if the user-defined column in a secondary index record
95 is alphabetically the same as the corresponding BLOB column in the clustered
96 index record.
97 NOTE: the comparison is NOT done as a binary comparison, but character
98 fields are compared with collation!
99 @return TRUE if the columns are equal */
100 static
101 ibool
row_sel_sec_rec_is_for_blob(ulint mtype,ulint prtype,ulint mbminmaxlen,const byte * clust_field,ulint clust_len,const byte * sec_field,ulint sec_len,ulint prefix_len,dict_table_t * table)102 row_sel_sec_rec_is_for_blob(
103 /*========================*/
104 ulint mtype, /*!< in: main type */
105 ulint prtype, /*!< in: precise type */
106 ulint mbminmaxlen, /*!< in: minimum and maximum length of
107 a multi-byte character */
108 const byte* clust_field, /*!< in: the locally stored part of
109 the clustered index column, including
110 the BLOB pointer; the clustered
111 index record must be covered by
112 a lock or a page latch to protect it
113 against deletion (rollback or purge) */
114 ulint clust_len, /*!< in: length of clust_field */
115 const byte* sec_field, /*!< in: column in secondary index */
116 ulint sec_len, /*!< in: length of sec_field */
117 ulint prefix_len, /*!< in: index column prefix length
118 in bytes */
119 dict_table_t* table) /*!< in: table */
120 {
121 ulint len;
122 byte buf[REC_VERSION_56_MAX_INDEX_COL_LEN];
123 ulint zip_size = dict_tf_get_zip_size(table->flags);
124
125 /* This function should never be invoked on an Antelope format
126 table, because they should always contain enough prefix in the
127 clustered index record. */
128 ut_ad(dict_table_get_format(table) >= UNIV_FORMAT_B);
129 ut_a(clust_len >= BTR_EXTERN_FIELD_REF_SIZE);
130 ut_ad(prefix_len >= sec_len);
131 ut_ad(prefix_len > 0);
132 ut_a(prefix_len <= sizeof buf);
133
134 if (UNIV_UNLIKELY
135 (!memcmp(clust_field + clust_len - BTR_EXTERN_FIELD_REF_SIZE,
136 field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) {
137 /* The externally stored field was not written yet.
138 This record should only be seen by
139 recv_recovery_rollback_active() or any
140 TRX_ISO_READ_UNCOMMITTED transactions. */
141 return(FALSE);
142 }
143
144 len = btr_copy_externally_stored_field_prefix(buf, prefix_len,
145 zip_size,
146 clust_field, clust_len);
147
148 if (UNIV_UNLIKELY(len == 0)) {
149 /* The BLOB was being deleted as the server crashed.
150 There should not be any secondary index records
151 referring to this clustered index record, because
152 btr_free_externally_stored_field() is called after all
153 secondary index entries of the row have been purged. */
154 return(FALSE);
155 }
156
157 len = dtype_get_at_most_n_mbchars(prtype, mbminmaxlen,
158 prefix_len, len, (const char*) buf);
159
160 return(!cmp_data_data(mtype, prtype, buf, len, sec_field, sec_len));
161 }
162
/********************************************************************//**
Returns TRUE if the user-defined column values in a secondary index record
are alphabetically the same as the corresponding columns in the clustered
index record.
NOTE: the comparison is NOT done as a binary comparison, but character
fields are compared with collation!
@return TRUE if the secondary record is equal to the corresponding
fields in the clustered record, when compared with collation;
FALSE if not equal or if the clustered record has been marked for deletion */
static
ibool
row_sel_sec_rec_is_for_clust_rec(
/*=============================*/
	const rec_t*	sec_rec,	/*!< in: secondary index record */
	dict_index_t*	sec_index,	/*!< in: secondary index */
	const rec_t*	clust_rec,	/*!< in: clustered index record;
					must be protected by a lock or
					a page latch against deletion
					in rollback or purge */
	dict_index_t*	clust_index)	/*!< in: clustered index */
{
	const byte*	sec_field;
	ulint		sec_len;
	const byte*	clust_field;
	ulint		n;
	ulint		i;
	mem_heap_t*	heap		= NULL;
	ulint		clust_offsets_[REC_OFFS_NORMAL_SIZE];
	ulint		sec_offsets_[REC_OFFS_SMALL_SIZE];
	ulint*		clust_offs	= clust_offsets_;
	ulint*		sec_offs	= sec_offsets_;
	ibool		is_equal	= TRUE;

	rec_offs_init(clust_offsets_);
	rec_offs_init(sec_offsets_);

	if (rec_get_deleted_flag(clust_rec,
				 dict_table_is_comp(clust_index->table))) {

		/* The clustered index record is delete-marked;
		it is not visible in the read view.  Besides,
		if there are any externally stored columns,
		some of them may have already been purged. */
		return(FALSE);
	}

	clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs,
				     ULINT_UNDEFINED, &heap);
	sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs,
				   ULINT_UNDEFINED, &heap);

	/* Only the user-defined ordering columns of the secondary
	index need to be compared. */
	n = dict_index_get_n_ordering_defined_by_user(sec_index);

	for (i = 0; i < n; i++) {
		const dict_field_t*	ifield;
		const dict_col_t*	col;
		ulint			clust_pos;
		ulint			clust_len;
		ulint			len;

		/* Map the i-th secondary index field to its position
		in the clustered index record. */
		ifield = dict_index_get_nth_field(sec_index, i);
		col = dict_field_get_col(ifield);
		clust_pos = dict_col_get_clust_pos(col, clust_index);

		clust_field = rec_get_nth_field(
			clust_rec, clust_offs, clust_pos, &clust_len);
		sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len);

		len = clust_len;

		if (ifield->prefix_len > 0 && len != UNIV_SQL_NULL
		    && sec_len != UNIV_SQL_NULL) {

			/* The secondary index stores only a prefix of
			the column; truncate the clustered value to
			the same prefix before comparing. */

			if (rec_offs_nth_extern(clust_offs, clust_pos)) {
				/* Exclude the 20-byte BLOB pointer at
				the end of the locally stored part of
				an externally stored column. */
				len -= BTR_EXTERN_FIELD_REF_SIZE;
			}

			len = dtype_get_at_most_n_mbchars(
				col->prtype, col->mbminmaxlen,
				ifield->prefix_len, len, (char*) clust_field);

			if (rec_offs_nth_extern(clust_offs, clust_pos)
			    && len < sec_len) {
				/* The locally stored part of the
				clustered column is shorter than the
				secondary index prefix; the rest must
				be fetched from the external BLOB. */
				if (!row_sel_sec_rec_is_for_blob(
					    col->mtype, col->prtype,
					    col->mbminmaxlen,
					    clust_field, clust_len,
					    sec_field, sec_len,
					    ifield->prefix_len,
					    clust_index->table)) {
					goto inequal;
				}

				continue;
			}
		}

		if (0 != cmp_data_data(col->mtype, col->prtype,
				       clust_field, len,
				       sec_field, sec_len)) {
inequal:
			is_equal = FALSE;
			goto func_exit;
		}
	}

func_exit:
	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
	return(is_equal);
}
275
276 /*********************************************************************//**
277 Creates a select node struct.
278 @return own: select node struct */
279 UNIV_INTERN
280 sel_node_t*
sel_node_create(mem_heap_t * heap)281 sel_node_create(
282 /*============*/
283 mem_heap_t* heap) /*!< in: memory heap where created */
284 {
285 sel_node_t* node;
286
287 node = static_cast<sel_node_t*>(
288 mem_heap_alloc(heap, sizeof(sel_node_t)));
289
290 node->common.type = QUE_NODE_SELECT;
291 node->state = SEL_NODE_OPEN;
292
293 node->plans = NULL;
294
295 return(node);
296 }
297
298 /*********************************************************************//**
299 Frees the memory private to a select node when a query graph is freed,
300 does not free the heap where the node was originally created. */
301 UNIV_INTERN
302 void
sel_node_free_private(sel_node_t * node)303 sel_node_free_private(
304 /*==================*/
305 sel_node_t* node) /*!< in: select node struct */
306 {
307 ulint i;
308 plan_t* plan;
309
310 if (node->plans != NULL) {
311 for (i = 0; i < node->n_tables; i++) {
312 plan = sel_node_get_nth_plan(node, i);
313
314 btr_pcur_close(&(plan->pcur));
315 btr_pcur_close(&(plan->clust_pcur));
316
317 if (plan->old_vers_heap) {
318 mem_heap_free(plan->old_vers_heap);
319 }
320 }
321 }
322 }
323
324 /*********************************************************************//**
325 Evaluates the values in a select list. If there are aggregate functions,
326 their argument value is added to the aggregate total. */
327 UNIV_INLINE
328 void
sel_eval_select_list(sel_node_t * node)329 sel_eval_select_list(
330 /*=================*/
331 sel_node_t* node) /*!< in: select node */
332 {
333 que_node_t* exp;
334
335 exp = node->select_list;
336
337 while (exp) {
338 eval_exp(exp);
339
340 exp = que_node_get_next(exp);
341 }
342 }
343
344 /*********************************************************************//**
345 Assigns the values in the select list to the possible into-variables in
346 SELECT ... INTO ... */
347 UNIV_INLINE
348 void
sel_assign_into_var_values(sym_node_t * var,sel_node_t * node)349 sel_assign_into_var_values(
350 /*=======================*/
351 sym_node_t* var, /*!< in: first variable in a list of
352 variables */
353 sel_node_t* node) /*!< in: select node */
354 {
355 que_node_t* exp;
356
357 if (var == NULL) {
358
359 return;
360 }
361
362 for (exp = node->select_list;
363 var != 0;
364 var = static_cast<sym_node_t*>(que_node_get_next(var))) {
365
366 ut_ad(exp);
367
368 eval_node_copy_val(var->alias, exp);
369
370 exp = que_node_get_next(exp);
371 }
372 }
373
374 /*********************************************************************//**
375 Resets the aggregate value totals in the select list of an aggregate type
376 query. */
377 UNIV_INLINE
378 void
sel_reset_aggregate_vals(sel_node_t * node)379 sel_reset_aggregate_vals(
380 /*=====================*/
381 sel_node_t* node) /*!< in: select node */
382 {
383 func_node_t* func_node;
384
385 ut_ad(node->is_aggregate);
386
387 for (func_node = static_cast<func_node_t*>(node->select_list);
388 func_node != 0;
389 func_node = static_cast<func_node_t*>(
390 que_node_get_next(func_node))) {
391
392 eval_node_set_int_val(func_node, 0);
393 }
394
395 node->aggregate_already_fetched = FALSE;
396 }
397
398 /*********************************************************************//**
399 Copies the input variable values when an explicit cursor is opened. */
400 UNIV_INLINE
401 void
row_sel_copy_input_variable_vals(sel_node_t * node)402 row_sel_copy_input_variable_vals(
403 /*=============================*/
404 sel_node_t* node) /*!< in: select node */
405 {
406 sym_node_t* var;
407
408 var = UT_LIST_GET_FIRST(node->copy_variables);
409
410 while (var) {
411 eval_node_copy_val(var, var->alias);
412
413 var->indirection = NULL;
414
415 var = UT_LIST_GET_NEXT(col_var_list, var);
416 }
417 }
418
419 /*********************************************************************//**
420 Fetches the column values from a record. */
421 static
422 void
row_sel_fetch_columns(dict_index_t * index,const rec_t * rec,const ulint * offsets,sym_node_t * column)423 row_sel_fetch_columns(
424 /*==================*/
425 dict_index_t* index, /*!< in: record index */
426 const rec_t* rec, /*!< in: record in a clustered or non-clustered
427 index; must be protected by a page latch */
428 const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
429 sym_node_t* column) /*!< in: first column in a column list, or
430 NULL */
431 {
432 dfield_t* val;
433 ulint index_type;
434 ulint field_no;
435 const byte* data;
436 ulint len;
437
438 ut_ad(rec_offs_validate(rec, index, offsets));
439
440 if (dict_index_is_clust(index)) {
441 index_type = SYM_CLUST_FIELD_NO;
442 } else {
443 index_type = SYM_SEC_FIELD_NO;
444 }
445
446 while (column) {
447 mem_heap_t* heap = NULL;
448 ibool needs_copy;
449
450 field_no = column->field_nos[index_type];
451
452 if (field_no != ULINT_UNDEFINED) {
453
454 if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets,
455 field_no))) {
456
457 /* Copy an externally stored field to the
458 temporary heap, if possible. */
459
460 heap = mem_heap_create(1);
461
462 data = btr_rec_copy_externally_stored_field(
463 rec, offsets,
464 dict_table_zip_size(index->table),
465 field_no, &len, heap);
466
467 /* data == NULL means that the
468 externally stored field was not
469 written yet. This record
470 should only be seen by
471 recv_recovery_rollback_active() or any
472 TRX_ISO_READ_UNCOMMITTED
473 transactions. The InnoDB SQL parser
474 (the sole caller of this function)
475 does not implement READ UNCOMMITTED,
476 and it is not involved during rollback. */
477 ut_a(data);
478 ut_a(len != UNIV_SQL_NULL);
479
480 needs_copy = TRUE;
481 } else {
482 data = rec_get_nth_field(rec, offsets,
483 field_no, &len);
484
485 needs_copy = column->copy_val;
486 }
487
488 if (needs_copy) {
489 eval_node_copy_and_alloc_val(column, data,
490 len);
491 } else {
492 val = que_node_get_val(column);
493 dfield_set_data(val, data, len);
494 }
495
496 if (UNIV_LIKELY_NULL(heap)) {
497 mem_heap_free(heap);
498 }
499 }
500
501 column = UT_LIST_GET_NEXT(col_var_list, column);
502 }
503 }
504
505 /*********************************************************************//**
506 Allocates a prefetch buffer for a column when prefetch is first time done. */
507 static
508 void
sel_col_prefetch_buf_alloc(sym_node_t * column)509 sel_col_prefetch_buf_alloc(
510 /*=======================*/
511 sym_node_t* column) /*!< in: symbol table node for a column */
512 {
513 sel_buf_t* sel_buf;
514 ulint i;
515
516 ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL);
517
518 column->prefetch_buf = static_cast<sel_buf_t*>(
519 mem_alloc(SEL_MAX_N_PREFETCH * sizeof(sel_buf_t)));
520
521 for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
522 sel_buf = column->prefetch_buf + i;
523
524 sel_buf->data = NULL;
525 sel_buf->len = 0;
526 sel_buf->val_buf_size = 0;
527 }
528 }
529
530 /*********************************************************************//**
531 Frees a prefetch buffer for a column, including the dynamically allocated
532 memory for data stored there. */
533 UNIV_INTERN
534 void
sel_col_prefetch_buf_free(sel_buf_t * prefetch_buf)535 sel_col_prefetch_buf_free(
536 /*======================*/
537 sel_buf_t* prefetch_buf) /*!< in, own: prefetch buffer */
538 {
539 sel_buf_t* sel_buf;
540 ulint i;
541
542 for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
543 sel_buf = prefetch_buf + i;
544
545 if (sel_buf->val_buf_size > 0) {
546
547 mem_free(sel_buf->data);
548 }
549 }
550
551 mem_free(prefetch_buf);
552 }
553
554 /*********************************************************************//**
555 Pops the column values for a prefetched, cached row from the column prefetch
556 buffers and places them to the val fields in the column nodes. */
557 static
558 void
sel_dequeue_prefetched_row(plan_t * plan)559 sel_dequeue_prefetched_row(
560 /*=======================*/
561 plan_t* plan) /*!< in: plan node for a table */
562 {
563 sym_node_t* column;
564 sel_buf_t* sel_buf;
565 dfield_t* val;
566 byte* data;
567 ulint len;
568 ulint val_buf_size;
569
570 ut_ad(plan->n_rows_prefetched > 0);
571
572 column = UT_LIST_GET_FIRST(plan->columns);
573
574 while (column) {
575 val = que_node_get_val(column);
576
577 if (!column->copy_val) {
578 /* We did not really push any value for the
579 column */
580
581 ut_ad(!column->prefetch_buf);
582 ut_ad(que_node_get_val_buf_size(column) == 0);
583 ut_d(dfield_set_null(val));
584
585 goto next_col;
586 }
587
588 ut_ad(column->prefetch_buf);
589 ut_ad(!dfield_is_ext(val));
590
591 sel_buf = column->prefetch_buf + plan->first_prefetched;
592
593 data = sel_buf->data;
594 len = sel_buf->len;
595 val_buf_size = sel_buf->val_buf_size;
596
597 /* We must keep track of the allocated memory for
598 column values to be able to free it later: therefore
599 we swap the values for sel_buf and val */
600
601 sel_buf->data = static_cast<byte*>(dfield_get_data(val));
602 sel_buf->len = dfield_get_len(val);
603 sel_buf->val_buf_size = que_node_get_val_buf_size(column);
604
605 dfield_set_data(val, data, len);
606 que_node_set_val_buf_size(column, val_buf_size);
607 next_col:
608 column = UT_LIST_GET_NEXT(col_var_list, column);
609 }
610
611 plan->n_rows_prefetched--;
612
613 plan->first_prefetched++;
614 }
615
616 /*********************************************************************//**
617 Pushes the column values for a prefetched, cached row to the column prefetch
618 buffers from the val fields in the column nodes. */
619 UNIV_INLINE
620 void
sel_enqueue_prefetched_row(plan_t * plan)621 sel_enqueue_prefetched_row(
622 /*=======================*/
623 plan_t* plan) /*!< in: plan node for a table */
624 {
625 sym_node_t* column;
626 sel_buf_t* sel_buf;
627 dfield_t* val;
628 byte* data;
629 ulint len;
630 ulint pos;
631 ulint val_buf_size;
632
633 if (plan->n_rows_prefetched == 0) {
634 pos = 0;
635 plan->first_prefetched = 0;
636 } else {
637 pos = plan->n_rows_prefetched;
638
639 /* We have the convention that pushing new rows starts only
640 after the prefetch stack has been emptied: */
641
642 ut_ad(plan->first_prefetched == 0);
643 }
644
645 plan->n_rows_prefetched++;
646
647 ut_ad(pos < SEL_MAX_N_PREFETCH);
648
649 for (column = UT_LIST_GET_FIRST(plan->columns);
650 column != 0;
651 column = UT_LIST_GET_NEXT(col_var_list, column)) {
652
653 if (!column->copy_val) {
654 /* There is no sense to push pointers to database
655 page fields when we do not keep latch on the page! */
656 continue;
657 }
658
659 if (!column->prefetch_buf) {
660 /* Allocate a new prefetch buffer */
661
662 sel_col_prefetch_buf_alloc(column);
663 }
664
665 sel_buf = column->prefetch_buf + pos;
666
667 val = que_node_get_val(column);
668
669 data = static_cast<byte*>(dfield_get_data(val));
670 len = dfield_get_len(val);
671 val_buf_size = que_node_get_val_buf_size(column);
672
673 /* We must keep track of the allocated memory for
674 column values to be able to free it later: therefore
675 we swap the values for sel_buf and val */
676
677 dfield_set_data(val, sel_buf->data, sel_buf->len);
678 que_node_set_val_buf_size(column, sel_buf->val_buf_size);
679
680 sel_buf->data = data;
681 sel_buf->len = len;
682 sel_buf->val_buf_size = val_buf_size;
683 }
684 }
685
686 /*********************************************************************//**
687 Builds a previous version of a clustered index record for a consistent read
688 @return DB_SUCCESS or error code */
689 static MY_ATTRIBUTE((nonnull, warn_unused_result))
690 dberr_t
row_sel_build_prev_vers(read_view_t * read_view,dict_index_t * index,rec_t * rec,ulint ** offsets,mem_heap_t ** offset_heap,mem_heap_t ** old_vers_heap,rec_t ** old_vers,mtr_t * mtr)691 row_sel_build_prev_vers(
692 /*====================*/
693 read_view_t* read_view, /*!< in: read view */
694 dict_index_t* index, /*!< in: plan node for table */
695 rec_t* rec, /*!< in: record in a clustered index */
696 ulint** offsets, /*!< in/out: offsets returned by
697 rec_get_offsets(rec, plan->index) */
698 mem_heap_t** offset_heap, /*!< in/out: memory heap from which
699 the offsets are allocated */
700 mem_heap_t** old_vers_heap, /*!< out: old version heap to use */
701 rec_t** old_vers, /*!< out: old version, or NULL if the
702 record does not exist in the view:
703 i.e., it was freshly inserted
704 afterwards */
705 mtr_t* mtr) /*!< in: mtr */
706 {
707 dberr_t err;
708
709 if (*old_vers_heap) {
710 mem_heap_empty(*old_vers_heap);
711 } else {
712 *old_vers_heap = mem_heap_create(512);
713 }
714
715 err = row_vers_build_for_consistent_read(
716 rec, mtr, index, offsets, read_view, offset_heap,
717 *old_vers_heap, old_vers);
718 return(err);
719 }
720
721 /*********************************************************************//**
722 Builds the last committed version of a clustered index record for a
723 semi-consistent read. */
724 static MY_ATTRIBUTE((nonnull))
725 void
row_sel_build_committed_vers_for_mysql(dict_index_t * clust_index,row_prebuilt_t * prebuilt,const rec_t * rec,ulint ** offsets,mem_heap_t ** offset_heap,const rec_t ** old_vers,mtr_t * mtr)726 row_sel_build_committed_vers_for_mysql(
727 /*===================================*/
728 dict_index_t* clust_index, /*!< in: clustered index */
729 row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
730 const rec_t* rec, /*!< in: record in a clustered index */
731 ulint** offsets, /*!< in/out: offsets returned by
732 rec_get_offsets(rec, clust_index) */
733 mem_heap_t** offset_heap, /*!< in/out: memory heap from which
734 the offsets are allocated */
735 const rec_t** old_vers, /*!< out: old version, or NULL if the
736 record does not exist in the view:
737 i.e., it was freshly inserted
738 afterwards */
739 mtr_t* mtr) /*!< in: mtr */
740 {
741 if (prebuilt->old_vers_heap) {
742 mem_heap_empty(prebuilt->old_vers_heap);
743 } else {
744 prebuilt->old_vers_heap = mem_heap_create(
745 rec_offs_size(*offsets));
746 }
747
748 row_vers_build_for_semi_consistent_read(
749 rec, mtr, clust_index, offsets, offset_heap,
750 prebuilt->old_vers_heap, old_vers);
751 }
752
753 /*********************************************************************//**
754 Tests the conditions which determine when the index segment we are searching
755 through has been exhausted.
756 @return TRUE if row passed the tests */
757 UNIV_INLINE
758 ibool
row_sel_test_end_conds(plan_t * plan)759 row_sel_test_end_conds(
760 /*===================*/
761 plan_t* plan) /*!< in: plan for the table; the column values must
762 already have been retrieved and the right sides of
763 comparisons evaluated */
764 {
765 func_node_t* cond;
766
767 /* All conditions in end_conds are comparisons of a column to an
768 expression */
769
770 for (cond = UT_LIST_GET_FIRST(plan->end_conds);
771 cond != 0;
772 cond = UT_LIST_GET_NEXT(cond_list, cond)) {
773
774 /* Evaluate the left side of the comparison, i.e., get the
775 column value if there is an indirection */
776
777 eval_sym(static_cast<sym_node_t*>(cond->args));
778
779 /* Do the comparison */
780
781 if (!eval_cmp(cond)) {
782
783 return(FALSE);
784 }
785 }
786
787 return(TRUE);
788 }
789
790 /*********************************************************************//**
791 Tests the other conditions.
792 @return TRUE if row passed the tests */
793 UNIV_INLINE
794 ibool
row_sel_test_other_conds(plan_t * plan)795 row_sel_test_other_conds(
796 /*=====================*/
797 plan_t* plan) /*!< in: plan for the table; the column values must
798 already have been retrieved */
799 {
800 func_node_t* cond;
801
802 cond = UT_LIST_GET_FIRST(plan->other_conds);
803
804 while (cond) {
805 eval_exp(cond);
806
807 if (!eval_node_get_ibool_val(cond)) {
808
809 return(FALSE);
810 }
811
812 cond = UT_LIST_GET_NEXT(cond_list, cond);
813 }
814
815 return(TRUE);
816 }
817
/*********************************************************************//**
Retrieves the clustered index record corresponding to a record in a
non-clustered index. Does the necessary locking.
@return DB_SUCCESS or error code */
static MY_ATTRIBUTE((nonnull, warn_unused_result))
dberr_t
row_sel_get_clust_rec(
/*==================*/
	sel_node_t*	node,	/*!< in: select_node */
	plan_t*		plan,	/*!< in: plan node for table */
	rec_t*		rec,	/*!< in: record in a non-clustered index */
	que_thr_t*	thr,	/*!< in: query thread */
	rec_t**		out_rec,/*!< out: clustered record or an old version of
				it, NULL if the old version did not exist
				in the read view, i.e., it was a fresh
				inserted version */
	mtr_t*		mtr)	/*!< in: mtr used to get access to the
				non-clustered record; the same mtr is used to
				access the clustered index */
{
	dict_index_t*	index;
	rec_t*		clust_rec;
	rec_t*		old_vers;
	dberr_t		err;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	rec_offs_init(offsets_);

	*out_rec = NULL;

	offsets = rec_get_offsets(rec,
				  btr_pcur_get_btr_cur(&plan->pcur)->index,
				  offsets, ULINT_UNDEFINED, &heap);

	/* Build the clustered index search tuple from the secondary
	index record, and position plan->clust_pcur on the clustered
	index with it. */

	row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets);

	index = dict_table_get_first_index(plan->table);

	btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE,
				   BTR_SEARCH_LEAF, &plan->clust_pcur,
				   0, mtr);

	clust_rec = btr_pcur_get_rec(&(plan->clust_pcur));

	/* Note: only if the search ends up on a non-infimum record is the
	low_match value the real match to the search tuple */

	if (!page_rec_is_user_rec(clust_rec)
	    || btr_pcur_get_low_match(&(plan->clust_pcur))
	    < dict_index_get_n_unique(index)) {

		ut_a(rec_get_deleted_flag(rec,
					  dict_table_is_comp(plan->table)));
		ut_a(node->read_view);

		/* In a rare case it is possible that no clust rec is found
		for a delete-marked secondary index record: if in row0umod.cc
		in row_undo_mod_remove_clust_low() we have already removed
		the clust rec, while purge is still cleaning and removing
		secondary index records associated with earlier versions of
		the clustered index record. In that case we know that the
		clustered index record did not exist in the read view of
		trx. */

		goto func_exit;
	}

	offsets = rec_get_offsets(clust_rec, index, offsets,
				  ULINT_UNDEFINED, &heap);

	if (!node->read_view) {
		/* Try to place a lock on the index record */

		/* If innodb_locks_unsafe_for_binlog option is used
		or this session is using READ COMMITTED isolation level
		we lock only the record, i.e., next-key locking is
		not used. */
		ulint	lock_type;
		trx_t*	trx;

		trx = thr_get_trx(thr);

		if (srv_locks_unsafe_for_binlog
		    || trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
			lock_type = LOCK_REC_NOT_GAP;
		} else {
			lock_type = LOCK_ORDINARY;
		}

		err = lock_clust_rec_read_check_and_lock(
			0, btr_pcur_get_block(&plan->clust_pcur),
			clust_rec, index, offsets,
			static_cast<enum lock_mode>(node->row_lock_mode),
			lock_type,
			thr);

		switch (err) {
		case DB_SUCCESS:
		case DB_SUCCESS_LOCKED_REC:
			/* Declare the variable uninitialized in Valgrind.
			It should be set to DB_SUCCESS at func_exit. */
			UNIV_MEM_INVALID(&err, sizeof err);
			break;
		default:
			goto err_exit;
		}
	} else {
		/* This is a non-locking consistent read: if necessary, fetch
		a previous version of the record */

		old_vers = NULL;

		if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets,
						   node->read_view)) {

			err = row_sel_build_prev_vers(
				node->read_view, index, clust_rec,
				&offsets, &heap, &plan->old_vers_heap,
				&old_vers, mtr);

			if (err != DB_SUCCESS) {

				goto err_exit;
			}

			clust_rec = old_vers;

			if (clust_rec == NULL) {
				/* The clustered record did not exist
				in the read view: the row was freshly
				inserted after the snapshot was taken.
				*out_rec stays NULL. */
				goto func_exit;
			}
		}

		/* If we had to go to an earlier version of row or the
		secondary index record is delete marked, then it may be that
		the secondary index record corresponding to clust_rec
		(or old_vers) is not rec; in that case we must ignore
		such row because in our snapshot rec would not have existed.
		Remember that from rec we cannot see directly which transaction
		id corresponds to it: we have to go to the clustered index
		record. A query where we want to fetch all rows where
		the secondary index value is in some interval would return
		a wrong result if we would not drop rows which we come to
		visit through secondary index records that would not really
		exist in our snapshot. */

		if ((old_vers
		     || rec_get_deleted_flag(rec, dict_table_is_comp(
						     plan->table)))
		    && !row_sel_sec_rec_is_for_clust_rec(rec, plan->index,
							 clust_rec, index)) {
			goto func_exit;
		}
	}

	/* Fetch the columns needed in test conditions. The clustered
	index record is protected by a page latch that was acquired
	when plan->clust_pcur was positioned. The latch will not be
	released until mtr_commit(mtr). */

	ut_ad(!rec_get_deleted_flag(clust_rec, rec_offs_comp(offsets)));
	row_sel_fetch_columns(index, clust_rec, offsets,
			      UT_LIST_GET_FIRST(plan->columns));
	*out_rec = clust_rec;
func_exit:
	/* All paths except err_exit report success; the fall-through
	from the locking branch relies on this assignment (see the
	UNIV_MEM_INVALID note above). */
	err = DB_SUCCESS;
err_exit:
	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
	return(err);
}
990
991 /*********************************************************************//**
992 Sets a lock on a record.
993 @return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
994 UNIV_INLINE
995 dberr_t
sel_set_rec_lock(const buf_block_t * block,const rec_t * rec,dict_index_t * index,const ulint * offsets,ulint mode,ulint type,que_thr_t * thr)996 sel_set_rec_lock(
997 /*=============*/
998 const buf_block_t* block, /*!< in: buffer block of rec */
999 const rec_t* rec, /*!< in: record */
1000 dict_index_t* index, /*!< in: index */
1001 const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
1002 ulint mode, /*!< in: lock mode */
1003 ulint type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or
1004 LOC_REC_NOT_GAP */
1005 que_thr_t* thr) /*!< in: query thread */
1006 {
1007 trx_t* trx;
1008 dberr_t err;
1009
1010 trx = thr_get_trx(thr);
1011
1012 if (UT_LIST_GET_LEN(trx->lock.trx_locks) > 10000) {
1013 if (buf_LRU_buf_pool_running_out()) {
1014
1015 return(DB_LOCK_TABLE_FULL);
1016 }
1017 }
1018
1019 if (dict_index_is_clust(index)) {
1020 err = lock_clust_rec_read_check_and_lock(
1021 0, block, rec, index, offsets,
1022 static_cast<enum lock_mode>(mode), type, thr);
1023 } else {
1024 err = lock_sec_rec_read_check_and_lock(
1025 0, block, rec, index, offsets,
1026 static_cast<enum lock_mode>(mode), type, thr);
1027 }
1028
1029 return(err);
1030 }
1031
1032 /*********************************************************************//**
1033 Opens a pcur to a table index. */
1034 static
1035 void
row_sel_open_pcur(plan_t * plan,ibool search_latch_locked,mtr_t * mtr)1036 row_sel_open_pcur(
1037 /*==============*/
1038 plan_t* plan, /*!< in: table plan */
1039 ibool search_latch_locked,
1040 /*!< in: TRUE if the thread currently
1041 has the search latch locked in
1042 s-mode */
1043 mtr_t* mtr) /*!< in: mtr */
1044 {
1045 dict_index_t* index;
1046 func_node_t* cond;
1047 que_node_t* exp;
1048 ulint n_fields;
1049 ulint has_search_latch = 0; /* RW_S_LATCH or 0 */
1050 ulint i;
1051
1052 if (search_latch_locked) {
1053 has_search_latch = RW_S_LATCH;
1054 }
1055
1056 index = plan->index;
1057
1058 /* Calculate the value of the search tuple: the exact match columns
1059 get their expressions evaluated when we evaluate the right sides of
1060 end_conds */
1061
1062 cond = UT_LIST_GET_FIRST(plan->end_conds);
1063
1064 while (cond) {
1065 eval_exp(que_node_get_next(cond->args));
1066
1067 cond = UT_LIST_GET_NEXT(cond_list, cond);
1068 }
1069
1070 if (plan->tuple) {
1071 n_fields = dtuple_get_n_fields(plan->tuple);
1072
1073 if (plan->n_exact_match < n_fields) {
1074 /* There is a non-exact match field which must be
1075 evaluated separately */
1076
1077 eval_exp(plan->tuple_exps[n_fields - 1]);
1078 }
1079
1080 for (i = 0; i < n_fields; i++) {
1081 exp = plan->tuple_exps[i];
1082
1083 dfield_copy_data(dtuple_get_nth_field(plan->tuple, i),
1084 que_node_get_val(exp));
1085 }
1086
1087 /* Open pcur to the index */
1088
1089 btr_pcur_open_with_no_init(index, plan->tuple, plan->mode,
1090 BTR_SEARCH_LEAF, &plan->pcur,
1091 has_search_latch, mtr);
1092 } else {
1093 /* Open the cursor to the start or the end of the index
1094 (FALSE: no init) */
1095
1096 btr_pcur_open_at_index_side(plan->asc, index, BTR_SEARCH_LEAF,
1097 &(plan->pcur), false, 0, mtr);
1098 }
1099
1100 ut_ad(plan->n_rows_prefetched == 0);
1101 ut_ad(plan->n_rows_fetched == 0);
1102 ut_ad(plan->cursor_at_end == FALSE);
1103
1104 plan->pcur_is_open = TRUE;
1105 }
1106
1107 /*********************************************************************//**
1108 Restores a stored pcur position to a table index.
1109 @return TRUE if the cursor should be moved to the next record after we
1110 return from this function (moved to the previous, in the case of a
1111 descending cursor) without processing again the current cursor
1112 record */
1113 static
1114 ibool
row_sel_restore_pcur_pos(plan_t * plan,mtr_t * mtr)1115 row_sel_restore_pcur_pos(
1116 /*=====================*/
1117 plan_t* plan, /*!< in: table plan */
1118 mtr_t* mtr) /*!< in: mtr */
1119 {
1120 ibool equal_position;
1121 ulint relative_position;
1122
1123 ut_ad(!plan->cursor_at_end);
1124
1125 relative_position = btr_pcur_get_rel_pos(&(plan->pcur));
1126
1127 equal_position = btr_pcur_restore_position(BTR_SEARCH_LEAF,
1128 &(plan->pcur), mtr);
1129
1130 /* If the cursor is traveling upwards, and relative_position is
1131
1132 (1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock
1133 yet on the successor of the page infimum;
1134 (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
1135 first record GREATER than the predecessor of a page supremum; we have
1136 not yet processed the cursor record: no need to move the cursor to the
1137 next record;
1138 (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
1139 last record LESS or EQUAL to the old stored user record; (a) if
1140 equal_position is FALSE, this means that the cursor is now on a record
1141 less than the old user record, and we must move to the next record;
1142 (b) if equal_position is TRUE, then if
1143 plan->stored_cursor_rec_processed is TRUE, we must move to the next
1144 record, else there is no need to move the cursor. */
1145
1146 if (plan->asc) {
1147 if (relative_position == BTR_PCUR_ON) {
1148
1149 if (equal_position) {
1150
1151 return(plan->stored_cursor_rec_processed);
1152 }
1153
1154 return(TRUE);
1155 }
1156
1157 ut_ad(relative_position == BTR_PCUR_AFTER
1158 || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
1159
1160 return(FALSE);
1161 }
1162
1163 /* If the cursor is traveling downwards, and relative_position is
1164
1165 (1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on
1166 the last record LESS than the successor of a page infimum; we have not
1167 processed the cursor record: no need to move the cursor;
1168 (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
1169 first record GREATER than the predecessor of a page supremum; we have
1170 processed the cursor record: we should move the cursor to the previous
1171 record;
1172 (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
1173 last record LESS or EQUAL to the old stored user record; (a) if
1174 equal_position is FALSE, this means that the cursor is now on a record
1175 less than the old user record, and we need not move to the previous
1176 record; (b) if equal_position is TRUE, then if
1177 plan->stored_cursor_rec_processed is TRUE, we must move to the previous
1178 record, else there is no need to move the cursor. */
1179
1180 if (relative_position == BTR_PCUR_BEFORE
1181 || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE) {
1182
1183 return(FALSE);
1184 }
1185
1186 if (relative_position == BTR_PCUR_ON) {
1187
1188 if (equal_position) {
1189
1190 return(plan->stored_cursor_rec_processed);
1191 }
1192
1193 return(FALSE);
1194 }
1195
1196 ut_ad(relative_position == BTR_PCUR_AFTER
1197 || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
1198
1199 return(TRUE);
1200 }
1201
1202 /*********************************************************************//**
1203 Resets a plan cursor to a closed state. */
1204 UNIV_INLINE
1205 void
plan_reset_cursor(plan_t * plan)1206 plan_reset_cursor(
1207 /*==============*/
1208 plan_t* plan) /*!< in: plan */
1209 {
1210 plan->pcur_is_open = FALSE;
1211 plan->cursor_at_end = FALSE;
1212 plan->n_rows_fetched = 0;
1213 plan->n_rows_prefetched = 0;
1214 }
1215
/*********************************************************************//**
Tries to do a shortcut to fetch a clustered index record with a unique key,
using the hash index if possible (not always).
@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
static
ulint
row_sel_try_search_shortcut(
/*========================*/
	sel_node_t*	node,	/*!< in: select node for a consistent read */
	plan_t*		plan,	/*!< in: plan for a unique search in clustered
				index */
	ibool		search_latch_locked,
				/*!< in: whether the search holds
				btr_search_latch */
	mtr_t*		mtr)	/*!< in: mtr */
{
	dict_index_t*	index;
	rec_t*		rec;
	mem_heap_t*	heap	= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets	= offsets_;
	ulint		ret;
	rec_offs_init(offsets_);

	index = plan->index;

	/* The shortcut only applies to a consistent read of a unique
	search, and only when the clustered record itself suffices. */

	ut_ad(node->read_view);
	ut_ad(plan->unique_search);
	ut_ad(!plan->must_get_clust);
#ifdef UNIV_SYNC_DEBUG
	if (search_latch_locked) {
		ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
	}
#endif /* UNIV_SYNC_DEBUG */

	row_sel_open_pcur(plan, search_latch_locked, mtr);

	rec = btr_pcur_get_rec(&(plan->pcur));

	if (!page_rec_is_user_rec(rec)) {

		/* Not positioned on a user record: fall back to the
		general search path. Note that heap is still NULL here,
		so a direct return leaks nothing. */

		return(SEL_RETRY);
	}

	ut_ad(plan->mode == PAGE_CUR_GE);

	/* As the cursor is now placed on a user record after a search with
	the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
	fields in the user record matched to the search tuple */

	if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) {

		/* The unique key value does not exist in the index;
		heap is still NULL, so a direct return is safe. */

		return(SEL_EXHAUSTED);
	}

	/* This is a non-locking consistent read: if necessary, fetch
	a previous version of the record */

	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);

	if (dict_index_is_clust(index)) {
		if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
						   node->read_view)) {
			/* The record version is too new for our read
			view: the caller must take the slow path which
			can build the previous version. */
			ret = SEL_RETRY;
			goto func_exit;
		}
	} else if (!lock_sec_rec_cons_read_sees(rec, node->read_view)) {

		ret = SEL_RETRY;
		goto func_exit;
	}

	/* Test the deleted flag. */

	if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))) {

		ret = SEL_EXHAUSTED;
		goto func_exit;
	}

	/* Fetch the columns needed in test conditions. The index
	record is protected by a page latch that was acquired when
	plan->pcur was positioned. The latch will not be released
	until mtr_commit(mtr). */

	row_sel_fetch_columns(index, rec, offsets,
			      UT_LIST_GET_FIRST(plan->columns));

	/* Test the rest of search conditions */

	if (!row_sel_test_other_conds(plan)) {

		ret = SEL_EXHAUSTED;
		goto func_exit;
	}

	ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);

	plan->n_rows_fetched++;
	ret = SEL_FOUND;
func_exit:
	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
	return(ret);
}
1322
/*********************************************************************//**
Performs a select step. This is the inner workhorse of the SQL
interpreter's SELECT execution: it runs a nested-loop join over the
plans of node, placing locks or reading a consistent view as needed.
@return DB_SUCCESS or error code */
static MY_ATTRIBUTE((nonnull, warn_unused_result))
dberr_t
row_sel(
/*====*/
	sel_node_t*	node,	/*!< in: select node */
	que_thr_t*	thr)	/*!< in: query thread */
{
	dict_index_t*	index;
	plan_t*		plan;
	mtr_t		mtr;
	ibool		moved;
	rec_t*		rec;
	rec_t*		old_vers;
	rec_t*		clust_rec;
	ibool		search_latch_locked;
	ibool		consistent_read;

	/* The following flag becomes TRUE when we are doing a
	consistent read from a non-clustered index and we must look
	at the clustered index to find out the previous delete mark
	state of the non-clustered record: */

	ibool		cons_read_requires_clust_rec	= FALSE;
	ulint		cost_counter			= 0;
	ibool		cursor_just_opened;
	ibool		must_go_to_next;
	ibool		mtr_has_extra_clust_latch	= FALSE;
				/* TRUE if the search was made using
				a non-clustered index, and we had to
				access the clustered record: now &mtr
				contains a clustered index latch, and
				&mtr must be committed before we move
				to the next non-clustered record */
	ulint		found_flag;
	dberr_t		err;
	mem_heap_t*	heap				= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets				= offsets_;
	rec_offs_init(offsets_);

	ut_ad(thr->run_node == node);

	search_latch_locked = FALSE;

	if (node->read_view) {
		/* In consistent reads, we try to do with the hash index and
		not to use the buffer page get. This is to reduce memory bus
		load resulting from semaphore operations. The search latch
		will be s-locked when we access an index with a unique search
		condition, but not locked when we access an index with a
		less selective search condition. */

		consistent_read = TRUE;
	} else {
		consistent_read = FALSE;
	}

table_loop:
	/* TABLE LOOP
	----------
	This is the outer major loop in calculating a join. We come here when
	node->fetch_table changes, and after adding a row to aggregate totals
	and, of course, when this function is called. */

	ut_ad(mtr_has_extra_clust_latch == FALSE);

	plan = sel_node_get_nth_plan(node, node->fetch_table);
	index = plan->index;

	if (plan->n_rows_prefetched > 0) {
		sel_dequeue_prefetched_row(plan);

		goto next_table_no_mtr;
	}

	if (plan->cursor_at_end) {
		/* The cursor has already reached the result set end: no more
		rows to process for this table cursor, as also the prefetch
		stack was empty */

		ut_ad(plan->pcur_is_open);

		goto table_exhausted_no_mtr;
	}

	/* Open a cursor to index, or restore an open cursor position */

	mtr_start(&mtr);

	if (consistent_read && plan->unique_search && !plan->pcur_is_open
	    && !plan->must_get_clust
	    && !plan->table->big_rows) {
		if (!search_latch_locked) {
			rw_lock_s_lock(&btr_search_latch);

			search_latch_locked = TRUE;
		} else if (rw_lock_get_writer(&btr_search_latch)
			   == RW_LOCK_WAIT_EX) {

			/* There is an x-latch request waiting: release the
			s-latch for a moment; as an s-latch here is often
			kept for some 10 searches before being released,
			a waiting x-latch request would block other threads
			from acquiring an s-latch for a long time, lowering
			performance significantly in multiprocessors. */

			rw_lock_s_unlock(&btr_search_latch);
			rw_lock_s_lock(&btr_search_latch);
		}

		found_flag = row_sel_try_search_shortcut(node, plan,
							 search_latch_locked,
							 &mtr);

		if (found_flag == SEL_FOUND) {

			goto next_table;

		} else if (found_flag == SEL_EXHAUSTED) {

			goto table_exhausted;
		}

		ut_ad(found_flag == SEL_RETRY);

		/* The shortcut did not succeed: reset the cursor state
		and fall back to the general path with a fresh mtr. */

		plan_reset_cursor(plan);

		mtr_commit(&mtr);
		mtr_start(&mtr);
	}

	if (search_latch_locked) {
		rw_lock_s_unlock(&btr_search_latch);

		search_latch_locked = FALSE;
	}

	if (!plan->pcur_is_open) {
		/* Evaluate the expressions to build the search tuple and
		open the cursor */

		row_sel_open_pcur(plan, search_latch_locked, &mtr);

		cursor_just_opened = TRUE;

		/* A new search was made: increment the cost counter */
		cost_counter++;
	} else {
		/* Restore pcur position to the index */

		must_go_to_next = row_sel_restore_pcur_pos(plan, &mtr);

		cursor_just_opened = FALSE;

		if (must_go_to_next) {
			/* We have already processed the cursor record: move
			to the next */

			goto next_rec;
		}
	}

rec_loop:
	/* RECORD LOOP
	-----------
	In this loop we use pcur and try to fetch a qualifying row, and
	also fill the prefetch buffer for this table if n_rows_fetched has
	exceeded a threshold. While we are inside this loop, the following
	holds:
	(1) &mtr is started,
	(2) pcur is positioned and open.

	NOTE that if cursor_just_opened is TRUE here, it means that we came
	to this point right after row_sel_open_pcur. */

	ut_ad(mtr_has_extra_clust_latch == FALSE);

	rec = btr_pcur_get_rec(&(plan->pcur));

	/* PHASE 1: Set a lock if specified */

	if (!node->asc && cursor_just_opened
	    && !page_rec_is_supremum(rec)) {

		/* When we open a cursor for a descending search, we must set
		a next-key lock on the successor record: otherwise it would
		be possible to insert new records next to the cursor position,
		and it might be that these new records should appear in the
		search result set, resulting in the phantom problem. */

		if (!consistent_read) {

			/* If innodb_locks_unsafe_for_binlog option is used
			or this session is using READ COMMITTED isolation
			level, we lock only the record, i.e., next-key
			locking is not used. */

			rec_t*	next_rec = page_rec_get_next(rec);
			ulint	lock_type;
			trx_t*	trx;

			trx = thr_get_trx(thr);

			offsets = rec_get_offsets(next_rec, index, offsets,
						  ULINT_UNDEFINED, &heap);

			if (srv_locks_unsafe_for_binlog
			    || trx->isolation_level
			    <= TRX_ISO_READ_COMMITTED) {

				if (page_rec_is_supremum(next_rec)) {

					goto skip_lock;
				}

				lock_type = LOCK_REC_NOT_GAP;
			} else {
				lock_type = LOCK_ORDINARY;
			}

			err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
					       next_rec, index, offsets,
					       node->row_lock_mode,
					       lock_type, thr);

			switch (err) {
			case DB_SUCCESS_LOCKED_REC:
				err = DB_SUCCESS;
				/* fall through */
			case DB_SUCCESS:
				break;
			default:
				/* Note that in this case we will store in pcur
				the PREDECESSOR of the record we are waiting
				the lock for */
				goto lock_wait_or_error;
			}
		}
	}

skip_lock:
	/* Reached also when the descending-search gap lock above was
	skipped because the successor was the page supremum. */

	if (page_rec_is_infimum(rec)) {

		/* The infimum record on a page cannot be in the result set,
		and neither can a record lock be placed on it: we skip such
		a record. We also increment the cost counter as we may have
		processed yet another page of index. */

		cost_counter++;

		goto next_rec;
	}

	if (!consistent_read) {
		/* Try to place a lock on the index record */

		/* If innodb_locks_unsafe_for_binlog option is used
		or this session is using READ COMMITTED isolation level,
		we lock only the record, i.e., next-key locking is
		not used. */

		ulint	lock_type;
		trx_t*	trx;

		offsets = rec_get_offsets(rec, index, offsets,
					  ULINT_UNDEFINED, &heap);

		trx = thr_get_trx(thr);

		if (srv_locks_unsafe_for_binlog
		    || trx->isolation_level <= TRX_ISO_READ_COMMITTED) {

			if (page_rec_is_supremum(rec)) {

				goto next_rec;
			}

			lock_type = LOCK_REC_NOT_GAP;
		} else {
			lock_type = LOCK_ORDINARY;
		}

		err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
				       rec, index, offsets,
				       node->row_lock_mode, lock_type, thr);

		switch (err) {
		case DB_SUCCESS_LOCKED_REC:
			err = DB_SUCCESS;
			/* fall through */
		case DB_SUCCESS:
			break;
		default:
			goto lock_wait_or_error;
		}
	}

	if (page_rec_is_supremum(rec)) {

		/* A page supremum record cannot be in the result set: skip
		it now when we have placed a possible lock on it */

		goto next_rec;
	}

	ut_ad(page_rec_is_user_rec(rec));

	if (cost_counter > SEL_COST_LIMIT) {

		/* Now that we have placed the necessary locks, we can stop
		for a while and store the cursor position; NOTE that if we
		would store the cursor position BEFORE placing a record lock,
		it might happen that the cursor would jump over some records
		that another transaction could meanwhile insert adjacent to
		the cursor: this would result in the phantom problem. */

		goto stop_for_a_while;
	}

	/* PHASE 2: Check a mixed index mix id if needed */

	if (plan->unique_search && cursor_just_opened) {

		ut_ad(plan->mode == PAGE_CUR_GE);

		/* As the cursor is now placed on a user record after a search
		with the mode PAGE_CUR_GE, the up_match field in the cursor
		tells how many fields in the user record matched to the search
		tuple */

		if (btr_pcur_get_up_match(&(plan->pcur))
		    < plan->n_exact_match) {
			goto table_exhausted;
		}

		/* Ok, no need to test end_conds or mix id */

	}

	/* We are ready to look at a possible new index entry in the result
	set: the cursor is now placed on a user record */

	/* PHASE 3: Get previous version in a consistent read */

	cons_read_requires_clust_rec = FALSE;
	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);

	if (consistent_read) {
		/* This is a non-locking consistent read: if necessary, fetch
		a previous version of the record */

		if (dict_index_is_clust(index)) {

			if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
							   node->read_view)) {

				err = row_sel_build_prev_vers(
					node->read_view, index, rec,
					&offsets, &heap, &plan->old_vers_heap,
					&old_vers, &mtr);

				if (err != DB_SUCCESS) {

					goto lock_wait_or_error;
				}

				if (old_vers == NULL) {
					/* The record does not exist
					in our read view. Skip it, but
					first attempt to determine
					whether the index segment we
					are searching through has been
					exhausted. */

					offsets = rec_get_offsets(
						rec, index, offsets,
						ULINT_UNDEFINED, &heap);

					/* Fetch the columns needed in
					test conditions. The clustered
					index record is protected by a
					page latch that was acquired
					by row_sel_open_pcur() or
					row_sel_restore_pcur_pos().
					The latch will not be released
					until mtr_commit(mtr). */

					row_sel_fetch_columns(
						index, rec, offsets,
						UT_LIST_GET_FIRST(
							plan->columns));

					if (!row_sel_test_end_conds(plan)) {

						goto table_exhausted;
					}

					goto next_rec;
				}

				rec = old_vers;
			}
		} else if (!lock_sec_rec_cons_read_sees(rec,
							node->read_view)) {
			cons_read_requires_clust_rec = TRUE;
		}
	}

	/* PHASE 4: Test search end conditions and deleted flag */

	/* Fetch the columns needed in test conditions. The record is
	protected by a page latch that was acquired by
	row_sel_open_pcur() or row_sel_restore_pcur_pos(). The latch
	will not be released until mtr_commit(mtr). */

	row_sel_fetch_columns(index, rec, offsets,
			      UT_LIST_GET_FIRST(plan->columns));

	/* Test the selection end conditions: these can only contain columns
	which already are found in the index, even though the index might be
	non-clustered */

	if (plan->unique_search && cursor_just_opened) {

		/* No test necessary: the test was already made above */

	} else if (!row_sel_test_end_conds(plan)) {

		goto table_exhausted;
	}

	if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))
	    && !cons_read_requires_clust_rec) {

		/* The record is delete marked: we can skip it if this is
		not a consistent read which might see an earlier version
		of a non-clustered index record */

		if (plan->unique_search) {

			goto table_exhausted;
		}

		goto next_rec;
	}

	/* PHASE 5: Get the clustered index record, if needed and if we did
	not do the search using the clustered index */

	if (plan->must_get_clust || cons_read_requires_clust_rec) {

		/* It was a non-clustered index and we must fetch also the
		clustered index record */

		err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec,
					    &mtr);
		mtr_has_extra_clust_latch = TRUE;

		if (err != DB_SUCCESS) {

			goto lock_wait_or_error;
		}

		/* Retrieving the clustered record required a search:
		increment the cost counter */

		cost_counter++;

		if (clust_rec == NULL) {
			/* The record did not exist in the read view */
			ut_ad(consistent_read);

			goto next_rec;
		}

		if (rec_get_deleted_flag(clust_rec,
					 dict_table_is_comp(plan->table))) {

			/* The record is delete marked: we can skip it */

			goto next_rec;
		}

		if (node->can_get_updated) {

			btr_pcur_store_position(&(plan->clust_pcur), &mtr);
		}
	}

	/* PHASE 6: Test the rest of search conditions */

	if (!row_sel_test_other_conds(plan)) {

		if (plan->unique_search) {

			goto table_exhausted;
		}

		goto next_rec;
	}

	/* PHASE 7: We found a new qualifying row for the current table; push
	the row if prefetch is on, or move to the next table in the join */

	plan->n_rows_fetched++;

	ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);

	if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT)
	    || plan->unique_search || plan->no_prefetch
	    || plan->table->big_rows) {

		/* No prefetch in operation: go to the next table */

		goto next_table;
	}

	sel_enqueue_prefetched_row(plan);

	if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) {

		/* The prefetch buffer is now full */

		sel_dequeue_prefetched_row(plan);

		goto next_table;
	}

next_rec:
	ut_ad(!search_latch_locked);

	if (mtr_has_extra_clust_latch) {

		/* We must commit &mtr if we are moving to the next
		non-clustered index record, because we could break the
		latching order if we would access a different clustered
		index page right away without releasing the previous. */

		goto commit_mtr_for_a_while;
	}

	if (node->asc) {
		moved = btr_pcur_move_to_next(&(plan->pcur), &mtr);
	} else {
		moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr);
	}

	if (!moved) {

		goto table_exhausted;
	}

	cursor_just_opened = FALSE;

	/* END OF RECORD LOOP
	------------------ */
	goto rec_loop;

next_table:
	/* We found a record which satisfies the conditions: we can move to
	the next table or return a row in the result set */

	ut_ad(btr_pcur_is_on_user_rec(&plan->pcur));

	if (plan->unique_search && !node->can_get_updated) {

		plan->cursor_at_end = TRUE;
	} else {
		ut_ad(!search_latch_locked);

		plan->stored_cursor_rec_processed = TRUE;

		btr_pcur_store_position(&(plan->pcur), &mtr);
	}

	mtr_commit(&mtr);

	mtr_has_extra_clust_latch = FALSE;

next_table_no_mtr:
	/* If we use 'goto' to this label, it means that the row was popped
	from the prefetched rows stack, and &mtr is already committed */

	if (node->fetch_table + 1 == node->n_tables) {

		sel_eval_select_list(node);

		if (node->is_aggregate) {

			goto table_loop;
		}

		sel_assign_into_var_values(node->into_list, node);

		thr->run_node = que_node_get_parent(node);

		err = DB_SUCCESS;
		goto func_exit;
	}

	node->fetch_table++;

	/* When we move to the next table, we first reset the plan cursor:
	we do not care about resetting it when we backtrack from a table */

	plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table));

	goto table_loop;

table_exhausted:
	/* The table cursor pcur reached the result set end: backtrack to the
	previous table in the join if we do not have cached prefetched rows */

	plan->cursor_at_end = TRUE;

	mtr_commit(&mtr);

	mtr_has_extra_clust_latch = FALSE;

	if (plan->n_rows_prefetched > 0) {
		/* The table became exhausted during a prefetch */

		sel_dequeue_prefetched_row(plan);

		goto next_table_no_mtr;
	}

table_exhausted_no_mtr:
	if (node->fetch_table == 0) {
		err = DB_SUCCESS;

		if (node->is_aggregate && !node->aggregate_already_fetched) {

			node->aggregate_already_fetched = TRUE;

			sel_assign_into_var_values(node->into_list, node);

			thr->run_node = que_node_get_parent(node);
		} else {
			node->state = SEL_NODE_NO_MORE_ROWS;

			thr->run_node = que_node_get_parent(node);
		}

		goto func_exit;
	}

	node->fetch_table--;

	goto table_loop;

stop_for_a_while:
	/* Return control for a while to que_run_threads, so that runaway
	queries can be canceled. NOTE that when we come here, we must, in a
	locking read, have placed the necessary (possibly waiting request)
	record lock on the cursor record or its successor: when we reposition
	the cursor, this record lock guarantees that nobody can meanwhile have
	inserted new records which should have appeared in the result set,
	which would result in the phantom problem. */

	ut_ad(!search_latch_locked);

	plan->stored_cursor_rec_processed = FALSE;
	btr_pcur_store_position(&(plan->pcur), &mtr);

	mtr_commit(&mtr);

#ifdef UNIV_SYNC_DEBUG
	ut_ad(sync_thread_levels_empty_except_dict());
#endif /* UNIV_SYNC_DEBUG */
	err = DB_SUCCESS;
	goto func_exit;

commit_mtr_for_a_while:
	/* Stores the cursor position and commits &mtr; this is used if
	&mtr may contain latches which would break the latching order if
	&mtr would not be committed and the latches released. */

	plan->stored_cursor_rec_processed = TRUE;

	ut_ad(!search_latch_locked);
	btr_pcur_store_position(&(plan->pcur), &mtr);

	mtr_commit(&mtr);

	mtr_has_extra_clust_latch = FALSE;

#ifdef UNIV_SYNC_DEBUG
	ut_ad(sync_thread_levels_empty_except_dict());
#endif /* UNIV_SYNC_DEBUG */

	goto table_loop;

lock_wait_or_error:
	/* See the note at stop_for_a_while: the same holds for this case */

	ut_ad(!btr_pcur_is_before_first_on_page(&plan->pcur) || !node->asc);
	ut_ad(!search_latch_locked);

	plan->stored_cursor_rec_processed = FALSE;
	btr_pcur_store_position(&(plan->pcur), &mtr);

	mtr_commit(&mtr);

#ifdef UNIV_SYNC_DEBUG
	ut_ad(sync_thread_levels_empty_except_dict());
#endif /* UNIV_SYNC_DEBUG */

func_exit:
	if (search_latch_locked) {
		rw_lock_s_unlock(&btr_search_latch);
	}
	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
	return(err);
}
2040
2041 /**********************************************************************//**
2042 Performs a select step. This is a high-level function used in SQL execution
2043 graphs.
2044 @return query thread to run next or NULL */
2045 UNIV_INTERN
2046 que_thr_t*
row_sel_step(que_thr_t * thr)2047 row_sel_step(
2048 /*=========*/
2049 que_thr_t* thr) /*!< in: query thread */
2050 {
2051 sel_node_t* node;
2052
2053 ut_ad(thr);
2054
2055 node = static_cast<sel_node_t*>(thr->run_node);
2056
2057 ut_ad(que_node_get_type(node) == QUE_NODE_SELECT);
2058
2059 /* If this is a new time this node is executed (or when execution
2060 resumes after wait for a table intention lock), set intention locks
2061 on the tables, or assign a read view */
2062
2063 if (node->into_list && (thr->prev_node == que_node_get_parent(node))) {
2064
2065 node->state = SEL_NODE_OPEN;
2066 }
2067
2068 if (node->state == SEL_NODE_OPEN) {
2069
2070 /* It may be that the current session has not yet started
2071 its transaction, or it has been committed: */
2072
2073 trx_start_if_not_started_xa(thr_get_trx(thr));
2074
2075 plan_reset_cursor(sel_node_get_nth_plan(node, 0));
2076
2077 if (node->consistent_read) {
2078 /* Assign a read view for the query */
2079 node->read_view = trx_assign_read_view(
2080 thr_get_trx(thr));
2081 } else {
2082 sym_node_t* table_node;
2083 enum lock_mode i_lock_mode;
2084
2085 if (node->set_x_locks) {
2086 i_lock_mode = LOCK_IX;
2087 } else {
2088 i_lock_mode = LOCK_IS;
2089 }
2090
2091 for (table_node = node->table_list;
2092 table_node != 0;
2093 table_node = static_cast<sym_node_t*>(
2094 que_node_get_next(table_node))) {
2095
2096 dberr_t err = lock_table(
2097 0, table_node->table, i_lock_mode,
2098 thr);
2099
2100 if (err != DB_SUCCESS) {
2101 trx_t* trx;
2102
2103 trx = thr_get_trx(thr);
2104 trx->error_state = err;
2105
2106 return(NULL);
2107 }
2108 }
2109 }
2110
2111 /* If this is an explicit cursor, copy stored procedure
2112 variable values, so that the values cannot change between
2113 fetches (currently, we copy them also for non-explicit
2114 cursors) */
2115
2116 if (node->explicit_cursor
2117 && UT_LIST_GET_FIRST(node->copy_variables)) {
2118
2119 row_sel_copy_input_variable_vals(node);
2120 }
2121
2122 node->state = SEL_NODE_FETCH;
2123 node->fetch_table = 0;
2124
2125 if (node->is_aggregate) {
2126 /* Reset the aggregate total values */
2127 sel_reset_aggregate_vals(node);
2128 }
2129 }
2130
2131 dberr_t err = row_sel(node, thr);
2132
2133 /* NOTE! if queries are parallelized, the following assignment may
2134 have problems; the assignment should be made only if thr is the
2135 only top-level thr in the graph: */
2136
2137 thr->graph->last_sel_node = node;
2138
2139 if (err != DB_SUCCESS) {
2140 thr_get_trx(thr)->error_state = err;
2141
2142 return(NULL);
2143 }
2144
2145 return(thr);
2146 }
2147
2148 /**********************************************************************//**
2149 Performs a fetch for a cursor.
2150 @return query thread to run next or NULL */
2151 UNIV_INTERN
2152 que_thr_t*
fetch_step(que_thr_t * thr)2153 fetch_step(
2154 /*=======*/
2155 que_thr_t* thr) /*!< in: query thread */
2156 {
2157 sel_node_t* sel_node;
2158 fetch_node_t* node;
2159
2160 ut_ad(thr);
2161
2162 node = static_cast<fetch_node_t*>(thr->run_node);
2163 sel_node = node->cursor_def;
2164
2165 ut_ad(que_node_get_type(node) == QUE_NODE_FETCH);
2166
2167 if (thr->prev_node != que_node_get_parent(node)) {
2168
2169 if (sel_node->state != SEL_NODE_NO_MORE_ROWS) {
2170
2171 if (node->into_list) {
2172 sel_assign_into_var_values(node->into_list,
2173 sel_node);
2174 } else {
2175 ibool ret = (*node->func->func)(
2176 sel_node, node->func->arg);
2177
2178 if (!ret) {
2179 sel_node->state
2180 = SEL_NODE_NO_MORE_ROWS;
2181 }
2182 }
2183 }
2184
2185 thr->run_node = que_node_get_parent(node);
2186
2187 return(thr);
2188 }
2189
2190 /* Make the fetch node the parent of the cursor definition for
2191 the time of the fetch, so that execution knows to return to this
2192 fetch node after a row has been selected or we know that there is
2193 no row left */
2194
2195 sel_node->common.parent = node;
2196
2197 if (sel_node->state == SEL_NODE_CLOSED) {
2198 fprintf(stderr,
2199 "InnoDB: Error: fetch called on a closed cursor\n");
2200
2201 thr_get_trx(thr)->error_state = DB_ERROR;
2202
2203 return(NULL);
2204 }
2205
2206 thr->run_node = sel_node;
2207
2208 return(thr);
2209 }
2210
2211 /****************************************************************//**
2212 Sample callback function for fetch that prints each row.
2213 @return always returns non-NULL */
2214 UNIV_INTERN
2215 void*
row_fetch_print(void * row,void * user_arg)2216 row_fetch_print(
2217 /*============*/
2218 void* row, /*!< in: sel_node_t* */
2219 void* user_arg) /*!< in: not used */
2220 {
2221 que_node_t* exp;
2222 ulint i = 0;
2223 sel_node_t* node = static_cast<sel_node_t*>(row);
2224
2225 UT_NOT_USED(user_arg);
2226
2227 fprintf(stderr, "row_fetch_print: row %p\n", row);
2228
2229 for (exp = node->select_list;
2230 exp != 0;
2231 exp = que_node_get_next(exp), i++) {
2232
2233 dfield_t* dfield = que_node_get_val(exp);
2234 const dtype_t* type = dfield_get_type(dfield);
2235
2236 fprintf(stderr, " column %lu:\n", (ulong) i);
2237
2238 dtype_print(type);
2239 putc('\n', stderr);
2240
2241 if (dfield_get_len(dfield) != UNIV_SQL_NULL) {
2242 ut_print_buf(stderr, dfield_get_data(dfield),
2243 dfield_get_len(dfield));
2244 putc('\n', stderr);
2245 } else {
2246 fputs(" <NULL>;\n", stderr);
2247 }
2248 }
2249
2250 return((void*)42);
2251 }
2252
2253 /***********************************************************//**
2254 Prints a row in a select result.
2255 @return query thread to run next or NULL */
2256 UNIV_INTERN
2257 que_thr_t*
row_printf_step(que_thr_t * thr)2258 row_printf_step(
2259 /*============*/
2260 que_thr_t* thr) /*!< in: query thread */
2261 {
2262 row_printf_node_t* node;
2263 sel_node_t* sel_node;
2264 que_node_t* arg;
2265
2266 ut_ad(thr);
2267
2268 node = static_cast<row_printf_node_t*>(thr->run_node);
2269
2270 sel_node = node->sel_node;
2271
2272 ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF);
2273
2274 if (thr->prev_node == que_node_get_parent(node)) {
2275
2276 /* Reset the cursor */
2277 sel_node->state = SEL_NODE_OPEN;
2278
2279 /* Fetch next row to print */
2280
2281 thr->run_node = sel_node;
2282
2283 return(thr);
2284 }
2285
2286 if (sel_node->state != SEL_NODE_FETCH) {
2287
2288 ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
2289
2290 /* No more rows to print */
2291
2292 thr->run_node = que_node_get_parent(node);
2293
2294 return(thr);
2295 }
2296
2297 arg = sel_node->select_list;
2298
2299 while (arg) {
2300 dfield_print_also_hex(que_node_get_val(arg));
2301
2302 fputs(" ::: ", stderr);
2303
2304 arg = que_node_get_next(arg);
2305 }
2306
2307 putc('\n', stderr);
2308
2309 /* Fetch next row to print */
2310
2311 thr->run_node = sel_node;
2312
2313 return(thr);
2314 }
2315
/****************************************************************//**
Converts a key value stored in MySQL format to an Innobase dtuple. The last
field of the key value may be just a prefix of a fixed length field: hence
the parameter key_len. But currently we do not allow search keys where the
last field is only a prefix of the full key field len and print a warning if
such appears. A counterpart of this function is
ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
UNIV_INTERN
void
row_sel_convert_mysql_key_to_innobase(
/*==================================*/
	dtuple_t*	tuple,		/*!< in/out: tuple where to build;
					NOTE: we assume that the type info
					in the tuple is already according
					to index! */
	byte*		buf,		/*!< in: buffer to use in field
					conversions; NOTE that dtuple->data
					may end up pointing inside buf so
					do not discard that buffer while
					the tuple is being used. See
					row_mysql_store_col_in_innobase_format()
					in the case of DATA_INT */
	ulint		buf_len,	/*!< in: buffer length */
	dict_index_t*	index,		/*!< in: index of the key value */
	const byte*	key_ptr,	/*!< in: MySQL key value */
	ulint		key_len,	/*!< in: MySQL key value length */
	trx_t*		trx)		/*!< in: transaction */
{
	byte*		original_buf	= buf;
	const byte*	original_key_ptr = key_ptr;
	dict_field_t*	field;
	dfield_t*	dfield;
	ulint		data_offset;	/* offset of the value within the
					current key field (skips the NULL
					marker and/or length bytes) */
	ulint		data_len;	/* length of the value to store */
	ulint		data_field_len;	/* total bytes this field occupies
					in the MySQL key buffer */
	ibool		is_null;
	const byte*	key_end;
	ulint		n_fields = 0;

	/* For documentation of the key value storage format in MySQL, see
	ha_innobase::store_key_val_for_row() in ha_innodb.cc. */

	key_end = key_ptr + key_len;

	/* Permit us to access any field in the tuple (ULINT_MAX): */

	dtuple_set_n_fields(tuple, ULINT_MAX);

	dfield = dtuple_get_nth_field(tuple, 0);
	field = dict_index_get_nth_field(index, 0);

	if (UNIV_UNLIKELY(dfield_get_type(dfield)->mtype == DATA_SYS)) {
		/* A special case: we are looking for a position in the
		generated clustered index which InnoDB automatically added
		to a table with no primary key: the first and the only
		ordering column is ROW_ID which InnoDB stored to the key_ptr
		buffer. */

		ut_a(key_len == DATA_ROW_ID_LEN);

		dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN);

		dtuple_set_n_fields(tuple, 1);

		return;
	}

	/* Walk the MySQL key buffer and the index fields in lockstep;
	dfield and field are advanced together at the bottom of the loop. */

	while (key_ptr < key_end) {

		ulint	type = dfield_get_type(dfield)->mtype;
		ut_a(field->col->mtype == type);

		data_offset = 0;
		is_null = FALSE;

		if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) {
			/* The first byte in the field tells if this is
			an SQL NULL value */

			data_offset = 1;

			if (*key_ptr != 0) {
				dfield_set_null(dfield);

				is_null = TRUE;
			}
		}

		/* Calculate data length and data field total length */

		if (type == DATA_BLOB) {
			/* The key field is a column prefix of a BLOB or
			TEXT */

			ut_a(field->prefix_len > 0);

			/* MySQL stores the actual data length to the first 2
			bytes after the optional SQL NULL marker byte. The
			storage format is little-endian, that is, the most
			significant byte at a higher address. In UTF-8, MySQL
			seems to reserve field->prefix_len bytes for
			storing this field in the key value buffer, even
			though the actual value only takes data_len bytes
			from the start. */

			data_len = key_ptr[data_offset]
				+ 256 * key_ptr[data_offset + 1];
			data_field_len = data_offset + 2 + field->prefix_len;

			data_offset += 2;

			/* Now that we know the length, we store the column
			value like it would be a fixed char field */

		} else if (field->prefix_len > 0) {
			/* Looks like MySQL pads unused end bytes in the
			prefix with space. Therefore, also in UTF-8, it is ok
			to compare with a prefix containing full prefix_len
			bytes, and no need to take at most prefix_len / 3
			UTF-8 characters from the start.
			If the prefix is used as the upper end of a LIKE
			'abc%' query, then MySQL pads the end with chars
			0xff. TODO: in that case does it any harm to compare
			with the full prefix_len bytes. How do characters
			0xff in UTF-8 behave? */

			data_len = field->prefix_len;
			data_field_len = data_offset + data_len;
		} else {
			/* A fixed-length field: the key buffer holds the
			full declared length of the column. */
			data_len = dfield_get_type(dfield)->len;
			data_field_len = data_offset + data_len;
		}

		if (UNIV_UNLIKELY
		    (dtype_get_mysql_type(dfield_get_type(dfield))
		     == DATA_MYSQL_TRUE_VARCHAR)
		    && UNIV_LIKELY(type != DATA_INT)) {
			/* In a MySQL key value format, a true VARCHAR is
			always preceded by 2 bytes of a length field.
			dfield_get_type(dfield)->len returns the maximum
			'payload' len in bytes. That does not include the
			2 bytes that tell the actual data length.

			We added the check != DATA_INT to make sure we do
			not treat MySQL ENUM or SET as a true VARCHAR! */

			data_len += 2;
			data_field_len += 2;
		}

		/* Storing may use at most data_len bytes of buf */

		if (UNIV_LIKELY(!is_null)) {
			buf = row_mysql_store_col_in_innobase_format(
				dfield, buf,
				FALSE, /* MySQL key value format col */
				key_ptr + data_offset, data_len,
				dict_table_is_comp(index->table));
			ut_a(buf <= original_buf + buf_len);
		}

		key_ptr += data_field_len;

		if (UNIV_UNLIKELY(key_ptr > key_end)) {
			/* The last field in key was not a complete key field
			but a prefix of it.

			Print a warning about this! HA_READ_PREFIX_LAST does
			not currently work in InnoDB with partial-field key
			value prefixes. Since MySQL currently uses a padding
			trick to calculate LIKE 'abc%' type queries there
			should never be partial-field prefixes in searches. */

			ut_print_timestamp(stderr);

			fputs(" InnoDB: Warning: using a partial-field"
			      " key prefix in search.\n"
			      "InnoDB: ", stderr);
			dict_index_name_print(stderr, trx, index);
			fprintf(stderr, ". Last data field length %lu bytes,\n"
				"InnoDB: key ptr now exceeds"
				" key end by %lu bytes.\n"
				"InnoDB: Key value in the MySQL format:\n",
				(ulong) data_field_len,
				(ulong) (key_ptr - key_end));
			fflush(stderr);
			ut_print_buf(stderr, original_key_ptr, key_len);
			putc('\n', stderr);

			if (!is_null) {
				/* Shrink the stored value to just the part
				that actually lay inside the key buffer. */
				ulint	len = dfield_get_len(dfield);
				dfield_set_len(dfield, len
					       - (ulint) (key_ptr - key_end));
			}
			ut_ad(0);
		}

		n_fields++;
		field++;
		dfield++;
	}

	ut_a(buf <= original_buf + buf_len);

	/* We set the length of tuple to n_fields: we assume that the memory
	area allocated for it is big enough (usually bigger than n_fields). */

	dtuple_set_n_fields(tuple, n_fields);
}
2525
2526 /**************************************************************//**
2527 Stores the row id to the prebuilt struct. */
2528 static
2529 void
row_sel_store_row_id_to_prebuilt(row_prebuilt_t * prebuilt,const rec_t * index_rec,const dict_index_t * index,const ulint * offsets)2530 row_sel_store_row_id_to_prebuilt(
2531 /*=============================*/
2532 row_prebuilt_t* prebuilt, /*!< in/out: prebuilt */
2533 const rec_t* index_rec, /*!< in: record */
2534 const dict_index_t* index, /*!< in: index of the record */
2535 const ulint* offsets) /*!< in: rec_get_offsets
2536 (index_rec, index) */
2537 {
2538 const byte* data;
2539 ulint len;
2540
2541 ut_ad(rec_offs_validate(index_rec, index, offsets));
2542
2543 data = rec_get_nth_field(
2544 index_rec, offsets,
2545 dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len);
2546
2547 if (UNIV_UNLIKELY(len != DATA_ROW_ID_LEN)) {
2548 fprintf(stderr,
2549 "InnoDB: Error: Row id field is"
2550 " wrong length %lu in ", (ulong) len);
2551 dict_index_name_print(stderr, prebuilt->trx, index);
2552 fprintf(stderr, "\n"
2553 "InnoDB: Field number %lu, record:\n",
2554 (ulong) dict_index_get_sys_col_pos(index,
2555 DATA_ROW_ID));
2556 rec_print_new(stderr, index_rec, offsets);
2557 putc('\n', stderr);
2558 ut_error;
2559 }
2560
2561 ut_memcpy(prebuilt->row_id, data, len);
2562 }
2563
2564 #ifdef UNIV_DEBUG
2565 /** Convert a non-SQL-NULL field from Innobase format to MySQL format. */
2566 # define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len,sec) \
2567 row_sel_field_store_in_mysql_format_func(dest,templ,idx,field,src,len,sec)
2568 #else /* UNIV_DEBUG */
2569 /** Convert a non-SQL-NULL field from Innobase format to MySQL format. */
2570 # define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len,sec) \
2571 row_sel_field_store_in_mysql_format_func(dest,templ,src,len,sec)
2572 #endif /* UNIV_DEBUG */
2573
/** Stores a non-SQL-NULL field in the MySQL format. The counterpart of this
function is row_mysql_store_col_in_innobase_format() in row0mysql.cc.
@param[in,out]	dest		buffer where to store; NOTE
				that BLOBs are not in themselves stored
				here: the caller must allocate and copy
				the BLOB into buffer before, and pass
				the pointer to the BLOB in 'data'
@param[in]	templ		MySQL column template. Its following fields
				are referenced: type, is_unsigned, mysql_col_len,
				mbminlen, mbmaxlen
@param[in]	index		InnoDB index
@param[in]	field_no	templ->rec_field_no or templ->clust_rec_field_no
				or templ->icp_rec_field_no
@param[in]	data		data to store
@param[in]	len		length of the data
@param[in]	sec_field	secondary index field no if the secondary index
				record but the prebuilt template is in
				clustered index format and used only for end
				range comparison. */
static MY_ATTRIBUTE((nonnull))
void
row_sel_field_store_in_mysql_format_func(
	byte*		dest,
	const mysql_row_templ_t* templ,
#ifdef UNIV_DEBUG
	const dict_index_t*	index,
	ulint		field_no,
#endif /* UNIV_DEBUG */
	const byte*	data,
	ulint		len,
	ulint		sec_field)
{
	byte*	ptr;
#ifdef UNIV_DEBUG
	const dict_field_t*	field
		= dict_index_get_nth_field(index, field_no);
	bool	clust_templ_for_sec = (sec_field != ULINT_UNDEFINED);
#endif /* UNIV_DEBUG */

	ut_ad(len != UNIV_SQL_NULL);
	UNIV_MEM_ASSERT_RW(data, len);
	UNIV_MEM_ASSERT_W(dest, templ->mysql_col_len);
	UNIV_MEM_INVALID(dest, templ->mysql_col_len);

	/* field_end and pad are scoped to the switch: they are only used
	by the variable-length string cases below. */
	switch (templ->type) {
		const byte*	field_end;
		byte*		pad;
	case DATA_INT:
		/* Convert integer data from Innobase to a little-endian
		format, sign bit restored to normal */

		ptr = dest + len;

		/* Copy the bytes in reverse order: InnoDB stores integers
		most significant byte first. */
		for (;;) {
			ptr--;
			*ptr = *data;
			if (ptr == dest) {
				break;
			}
			data++;
		}

		if (!templ->is_unsigned) {
			/* Undo the sign-bit flip that InnoDB applies so
			that signed integers compare correctly as bytes. */
			dest[len - 1] = (byte) (dest[len - 1] ^ 128);
		}

		ut_ad(templ->mysql_col_len == len);
		break;

	case DATA_VARCHAR:
	case DATA_VARMYSQL:
	case DATA_BINARY:
		field_end = dest + templ->mysql_col_len;

		if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
			/* This is a >= 5.0.3 type true VARCHAR. Store the
			length of the data to the first byte or the first
			two bytes of dest. */

			dest = row_mysql_store_true_var_len(
				dest, len, templ->mysql_length_bytes);
			/* Copy the actual data. Leave the rest of the
			buffer uninitialized. */
			memcpy(dest, data, len);
			break;
		}

		/* Copy the actual data */
		ut_memcpy(dest, data, len);

		/* Pad with trailing spaces. */

		pad = dest + len;

		ut_ad(templ->mbminlen <= templ->mbmaxlen);

		/* We treat some Unicode charset strings specially. */
		switch (templ->mbminlen) {
		case 4:
			/* InnoDB should never have stripped partial
			UTF-32 characters. */
			ut_a(!(len & 3));
			break;
		case 2:
			/* A space char is two bytes,
			0x0020 in UCS2 and UTF-16 */

			if (UNIV_UNLIKELY(len & 1)) {
				/* A 0x20 has been stripped from the column.
				Pad it back. */

				if (pad < field_end) {
					*pad++ = 0x20;
				}
			}
		}

		row_mysql_pad_col(templ->mbminlen, pad, field_end - pad);
		break;

	case DATA_BLOB:
		/* Store a pointer to the BLOB buffer to dest: the BLOB was
		already copied to the buffer in row_sel_store_mysql_rec */

		row_mysql_store_blob_ref(dest, templ->mysql_col_len, data,
					 len);
		break;

	case DATA_MYSQL:
		memcpy(dest, data, len);

		ut_ad(templ->mysql_col_len >= len);
		ut_ad(templ->mbmaxlen >= templ->mbminlen);

		/* If field_no equals to templ->icp_rec_field_no,
		we are examining a row pointed by "icp_rec_field_no".
		There is possibility that icp_rec_field_no refers to
		a field in a secondary index while templ->rec_field_no
		points to field in a primary index. The length
		should still be equal, unless the field pointed
		by icp_rec_field_no has a prefix */
		ut_ad(templ->mbmaxlen > templ->mbminlen
		      || templ->mysql_col_len == len
		      || (field_no == templ->icp_rec_field_no
			  && field->prefix_len > 0));

		/* The following assertion would fail for old tables
		containing UTF-8 ENUM columns due to Bug #9526. */
		ut_ad(!templ->mbmaxlen
		      || !(templ->mysql_col_len % templ->mbmaxlen));
		ut_ad(clust_templ_for_sec
		      || len * templ->mbmaxlen >= templ->mysql_col_len
		      || (field_no == templ->icp_rec_field_no
			  && field->prefix_len > 0));
		ut_ad(!(field->prefix_len % templ->mbmaxlen));

		if (templ->mbminlen == 1 && templ->mbmaxlen != 1) {
			/* Pad with spaces. This undoes the stripping
			done in row0mysql.cc, function
			row_mysql_store_col_in_innobase_format(). */

			memset(dest + len, 0x20, templ->mysql_col_len - len);
		}
		break;

	default:
#ifdef UNIV_DEBUG
	case DATA_SYS_CHILD:
	case DATA_SYS:
		/* These column types should never be shipped to MySQL. */
		ut_ad(0);

	case DATA_CHAR:
	case DATA_FIXBINARY:
	case DATA_FLOAT:
	case DATA_DOUBLE:
	case DATA_DECIMAL:
		/* Above are the valid column types for MySQL data. */
#endif /* UNIV_DEBUG */
		/* If sec_field value is present then mapping of
		secondary index records to clustered index template
		happens for end range comparison. So length can
		vary according to secondary index record length. */
		ut_ad(field->prefix_len
		      ? field->prefix_len == len
		      : (clust_templ_for_sec ?
			 1 : (templ->mysql_col_len == len)));
		memcpy(dest, data, len);
	}
}
2764
2765 #ifdef UNIV_DEBUG
2766 /** Convert a field from Innobase format to MySQL format. */
2767 # define row_sel_store_mysql_field(m,p,r,i,o,f,t,s) \
2768 row_sel_store_mysql_field_func(m,p,r,i,o,f,t,s)
2769 #else /* UNIV_DEBUG */
2770 /** Convert a field from Innobase format to MySQL format. */
2771 # define row_sel_store_mysql_field(m,p,r,i,o,f,t,s) \
2772 row_sel_store_mysql_field_func(m,p,r,o,f,t,s)
2773 #endif /* UNIV_DEBUG */
/** Convert a field in the Innobase format to a field in the MySQL format.
@param[out]	mysql_rec	record in the MySQL format
@param[in,out]	prebuilt	prebuilt struct
@param[in]	rec		InnoDB record; must be protected
				by a page latch
@param[in]	index		index of rec
@param[in]	offsets		array returned by rec_get_offsets()
@param[in]	field_no	templ->rec_field_no or
				templ->clust_rec_field_no
				or templ->icp_rec_field_no
				or sec field no if clust_templ_for_sec
				is TRUE
@param[in]	templ		row template
@param[in]	sec_field_no	field_no if rec belongs to secondary index
				but prebuilt template is in clustered
				index format and used only for end
				range comparison.
@return TRUE on success; FALSE if an externally stored field was not
yet written (code below asserts this only happens under
TRX_ISO_READ_UNCOMMITTED) */
static MY_ATTRIBUTE((warn_unused_result))
ibool
row_sel_store_mysql_field_func(
	byte*			mysql_rec,
	row_prebuilt_t*		prebuilt,
	const rec_t*		rec,
#ifdef UNIV_DEBUG
	const dict_index_t*	index,
#endif
	const ulint*		offsets,
	ulint			field_no,
	const mysql_row_templ_t*templ,
	ulint			sec_field_no)
{
	const byte*	data;
	ulint		len;
	ulint		clust_field_no = 0;
	bool		clust_templ_for_sec = (sec_field_no != ULINT_UNDEFINED);

	ut_ad(prebuilt->default_rec);
	ut_ad(templ);
	ut_ad(templ >= prebuilt->mysql_template);
	ut_ad(templ < &prebuilt->mysql_template[prebuilt->n_template]);
	ut_ad(clust_templ_for_sec
	      || field_no == templ->clust_rec_field_no
	      || field_no == templ->rec_field_no
	      || field_no == templ->icp_rec_field_no);
	ut_ad(rec_offs_validate(rec,
		clust_templ_for_sec ? prebuilt->index : index, offsets));

	/* If sec_field_no is present then extract the data from record
	using secondary field no. */
	if (clust_templ_for_sec) {
		clust_field_no = field_no;
		field_no = sec_field_no;
	}

	if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets, field_no))) {

		mem_heap_t*	heap;
		/* Copy an externally stored field to a temporary heap */

		ut_a(!prebuilt->trx->has_search_latch);
		ut_ad(field_no == templ->clust_rec_field_no);

		if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) {
			/* BLOB destinations only hold a pointer, so the
			copied bytes must outlive this call: put them in
			the long-lived blob_heap (created lazily). */
			if (prebuilt->blob_heap == NULL) {
				prebuilt->blob_heap = mem_heap_create(
					UNIV_PAGE_SIZE);
			}

			heap = prebuilt->blob_heap;
		} else {
			/* Non-BLOB data is copied into mysql_rec below,
			so a throwaway heap suffices. */
			heap = mem_heap_create(UNIV_PAGE_SIZE);
		}

		/* NOTE: if we are retrieving a big BLOB, we may
		already run out of memory in the next call, which
		causes an assert */

		data = btr_rec_copy_externally_stored_field(
			rec, offsets,
			dict_table_zip_size(prebuilt->table),
			field_no, &len, heap);

		if (UNIV_UNLIKELY(!data)) {
			/* The externally stored field was not written
			yet. This record should only be seen by
			recv_recovery_rollback_active() or any
			TRX_ISO_READ_UNCOMMITTED transactions. */

			if (heap != prebuilt->blob_heap) {
				mem_heap_free(heap);
			}

			ut_a(prebuilt->trx->isolation_level
			     == TRX_ISO_READ_UNCOMMITTED);
			return(FALSE);
		}

		ut_a(len != UNIV_SQL_NULL);

		row_sel_field_store_in_mysql_format(
			mysql_rec + templ->mysql_col_offset,
			templ, index, field_no, data, len,
			ULINT_UNDEFINED);

		if (heap != prebuilt->blob_heap) {
			mem_heap_free(heap);
		}
	} else {
		/* Field is stored in the row. */

		data = rec_get_nth_field(rec, offsets, field_no, &len);

		if (len == UNIV_SQL_NULL) {
			/* MySQL assumes that the field for an SQL
			NULL value is set to the default value. */
			ut_ad(templ->mysql_null_bit_mask);

			UNIV_MEM_ASSERT_RW(prebuilt->default_rec
					   + templ->mysql_col_offset,
					   templ->mysql_col_len);
			mysql_rec[templ->mysql_null_byte_offset]
				|= (byte) templ->mysql_null_bit_mask;
			memcpy(mysql_rec + templ->mysql_col_offset,
			       (const byte*) prebuilt->default_rec
			       + templ->mysql_col_offset,
			       templ->mysql_col_len);
			return(TRUE);
		}

		if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) {

			/* It is a BLOB field locally stored in the
			InnoDB record: we MUST copy its contents to
			prebuilt->blob_heap here because
			row_sel_field_store_in_mysql_format() stores a
			pointer to the data, and the data passed to us
			will be invalid as soon as the
			mini-transaction is committed and the page
			latch on the clustered index page is
			released. */

			if (prebuilt->blob_heap == NULL) {
				prebuilt->blob_heap = mem_heap_create(
					UNIV_PAGE_SIZE);
			}

			data = static_cast<byte*>(
				mem_heap_dup(prebuilt->blob_heap, data, len));
		}

		/* Reassign the clustered index field no. */
		if (clust_templ_for_sec) {
			field_no = clust_field_no;
		}

		row_sel_field_store_in_mysql_format(
			mysql_rec + templ->mysql_col_offset,
			templ, index, field_no, data, len, sec_field_no);
	}

	ut_ad(len != UNIV_SQL_NULL);

	if (templ->mysql_null_bit_mask) {
		/* It is a nullable column with a non-NULL
		value */
		mysql_rec[templ->mysql_null_byte_offset]
			&= ~(byte) templ->mysql_null_bit_mask;
	}

	return(TRUE);
}
2945
/** Convert a row in the Innobase format to a row in the MySQL format.
Note that the template in prebuilt may advise us to copy only a few
columns to mysql_rec, other columns are left blank. All columns may not
be needed in the query.
@param[out]	mysql_rec	row in the MySQL format
@param[in]	prebuilt	prebuilt structure
@param[in]	rec		Innobase record in the index
				which was described in prebuilt's
				template, or in the clustered index;
				must be protected by a page latch
@param[in]	rec_clust	TRUE if the rec in the clustered index
@param[in]	index		index of rec
@param[in]	offsets		array returned by rec_get_offsets(rec)
@param[in]	clust_templ_for_sec	TRUE if rec belongs to secondary index
				but the prebuilt->template is in
				clustered index format and it is
				used only for end range comparison
@return TRUE on success, FALSE if not all columns could be retrieved */
static MY_ATTRIBUTE((warn_unused_result))
ibool
row_sel_store_mysql_rec(
	byte*		mysql_rec,
	row_prebuilt_t*	prebuilt,
	const rec_t*	rec,
	ibool		rec_clust,
	const dict_index_t*	index,
	const ulint*	offsets,
	bool		clust_templ_for_sec)
{
	ulint	i;
	/* Maps each clustered index field position to the column it
	holds, or NULL if the column is absent from the secondary index;
	the vector index doubles as the secondary field number. */
	std::vector<const dict_col_t*> template_col;

	ut_ad(rec_clust || index == prebuilt->index);
	ut_ad(!rec_clust || dict_index_is_clust(index));

	/* BLOBs copied for the previous row are no longer needed:
	release their heap before converting this row. */
	if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
		mem_heap_free(prebuilt->blob_heap);
		prebuilt->blob_heap = NULL;
	}

	if (clust_templ_for_sec) {
		/* Store all clustered index column of
		secondary index record. */
		for (i = 0; i < dict_index_get_n_fields(
				prebuilt->index); i++) {
			ulint	sec_field = dict_index_get_nth_field_pos(
				index, prebuilt->index, i);

			if (sec_field == ULINT_UNDEFINED) {
				template_col.push_back(NULL);
				continue;
			}

			const dict_field_t*	field =
				dict_index_get_nth_field(index, sec_field);
			const dict_col_t*	col =
				dict_field_get_col(field);

			template_col.push_back(col);
		}
	}

	for (i = 0; i < prebuilt->n_template; i++) {
		const mysql_row_templ_t*templ = &prebuilt->mysql_template[i];
		ulint		field_no
			= rec_clust
			? templ->clust_rec_field_no
			: templ->rec_field_no;
		ulint		sec_field_no = ULINT_UNDEFINED;

		/* We should never deliver column prefixes to MySQL,
		except for evaluating innobase_index_cond(). */
		ut_ad(dict_index_get_nth_field(index, field_no)->prefix_len
		      == 0);

		if (clust_templ_for_sec) {
			/* Translate the template's column into a
			secondary-index field number via template_col;
			skip columns the secondary index does not carry. */
			std::vector<const dict_col_t*>::iterator	it;
			const dict_field_t*	field =
				dict_index_get_nth_field(index, field_no);
			const dict_col_t*	col = dict_field_get_col(
				field);
			it = std::find(template_col.begin(),
				       template_col.end(), col);

			if (it == template_col.end()) {
				continue;
			}

			ut_ad(templ->rec_field_no == templ->clust_rec_field_no);

			sec_field_no = it - template_col.begin();
		}

		if (!row_sel_store_mysql_field(mysql_rec, prebuilt,
					       rec, index, offsets,
					       field_no, templ,
					       sec_field_no)) {
			return(FALSE);
		}
	}

	/* FIXME: We only need to read the doc_id if an FTS indexed
	column is being updated.
	NOTE, the record must be cluster index record. Secondary index
	might not have the Doc ID */
	if (dict_table_has_fts_index(prebuilt->table)
	    && dict_index_is_clust(index)
	    && !clust_templ_for_sec) {

		prebuilt->fts_doc_id = fts_get_doc_id_from_rec(
			prebuilt->table, rec, NULL);
	}

	return(TRUE);
}
3061
3062 /*********************************************************************//**
3063 Builds a previous version of a clustered index record for a consistent read
3064 @return DB_SUCCESS or error code */
3065 static MY_ATTRIBUTE((nonnull, warn_unused_result))
3066 dberr_t
row_sel_build_prev_vers_for_mysql(read_view_t * read_view,dict_index_t * clust_index,row_prebuilt_t * prebuilt,const rec_t * rec,ulint ** offsets,mem_heap_t ** offset_heap,rec_t ** old_vers,mtr_t * mtr)3067 row_sel_build_prev_vers_for_mysql(
3068 /*==============================*/
3069 read_view_t* read_view, /*!< in: read view */
3070 dict_index_t* clust_index, /*!< in: clustered index */
3071 row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
3072 const rec_t* rec, /*!< in: record in a clustered index */
3073 ulint** offsets, /*!< in/out: offsets returned by
3074 rec_get_offsets(rec, clust_index) */
3075 mem_heap_t** offset_heap, /*!< in/out: memory heap from which
3076 the offsets are allocated */
3077 rec_t** old_vers, /*!< out: old version, or NULL if the
3078 record does not exist in the view:
3079 i.e., it was freshly inserted
3080 afterwards */
3081 mtr_t* mtr) /*!< in: mtr */
3082 {
3083 dberr_t err;
3084
3085 if (prebuilt->old_vers_heap) {
3086 mem_heap_empty(prebuilt->old_vers_heap);
3087 } else {
3088 prebuilt->old_vers_heap = mem_heap_create(200);
3089 }
3090
3091 err = row_vers_build_for_consistent_read(
3092 rec, mtr, clust_index, offsets, read_view, offset_heap,
3093 prebuilt->old_vers_heap, old_vers);
3094 return(err);
3095 }
3096
/*********************************************************************//**
Retrieves the clustered index record corresponding to a record in a
non-clustered index. Does the necessary locking. Used in the MySQL
interface.
@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
static MY_ATTRIBUTE((nonnull, warn_unused_result))
dberr_t
row_sel_get_clust_rec_for_mysql(
/*============================*/
	row_prebuilt_t*	prebuilt,/*!< in: prebuilt struct in the handle */
	dict_index_t*	sec_index,/*!< in: secondary index where rec resides */
	const rec_t*	rec,	/*!< in: record in a non-clustered index; if
				this is a locking read, then rec is not
				allowed to be delete-marked, and that would
				not make sense either */
	que_thr_t*	thr,	/*!< in: query thread */
	const rec_t**	out_rec,/*!< out: clustered record or an old version of
				it, NULL if the old version did not exist
				in the read view, i.e., it was a fresh
				inserted version */
	ulint**		offsets,/*!< in: offsets returned by
				rec_get_offsets(rec, sec_index);
				out: offsets returned by
				rec_get_offsets(out_rec, clust_index) */
	mem_heap_t**	offset_heap,/*!< in/out: memory heap from which
				the offsets are allocated */
	mtr_t*		mtr)	/*!< in: mtr used to get access to the
				non-clustered record; the same mtr is used to
				access the clustered index */
{
	dict_index_t*	clust_index;
	const rec_t*	clust_rec;
	rec_t*		old_vers;
	dberr_t		err;
	trx_t*		trx;

	*out_rec = NULL;
	trx = thr_get_trx(thr);

	/* Build a search tuple (row reference) to the clustered index
	from the fields of the secondary index record. */
	row_build_row_ref_in_tuple(prebuilt->clust_ref, rec,
				   sec_index, *offsets, trx);

	clust_index = dict_table_get_first_index(sec_index->table);

	/* Position the persistent clustered-index cursor on the record
	pointed to by the row reference. */
	btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref,
				   PAGE_CUR_LE, BTR_SEARCH_LEAF,
				   &prebuilt->clust_pcur, 0, mtr);

	clust_rec = btr_pcur_get_rec(&prebuilt->clust_pcur);

	prebuilt->clust_pcur.trx_if_known = trx;

	/* Note: only if the search ends up on a non-infimum record is the
	low_match value the real match to the search tuple */

	if (!page_rec_is_user_rec(clust_rec)
	    || btr_pcur_get_low_match(&prebuilt->clust_pcur)
	    < dict_index_get_n_unique(clust_index)) {

		/* In a rare case it is possible that no clust rec is found
		for a delete-marked secondary index record: if in row0umod.cc
		in row_undo_mod_remove_clust_low() we have already removed
		the clust rec, while purge is still cleaning and removing
		secondary index records associated with earlier versions of
		the clustered index record. In that case we know that the
		clustered index record did not exist in the read view of
		trx. */

		if (!rec_get_deleted_flag(rec,
					  dict_table_is_comp(sec_index->table))
		    || prebuilt->select_lock_type != LOCK_NONE) {
			/* A non-delete-marked secondary record, or any
			locking read, should always find its clustered
			record: this indicates corruption, so dump
			diagnostics before asserting in debug builds. */
			ut_print_timestamp(stderr);
			fputs(" InnoDB: error clustered record"
			      " for sec rec not found\n"
			      "InnoDB: ", stderr);
			dict_index_name_print(stderr, trx, sec_index);
			fputs("\n"
			      "InnoDB: sec index record ", stderr);
			rec_print(stderr, rec, sec_index);
			fputs("\n"
			      "InnoDB: clust index record ", stderr);
			rec_print(stderr, clust_rec, clust_index);
			putc('\n', stderr);
			trx_print(stderr, trx, 600);
			fputs("\n"
			      "InnoDB: Submit a detailed bug report"
			      " to http://bugs.mysql.com\n", stderr);
			ut_ad(0);
		}

		/* Report "row not visible": *out_rec stays NULL. */
		clust_rec = NULL;

		err = DB_SUCCESS;
		goto func_exit;
	}

	*offsets = rec_get_offsets(clust_rec, clust_index, *offsets,
				   ULINT_UNDEFINED, offset_heap);

	if (prebuilt->select_lock_type != LOCK_NONE) {
		/* Try to place a lock on the index record; we are searching
		the clust rec with a unique condition, hence
		we set a LOCK_REC_NOT_GAP type lock */

		err = lock_clust_rec_read_check_and_lock(
			0, btr_pcur_get_block(&prebuilt->clust_pcur),
			clust_rec, clust_index, *offsets,
			static_cast<enum lock_mode>(prebuilt->select_lock_type),
			LOCK_REC_NOT_GAP,
			thr);

		switch (err) {
		case DB_SUCCESS:
		case DB_SUCCESS_LOCKED_REC:
			break;
		default:
			goto err_exit;
		}
	} else {
		/* This is a non-locking consistent read: if necessary, fetch
		a previous version of the record */

		old_vers = NULL;

		/* If the isolation level allows reading of uncommitted data,
		then we never look for an earlier version */

		if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
		    && !lock_clust_rec_cons_read_sees(
			    clust_rec, clust_index, *offsets,
			    trx->read_view)) {

			/* The following call returns 'offsets' associated with
			'old_vers' */
			err = row_sel_build_prev_vers_for_mysql(
				trx->read_view, clust_index, prebuilt,
				clust_rec, offsets, offset_heap, &old_vers,
				mtr);

			if (err != DB_SUCCESS || old_vers == NULL) {

				goto err_exit;
			}

			clust_rec = old_vers;
		}

		/* If we had to go to an earlier version of row or the
		secondary index record is delete marked, then it may be that
		the secondary index record corresponding to clust_rec
		(or old_vers) is not rec; in that case we must ignore
		such row because in our snapshot rec would not have existed.
		Remember that from rec we cannot see directly which transaction
		id corresponds to it: we have to go to the clustered index
		record. A query where we want to fetch all rows where
		the secondary index value is in some interval would return
		a wrong result if we would not drop rows which we come to
		visit through secondary index records that would not really
		exist in our snapshot. */

		if (clust_rec
		    && (old_vers
			|| trx->isolation_level <= TRX_ISO_READ_UNCOMMITTED
			|| rec_get_deleted_flag(rec, dict_table_is_comp(
							sec_index->table)))
		    && !row_sel_sec_rec_is_for_clust_rec(
			    rec, sec_index, clust_rec, clust_index)) {
			clust_rec = NULL;
#ifdef UNIV_SEARCH_DEBUG
		} else {
			ut_a(clust_rec == NULL
			     || row_sel_sec_rec_is_for_clust_rec(
				     rec, sec_index, clust_rec, clust_index));
#endif
		}

		err = DB_SUCCESS;
	}

func_exit:
	*out_rec = clust_rec;

	/* Store the current position if select_lock_type is not
	LOCK_NONE or if we are scanning using InnoDB APIs */
	if (prebuilt->select_lock_type != LOCK_NONE
	    || prebuilt->innodb_api) {
		/* We may use the cursor in update or in unlock_row():
		store its position */

		btr_pcur_store_position(&prebuilt->clust_pcur, mtr);
	}

err_exit:
	/* NOTE: on error paths we jump here directly, skipping the
	position store above. */
	return(err);
}
3292
/********************************************************************//**
Restores cursor position after it has been stored. We have to take into
account that the record cursor was positioned on may have been deleted.
Then we may have to move the cursor one step up or down.
@return TRUE if we may need to process the record the cursor is now
positioned on (i.e. we should not go to the next record yet) */
static
ibool
sel_restore_position_for_mysql(
/*===========================*/
	ibool*		same_user_rec,	/*!< out: TRUE if we were able to restore
					the cursor on a user record with the
					same ordering prefix in in the
					B-tree index */
	ulint		latch_mode,	/*!< in: latch mode wished in
					restoration */
	btr_pcur_t*	pcur,		/*!< in: cursor whose position
					has been stored */
	ibool		moves_up,	/*!< in: TRUE if the cursor moves up
					in the index */
	mtr_t*		mtr)		/*!< in: mtr; CAUTION: may commit
					mtr temporarily! */
{
	ibool	success;

	/* success == TRUE means the cursor is again positioned on the
	very same user record it was stored on. */
	success = btr_pcur_restore_position(latch_mode, pcur, mtr);

	*same_user_rec = success;

	ut_ad(!success || pcur->rel_pos == BTR_PCUR_ON);
#ifdef UNIV_DEBUG
	if (pcur->pos_state == BTR_PCUR_IS_POSITIONED_OPTIMISTIC) {
		ut_ad(pcur->rel_pos == BTR_PCUR_BEFORE
		      || pcur->rel_pos == BTR_PCUR_AFTER);
	} else {
		ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
		ut_ad((pcur->rel_pos == BTR_PCUR_ON)
		      == btr_pcur_is_on_user_rec(pcur));
	}
#endif

	/* The position may need be adjusted for rel_pos and moves_up. */

	switch (pcur->rel_pos) {
	case BTR_PCUR_ON:
		/* The stored record still existed only if success; if it
		is gone and we scan forward, step onto the next record and
		tell the caller to process it. */
		if (!success && moves_up) {
next:
			btr_pcur_move_to_next(pcur, mtr);
			return(TRUE);
		}
		/* If restore succeeded, the caller may continue from the
		next record (return FALSE); otherwise reprocess. */
		return(!success);
	case BTR_PCUR_AFTER_LAST_IN_TREE:
	case BTR_PCUR_BEFORE_FIRST_IN_TREE:
		/* The cursor was at one end of the index; the record now
		under it (if any) has not been processed yet. */
		return(TRUE);
	case BTR_PCUR_AFTER:
		/* positioned to record after pcur->old_rec. */
		pcur->pos_state = BTR_PCUR_IS_POSITIONED;
prev:
		/* When scanning backwards, the record after old_rec has
		already been seen: step back before it. */
		if (btr_pcur_is_on_user_rec(pcur) && !moves_up) {
			btr_pcur_move_to_prev(pcur, mtr);
		}
		return(TRUE);
	case BTR_PCUR_BEFORE:
		/* For non optimistic restoration:
		The position is now set to the record before pcur->old_rec.

		For optimistic restoration:
		The position also needs to take the previous search_mode into
		consideration. */

		switch (pcur->pos_state) {
		case BTR_PCUR_IS_POSITIONED_OPTIMISTIC:
			pcur->pos_state = BTR_PCUR_IS_POSITIONED;
			if (pcur->search_mode == PAGE_CUR_GE) {
				/* Positioned during Greater or Equal search
				with BTR_PCUR_BEFORE. Optimistic restore to
				the same record. If scanning for lower then
				we must move to previous record.
				This can happen with:
				HANDLER READ idx a = (const);
				HANDLER READ idx PREV; */
				goto prev;
			}
			return(TRUE);
		case BTR_PCUR_IS_POSITIONED:
			/* Cursor sits on the record before old_rec: when
			scanning forward, advance past it. */
			if (moves_up && btr_pcur_is_on_user_rec(pcur)) {
				goto next;
			}
			return(TRUE);
		case BTR_PCUR_WAS_POSITIONED:
		case BTR_PCUR_NOT_POSITIONED:
			/* Invalid states after a restore: fall through to
			the assertion below. */
			break;
		}
	}
	ut_ad(0);
	return(TRUE);
}
3390
3391 /********************************************************************//**
3392 Copies a cached field for MySQL from the fetch cache. */
3393 static
3394 void
row_sel_copy_cached_field_for_mysql(byte * buf,const byte * cache,const mysql_row_templ_t * templ)3395 row_sel_copy_cached_field_for_mysql(
3396 /*================================*/
3397 byte* buf, /*!< in/out: row buffer */
3398 const byte* cache, /*!< in: cached row */
3399 const mysql_row_templ_t*templ) /*!< in: column template */
3400 {
3401 ulint len;
3402
3403 buf += templ->mysql_col_offset;
3404 cache += templ->mysql_col_offset;
3405
3406 UNIV_MEM_ASSERT_W(buf, templ->mysql_col_len);
3407
3408 if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR
3409 && templ->type != DATA_INT) {
3410 /* Check for != DATA_INT to make sure we do
3411 not treat MySQL ENUM or SET as a true VARCHAR!
3412 Find the actual length of the true VARCHAR field. */
3413 row_mysql_read_true_varchar(
3414 &len, cache, templ->mysql_length_bytes);
3415 len += templ->mysql_length_bytes;
3416 UNIV_MEM_INVALID(buf, templ->mysql_col_len);
3417 } else {
3418 len = templ->mysql_col_len;
3419 }
3420
3421 ut_memcpy(buf, cache, len);
3422 }
3423
3424 /** Copy used fields from cached row.
3425 Copy cache record field by field, don't touch fields that
3426 are not covered by current key.
3427 @param[out] buf Where to copy the MySQL row.
3428 @param[in] cached_rec What to copy (in MySQL row format).
3429 @param[in] prebuilt prebuilt struct. */
3430 void
row_sel_copy_cached_fields_for_mysql(byte * buf,const byte * cached_rec,row_prebuilt_t * prebuilt)3431 row_sel_copy_cached_fields_for_mysql(
3432 byte* buf,
3433 const byte* cached_rec,
3434 row_prebuilt_t* prebuilt)
3435 {
3436 const mysql_row_templ_t*templ;
3437 ulint i;
3438 for (i = 0; i < prebuilt->n_template; i++) {
3439 templ = prebuilt->mysql_template + i;
3440
3441 row_sel_copy_cached_field_for_mysql(
3442 buf, cached_rec, templ);
3443 /* Copy NULL bit of the current field from cached_rec
3444 to buf */
3445 if (templ->mysql_null_bit_mask) {
3446 buf[templ->mysql_null_byte_offset]
3447 ^= (buf[templ->mysql_null_byte_offset]
3448 ^ cached_rec[templ->mysql_null_byte_offset])
3449 & (byte) templ->mysql_null_bit_mask;
3450 }
3451 }
3452 }
3453
3454 /********************************************************************//**
3455 Pops a cached row for MySQL from the fetch cache. */
3456 UNIV_INLINE
3457 void
row_sel_dequeue_cached_row_for_mysql(byte * buf,row_prebuilt_t * prebuilt)3458 row_sel_dequeue_cached_row_for_mysql(
3459 /*=================================*/
3460 byte* buf, /*!< in/out: buffer where to copy the
3461 row */
3462 row_prebuilt_t* prebuilt) /*!< in: prebuilt struct */
3463 {
3464 ulint i;
3465 const mysql_row_templ_t*templ;
3466 const byte* cached_rec;
3467 ut_ad(prebuilt->n_fetch_cached > 0);
3468 ut_ad(prebuilt->mysql_prefix_len <= prebuilt->mysql_row_len);
3469
3470 UNIV_MEM_ASSERT_W(buf, prebuilt->mysql_row_len);
3471
3472 cached_rec = prebuilt->fetch_cache[prebuilt->fetch_cache_first];
3473
3474 if (UNIV_UNLIKELY(prebuilt->keep_other_fields_on_keyread)) {
3475 /* Copy cache record field by field, don't touch fields that
3476 are not covered by current key */
3477
3478 for (i = 0; i < prebuilt->n_template; i++) {
3479 templ = prebuilt->mysql_template + i;
3480 row_sel_copy_cached_field_for_mysql(
3481 buf, cached_rec, templ);
3482 /* Copy NULL bit of the current field from cached_rec
3483 to buf */
3484 if (templ->mysql_null_bit_mask) {
3485 buf[templ->mysql_null_byte_offset]
3486 ^= (buf[templ->mysql_null_byte_offset]
3487 ^ cached_rec[templ->mysql_null_byte_offset])
3488 & (byte) templ->mysql_null_bit_mask;
3489 }
3490 }
3491 } else if (prebuilt->mysql_prefix_len > 63) {
3492 /* The record is long. Copy it field by field, in case
3493 there are some long VARCHAR column of which only a
3494 small length is being used. */
3495 UNIV_MEM_INVALID(buf, prebuilt->mysql_prefix_len);
3496
3497 /* First copy the NULL bits. */
3498 ut_memcpy(buf, cached_rec, prebuilt->null_bitmap_len);
3499 /* Then copy the requested fields. */
3500
3501 for (i = 0; i < prebuilt->n_template; i++) {
3502 row_sel_copy_cached_field_for_mysql(
3503 buf, cached_rec, prebuilt->mysql_template + i);
3504 }
3505 } else {
3506 ut_memcpy(buf, cached_rec, prebuilt->mysql_prefix_len);
3507 }
3508
3509 prebuilt->n_fetch_cached--;
3510 prebuilt->fetch_cache_first++;
3511
3512 if (prebuilt->n_fetch_cached == 0) {
3513 prebuilt->fetch_cache_first = 0;
3514 }
3515 }
3516
3517 /********************************************************************//**
3518 Initialise the prefetch cache. */
3519 UNIV_INLINE
3520 void
row_sel_prefetch_cache_init(row_prebuilt_t * prebuilt)3521 row_sel_prefetch_cache_init(
3522 /*========================*/
3523 row_prebuilt_t* prebuilt) /*!< in/out: prebuilt struct */
3524 {
3525 ulint i;
3526 ulint sz;
3527 byte* ptr;
3528
3529 /* Reserve space for the magic number. */
3530 sz = UT_ARR_SIZE(prebuilt->fetch_cache) * (prebuilt->mysql_row_len + 8);
3531 ptr = static_cast<byte*>(mem_alloc(sz));
3532
3533 for (i = 0; i < UT_ARR_SIZE(prebuilt->fetch_cache); i++) {
3534
3535 /* A user has reported memory corruption in these
3536 buffers in Linux. Put magic numbers there to help
3537 to track a possible bug. */
3538
3539 mach_write_to_4(ptr, ROW_PREBUILT_FETCH_MAGIC_N);
3540 ptr += 4;
3541
3542 prebuilt->fetch_cache[i] = ptr;
3543 ptr += prebuilt->mysql_row_len;
3544
3545 mach_write_to_4(ptr, ROW_PREBUILT_FETCH_MAGIC_N);
3546 ptr += 4;
3547 }
3548 }
3549
3550 /********************************************************************//**
3551 Get the last fetch cache buffer from the queue.
3552 @return pointer to buffer. */
3553 UNIV_INLINE
3554 byte*
row_sel_fetch_last_buf(row_prebuilt_t * prebuilt)3555 row_sel_fetch_last_buf(
3556 /*===================*/
3557 row_prebuilt_t* prebuilt) /*!< in/out: prebuilt struct */
3558 {
3559 ut_ad(!prebuilt->templ_contains_blob);
3560 ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
3561
3562 if (prebuilt->fetch_cache[0] == NULL) {
3563 /* Allocate memory for the fetch cache */
3564 ut_ad(prebuilt->n_fetch_cached == 0);
3565
3566 row_sel_prefetch_cache_init(prebuilt);
3567 }
3568
3569 ut_ad(prebuilt->fetch_cache_first == 0);
3570 UNIV_MEM_INVALID(prebuilt->fetch_cache[prebuilt->n_fetch_cached],
3571 prebuilt->mysql_row_len);
3572
3573 return(prebuilt->fetch_cache[prebuilt->n_fetch_cached]);
3574 }
3575
3576 /********************************************************************//**
3577 Pushes a row for MySQL to the fetch cache. */
3578 UNIV_INLINE
3579 void
row_sel_enqueue_cache_row_for_mysql(byte * mysql_rec,row_prebuilt_t * prebuilt)3580 row_sel_enqueue_cache_row_for_mysql(
3581 /*================================*/
3582 byte* mysql_rec, /*!< in/out: MySQL record */
3583 row_prebuilt_t* prebuilt) /*!< in/out: prebuilt struct */
3584 {
3585 /* For non ICP code path the row should already exist in the
3586 next fetch cache slot. */
3587
3588 if (prebuilt->idx_cond != NULL) {
3589 byte* dest = row_sel_fetch_last_buf(prebuilt);
3590
3591 ut_memcpy(dest, mysql_rec, prebuilt->mysql_row_len);
3592 }
3593
3594 ++prebuilt->n_fetch_cached;
3595 }
3596
/*********************************************************************//**
Tries to do a shortcut to fetch a clustered index record with a unique key,
using the hash index if possible (not always). We assume that the search
mode is PAGE_CUR_GE, it is a consistent read, there is a read view in trx,
btr search latch has been locked in S-mode if AHI is enabled.
@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
static
ulint
row_sel_try_search_shortcut_for_mysql(
/*==================================*/
	const rec_t**	out_rec,/*!< out: record if found */
	row_prebuilt_t*	prebuilt,/*!< in: prebuilt struct */
	ulint**		offsets,/*!< in/out: for rec_get_offsets(*out_rec) */
	mem_heap_t**	heap,	/*!< in/out: heap for rec_get_offsets() */
	mtr_t*		mtr)	/*!< in: started mtr */
{
	dict_index_t*	index		= prebuilt->index;
	const dtuple_t*	search_tuple	= prebuilt->search_tuple;
	btr_pcur_t*	pcur		= &prebuilt->pcur;
	trx_t*		trx		= prebuilt->trx;
	const rec_t*	rec;

	ut_ad(dict_index_is_clust(index));
	ut_ad(!prebuilt->templ_contains_blob);

	/* Position pcur on the first record >= search_tuple.  Pass
	RW_S_LATCH only when the trx already holds the AHI search latch
	(never under UNIV_SEARCH_DEBUG builds). */
#ifndef UNIV_SEARCH_DEBUG
	btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
				   BTR_SEARCH_LEAF, pcur,
				   (trx->has_search_latch)
				   ? RW_S_LATCH
				   : 0,
				   mtr);
#else /* UNIV_SEARCH_DEBUG */
	btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
				   BTR_SEARCH_LEAF, pcur,
				   0,
				   mtr);
#endif /* UNIV_SEARCH_DEBUG */
	rec = btr_pcur_get_rec(pcur);

	if (!page_rec_is_user_rec(rec)) {

		/* Landed on infimum/supremum: cannot decide here, let the
		caller fall back to the general search path. */
		return(SEL_RETRY);
	}

	/* As the cursor is now placed on a user record after a search with
	the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
	fields in the user record matched to the search tuple */

	if (btr_pcur_get_up_match(pcur) < dtuple_get_n_fields(search_tuple)) {

		/* Not a full match on the unique key: no such row. */
		return(SEL_EXHAUSTED);
	}

	/* This is a non-locking consistent read: if necessary, fetch
	a previous version of the record */

	*offsets = rec_get_offsets(rec, index, *offsets,
				   ULINT_UNDEFINED, heap);

	if (!lock_clust_rec_cons_read_sees(rec, index,
					   *offsets, trx->read_view)) {

		/* The newest version is not visible in our read view:
		building an old version requires the slow path. */
		return(SEL_RETRY);
	}

	if (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))) {

		/* The visible version is delete-marked: row does not
		exist for this query. */
		return(SEL_EXHAUSTED);
	}

	*out_rec = rec;

	return(SEL_FOUND);
}
3672
/*********************************************************************//**
Check a pushed-down index condition.
@return ICP_NO_MATCH, ICP_MATCH, or ICP_OUT_OF_RANGE */
static
enum icp_result
row_search_idx_cond_check(
/*======================*/
	byte*			mysql_rec,	/*!< out: record
						in MySQL format (invalid unless
						prebuilt->idx_cond!=NULL and
						we return ICP_MATCH) */
	row_prebuilt_t*		prebuilt,	/*!< in/out: prebuilt struct
						for the table handle */
	const rec_t*		rec,		/*!< in: InnoDB record */
	const ulint*		offsets)	/*!< in: rec_get_offsets() */
{
	enum icp_result	result;
	ulint		i;

	ut_ad(rec_offs_validate(rec, prebuilt->index, offsets));

	if (!prebuilt->idx_cond) {
		/* No index condition was pushed down: every row matches. */
		return(ICP_MATCH);
	}

	MONITOR_INC(MONITOR_ICP_ATTEMPTS);

	/* Convert to MySQL format those fields that are needed for
	evaluating the index condition. */

	if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
		mem_heap_empty(prebuilt->blob_heap);
	}

	for (i = 0; i < prebuilt->idx_cond_n_cols; i++) {
		const mysql_row_templ_t*templ = &prebuilt->mysql_template[i];

		/* Conversion failure of any needed column means the row
		cannot satisfy the condition. */
		if (!row_sel_store_mysql_field(mysql_rec, prebuilt,
					       rec, prebuilt->index, offsets,
					       templ->icp_rec_field_no,
					       templ, ULINT_UNDEFINED)) {
			return(ICP_NO_MATCH);
		}
	}

	/* We assume that the index conditions on
	case-insensitive columns are case-insensitive. The
	case of such columns may be wrong in a secondary
	index, if the case of the column has been updated in
	the past, or a record has been deleted and a record
	inserted in a different case. */
	result = innobase_index_cond(prebuilt->idx_cond);
	switch (result) {
	case ICP_MATCH:
		/* Convert the remaining fields to MySQL format.
		If this is a secondary index record, we must defer
		this until we have fetched the clustered index record. */
		if (!prebuilt->need_to_access_clustered
		    || dict_index_is_clust(prebuilt->index)) {
			if (!row_sel_store_mysql_rec(
				    mysql_rec, prebuilt, rec, FALSE,
				    prebuilt->index, offsets, false)) {
				/* Conversion can only fail here for a
				secondary index record. */
				ut_ad(dict_index_is_clust(prebuilt->index));
				return(ICP_NO_MATCH);
			}
		}
		MONITOR_INC(MONITOR_ICP_MATCH);
		return(result);
	case ICP_NO_MATCH:
		MONITOR_INC(MONITOR_ICP_NO_MATCH);
		return(result);
	case ICP_OUT_OF_RANGE:
		MONITOR_INC(MONITOR_ICP_OUT_OF_RANGE);
		return(result);
	}

	/* innobase_index_cond() returned an unexpected value. */
	ut_error;
	return(result);
}
3752
3753 /** Check the pushed down end range condition to avoid extra traversal
3754 if records are not within view and also to avoid prefetching in the
3755 cache buffer.
3756 @param[in] mysql_rec record in MySQL format
3757 @param[in,out] handler the MySQL handler performing the scan
3758 @retval true if the row in mysql_rec is out of range
3759 @retval false if the row in mysql_rec is in range */
3760 static
3761 bool
row_search_end_range_check(const byte * mysql_rec,ha_innobase * handler)3762 row_search_end_range_check(
3763 const byte* mysql_rec,
3764 ha_innobase* handler)
3765 {
3766 if (handler->end_range &&
3767 handler->compare_key_in_buffer(mysql_rec) > 0) {
3768 return(true);
3769 }
3770
3771 return(false);
3772 }
3773
3774 /********************************************************************//**
3775 Searches for rows in the database. This is used in the interface to
3776 MySQL. This function opens a cursor, and also implements fetch next
3777 and fetch prev. NOTE that if we do a search with a full key value
3778 from a unique index (ROW_SEL_EXACT), then we will not store the cursor
3779 position and fetch next or fetch prev must not be tried to the cursor!
3780 @return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK,
3781 DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */
3782 UNIV_INTERN
3783 dberr_t
row_search_for_mysql(byte * buf,ulint mode,row_prebuilt_t * prebuilt,ulint match_mode,ulint direction)3784 row_search_for_mysql(
3785 /*=================*/
3786 byte* buf, /*!< in/out: buffer for the fetched
3787 row in the MySQL format */
3788 ulint mode, /*!< in: search mode PAGE_CUR_L, ... */
3789 row_prebuilt_t* prebuilt, /*!< in: prebuilt struct for the
3790 table handle; this contains the info
3791 of search_tuple, index; if search
3792 tuple contains 0 fields then we
3793 position the cursor at the start or
3794 the end of the index, depending on
3795 'mode' */
3796 ulint match_mode, /*!< in: 0 or ROW_SEL_EXACT or
3797 ROW_SEL_EXACT_PREFIX */
3798 ulint direction) /*!< in: 0 or ROW_SEL_NEXT or
3799 ROW_SEL_PREV; NOTE: if this is != 0,
3800 then prebuilt must have a pcur
3801 with stored position! In opening of a
3802 cursor 'direction' should be 0. */
3803 {
3804 dict_index_t* index = prebuilt->index;
3805 ibool comp = dict_table_is_comp(index->table);
3806 const dtuple_t* search_tuple = prebuilt->search_tuple;
3807 btr_pcur_t* pcur = &prebuilt->pcur;
3808 trx_t* trx = prebuilt->trx;
3809 dict_index_t* clust_index;
3810 que_thr_t* thr;
3811 const rec_t* prev_rec = NULL;
3812 const rec_t* rec = NULL;
3813 byte* end_range_cache = NULL;
3814 const rec_t* result_rec = NULL;
3815 const rec_t* clust_rec;
3816 dberr_t err = DB_SUCCESS;
3817 ibool unique_search = FALSE;
3818 ibool mtr_has_extra_clust_latch = FALSE;
3819 ibool moves_up = FALSE;
3820 ibool set_also_gap_locks = TRUE;
3821 /* if the query is a plain locking SELECT, and the isolation level
3822 is <= TRX_ISO_READ_COMMITTED, then this is set to FALSE */
3823 ibool did_semi_consistent_read = FALSE;
3824 /* if the returned record was locked and we did a semi-consistent
3825 read (fetch the newest committed version), then this is set to
3826 TRUE */
3827 #ifdef UNIV_SEARCH_DEBUG
3828 ulint cnt = 0;
3829 #endif /* UNIV_SEARCH_DEBUG */
3830 ulint next_offs;
3831 ibool same_user_rec;
3832 mtr_t mtr;
3833 mem_heap_t* heap = NULL;
3834 ulint offsets_[REC_OFFS_NORMAL_SIZE];
3835 ulint* offsets = offsets_;
3836 ibool table_lock_waited = FALSE;
3837 byte* next_buf = 0;
3838 ulint end_loop = 0;
3839
3840 rec_offs_init(offsets_);
3841
3842 ut_ad(index && pcur && search_tuple);
3843
3844 /* We don't support FTS queries from the HANDLER interfaces, because
3845 we implemented FTS as reversed inverted index with auxiliary tables.
3846 So anything related to traditional index query would not apply to
3847 it. */
3848 if (index->type & DICT_FTS) {
3849 return(DB_END_OF_INDEX);
3850 }
3851
3852 #ifdef UNIV_SYNC_DEBUG
3853 ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
3854 #endif /* UNIV_SYNC_DEBUG */
3855
3856 if (dict_table_is_discarded(prebuilt->table)) {
3857
3858 return(DB_TABLESPACE_DELETED);
3859
3860 } else if (prebuilt->table->ibd_file_missing) {
3861
3862 return(DB_TABLESPACE_NOT_FOUND);
3863
3864 } else if (!prebuilt->index_usable) {
3865
3866 return(DB_MISSING_HISTORY);
3867
3868 } else if (dict_index_is_corrupted(index)) {
3869
3870 return(DB_CORRUPTION);
3871
3872 } else if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) {
3873 fprintf(stderr,
3874 "InnoDB: Error: trying to free a corrupt\n"
3875 "InnoDB: table handle. Magic n %lu, table name ",
3876 (ulong) prebuilt->magic_n);
3877 ut_print_name(stderr, trx, TRUE, prebuilt->table->name);
3878 putc('\n', stderr);
3879
3880 mem_analyze_corruption(prebuilt);
3881
3882 ut_error;
3883 }
3884
3885 #if 0
3886 /* August 19, 2005 by Heikki: temporarily disable this error
3887 print until the cursor lock count is done correctly.
3888 See bugs #12263 and #12456!*/
3889
3890 if (trx->n_mysql_tables_in_use == 0
3891 && UNIV_UNLIKELY(prebuilt->select_lock_type == LOCK_NONE)) {
3892 /* Note that if MySQL uses an InnoDB temp table that it
3893 created inside LOCK TABLES, then n_mysql_tables_in_use can
3894 be zero; in that case select_lock_type is set to LOCK_X in
3895 ::start_stmt. */
3896
3897 fputs("InnoDB: Error: MySQL is trying to perform a SELECT\n"
3898 "InnoDB: but it has not locked"
3899 " any tables in ::external_lock()!\n",
3900 stderr);
3901 trx_print(stderr, trx, 600);
3902 fputc('\n', stderr);
3903 }
3904 #endif
3905
3906 #if 0
3907 fprintf(stderr, "Match mode %lu\n search tuple ",
3908 (ulong) match_mode);
3909 dtuple_print(search_tuple);
3910 fprintf(stderr, "N tables locked %lu\n",
3911 (ulong) trx->mysql_n_tables_locked);
3912 #endif
3913 /*-------------------------------------------------------------*/
3914 /* PHASE 0: Release a possible s-latch we are holding on the
3915 adaptive hash index latch if there is someone waiting behind */
3916
3917 if (UNIV_UNLIKELY(rw_lock_get_writer(&btr_search_latch) != RW_LOCK_NOT_LOCKED)
3918 && trx->has_search_latch) {
3919
3920 /* There is an x-latch request on the adaptive hash index:
3921 release the s-latch to reduce starvation and wait for
3922 BTR_SEA_TIMEOUT rounds before trying to keep it again over
3923 calls from MySQL */
3924
3925 rw_lock_s_unlock(&btr_search_latch);
3926 trx->has_search_latch = FALSE;
3927
3928 trx->search_latch_timeout = BTR_SEA_TIMEOUT;
3929 }
3930
3931 /* Reset the new record lock info if srv_locks_unsafe_for_binlog
3932 is set or session is using a READ COMMITED isolation level. Then
3933 we are able to remove the record locks set here on an individual
3934 row. */
3935 prebuilt->new_rec_locks = 0;
3936
3937 /*-------------------------------------------------------------*/
3938 /* PHASE 1: Try to pop the row from the prefetch cache */
3939
3940 if (UNIV_UNLIKELY(direction == 0)) {
3941 trx->op_info = "starting index read";
3942
3943 prebuilt->n_rows_fetched = 0;
3944 prebuilt->n_fetch_cached = 0;
3945 prebuilt->fetch_cache_first = 0;
3946 prebuilt->end_range = false;
3947
3948 if (prebuilt->sel_graph == NULL) {
3949 /* Build a dummy select query graph */
3950 row_prebuild_sel_graph(prebuilt);
3951 }
3952 } else {
3953 trx->op_info = "fetching rows";
3954
3955 if (prebuilt->n_rows_fetched == 0) {
3956 prebuilt->fetch_direction = direction;
3957 }
3958
3959 if (UNIV_UNLIKELY(direction != prebuilt->fetch_direction)) {
3960 if (UNIV_UNLIKELY(prebuilt->n_fetch_cached > 0)) {
3961 ut_error;
3962 /* TODO: scrollable cursor: restore cursor to
3963 the place of the latest returned row,
3964 or better: prevent caching for a scroll
3965 cursor! */
3966 }
3967
3968 prebuilt->n_rows_fetched = 0;
3969 prebuilt->n_fetch_cached = 0;
3970 prebuilt->fetch_cache_first = 0;
3971
3972 } else if (UNIV_LIKELY(prebuilt->n_fetch_cached > 0)) {
3973 row_sel_dequeue_cached_row_for_mysql(buf, prebuilt);
3974
3975 prebuilt->n_rows_fetched++;
3976
3977 err = DB_SUCCESS;
3978 goto func_exit;
3979 } else if (prebuilt->end_range == true) {
3980 prebuilt->end_range = false;
3981 err = DB_RECORD_NOT_FOUND;
3982 goto func_exit;
3983 }
3984
3985 if (prebuilt->fetch_cache_first > 0
3986 && prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) {
3987
3988 /* The previous returned row was popped from the fetch
3989 cache, but the cache was not full at the time of the
3990 popping: no more rows can exist in the result set */
3991
3992 err = DB_RECORD_NOT_FOUND;
3993 goto func_exit;
3994 }
3995
3996 prebuilt->n_rows_fetched++;
3997
3998 if (prebuilt->n_rows_fetched > 1000000000) {
3999 /* Prevent wrap-over */
4000 prebuilt->n_rows_fetched = 500000000;
4001 }
4002
4003 mode = pcur->search_mode;
4004 }
4005
4006 /* In a search where at most one record in the index may match, we
4007 can use a LOCK_REC_NOT_GAP type record lock when locking a
4008 non-delete-marked matching record.
4009
4010 Note that in a unique secondary index there may be different
4011 delete-marked versions of a record where only the primary key
4012 values differ: thus in a secondary index we must use next-key
4013 locks when locking delete-marked records. */
4014
4015 if (match_mode == ROW_SEL_EXACT
4016 && dict_index_is_unique(index)
4017 && dtuple_get_n_fields(search_tuple)
4018 == dict_index_get_n_unique(index)
4019 && (dict_index_is_clust(index)
4020 || !dtuple_contains_null(search_tuple))) {
4021
4022 /* Note above that a UNIQUE secondary index can contain many
4023 rows with the same key value if one of the columns is the SQL
4024 null. A clustered index under MySQL can never contain null
4025 columns because we demand that all the columns in primary key
4026 are non-null. */
4027
4028 unique_search = TRUE;
4029
4030 /* Even if the condition is unique, MySQL seems to try to
4031 retrieve also a second row if a primary key contains more than
4032 1 column. Return immediately if this is not a HANDLER
4033 command. */
4034
4035 if (UNIV_UNLIKELY(direction != 0
4036 && !prebuilt->used_in_HANDLER)) {
4037
4038 err = DB_RECORD_NOT_FOUND;
4039 goto func_exit;
4040 }
4041 }
4042
4043 mtr_start(&mtr);
4044
4045 /*-------------------------------------------------------------*/
4046 /* PHASE 2: Try fast adaptive hash index search if possible */
4047
4048 /* Next test if this is the special case where we can use the fast
4049 adaptive hash index to try the search. Since we must release the
4050 search system latch when we retrieve an externally stored field, we
4051 cannot use the adaptive hash index in a search in the case the row
4052 may be long and there may be externally stored fields */
4053
4054 if (UNIV_UNLIKELY(direction == 0)
4055 && unique_search
4056 && dict_index_is_clust(index)
4057 && !prebuilt->templ_contains_blob
4058 && !prebuilt->used_in_HANDLER
4059 && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)
4060 && !prebuilt->innodb_api) {
4061
4062 mode = PAGE_CUR_GE;
4063
4064 if (trx->mysql_n_tables_locked == 0
4065 && prebuilt->select_lock_type == LOCK_NONE
4066 && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
4067 && trx->read_view) {
4068
4069 /* This is a SELECT query done as a consistent read,
4070 and the read view has already been allocated:
4071 let us try a search shortcut through the hash
4072 index.
4073 NOTE that we must also test that
4074 mysql_n_tables_locked == 0, because this might
4075 also be INSERT INTO ... SELECT ... or
4076 CREATE TABLE ... SELECT ... . Our algorithm is
4077 NOT prepared to inserts interleaved with the SELECT,
4078 and if we try that, we can deadlock on the adaptive
4079 hash index semaphore! */
4080
4081 #ifndef UNIV_SEARCH_DEBUG
4082 if (!trx->has_search_latch) {
4083 rw_lock_s_lock(&btr_search_latch);
4084 trx->has_search_latch = TRUE;
4085 }
4086 #endif
4087 switch (row_sel_try_search_shortcut_for_mysql(
4088 &rec, prebuilt, &offsets, &heap,
4089 &mtr)) {
4090 case SEL_FOUND:
4091 #ifdef UNIV_SEARCH_DEBUG
4092 ut_a(0 == cmp_dtuple_rec(search_tuple,
4093 rec, offsets));
4094 #endif
4095 /* At this point, rec is protected by
4096 a page latch that was acquired by
4097 row_sel_try_search_shortcut_for_mysql().
4098 The latch will not be released until
4099 mtr_commit(&mtr). */
4100 ut_ad(!rec_get_deleted_flag(rec, comp));
4101
4102 if (prebuilt->idx_cond) {
4103 switch (row_search_idx_cond_check(
4104 buf, prebuilt,
4105 rec, offsets)) {
4106 case ICP_NO_MATCH:
4107 case ICP_OUT_OF_RANGE:
4108 goto shortcut_mismatch;
4109 case ICP_MATCH:
4110 goto shortcut_match;
4111 }
4112 }
4113
4114 if (!row_sel_store_mysql_rec(
4115 buf, prebuilt,
4116 rec, FALSE, index,
4117 offsets, false)) {
4118 /* Only fresh inserts may contain
4119 incomplete externally stored
4120 columns. Pretend that such
4121 records do not exist. Such
4122 records may only be accessed
4123 at the READ UNCOMMITTED
4124 isolation level or when
4125 rolling back a recovered
4126 transaction. Rollback happens
4127 at a lower level, not here. */
4128
4129 /* Proceed as in case SEL_RETRY. */
4130 break;
4131 }
4132
4133 shortcut_match:
4134 mtr_commit(&mtr);
4135
4136 /* ut_print_name(stderr, index->name);
4137 fputs(" shortcut\n", stderr); */
4138
4139 err = DB_SUCCESS;
4140 goto release_search_latch_if_needed;
4141
4142 case SEL_EXHAUSTED:
4143 shortcut_mismatch:
4144 mtr_commit(&mtr);
4145
4146 /* ut_print_name(stderr, index->name);
4147 fputs(" record not found 2\n", stderr); */
4148
4149 err = DB_RECORD_NOT_FOUND;
4150 release_search_latch_if_needed:
4151 if (trx->search_latch_timeout > 0
4152 && trx->has_search_latch) {
4153
4154 trx->search_latch_timeout--;
4155
4156 rw_lock_s_unlock(&btr_search_latch);
4157 trx->has_search_latch = FALSE;
4158 }
4159
4160 /* NOTE that we do NOT store the cursor
4161 position */
4162 goto func_exit;
4163
4164 case SEL_RETRY:
4165 break;
4166
4167 default:
4168 ut_ad(0);
4169 }
4170
4171 mtr_commit(&mtr);
4172 mtr_start(&mtr);
4173 }
4174 }
4175
4176 /*-------------------------------------------------------------*/
4177 /* PHASE 3: Open or restore index cursor position */
4178
4179 if (trx->has_search_latch) {
4180 rw_lock_s_unlock(&btr_search_latch);
4181 trx->has_search_latch = FALSE;
4182 }
4183
4184 /* The state of a running trx can only be changed by the
4185 thread that is currently serving the transaction. Because we
4186 are that thread, we can read trx->state without holding any
4187 mutex. */
4188 ut_ad(prebuilt->sql_stat_start || trx->state == TRX_STATE_ACTIVE);
4189
4190 ut_ad(trx->state == TRX_STATE_NOT_STARTED
4191 || trx->state == TRX_STATE_ACTIVE);
4192
4193 ut_ad(prebuilt->sql_stat_start
4194 || prebuilt->select_lock_type != LOCK_NONE
4195 || trx->read_view);
4196
4197 trx_start_if_not_started(trx);
4198
4199 if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
4200 && prebuilt->select_lock_type != LOCK_NONE
4201 && trx->mysql_thd != NULL
4202 && thd_is_select(trx->mysql_thd)) {
4203 /* It is a plain locking SELECT and the isolation
4204 level is low: do not lock gaps */
4205
4206 set_also_gap_locks = FALSE;
4207 }
4208
4209 /* Note that if the search mode was GE or G, then the cursor
4210 naturally moves upward (in fetch next) in alphabetical order,
4211 otherwise downward */
4212
4213 if (UNIV_UNLIKELY(direction == 0)) {
4214 if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G) {
4215 moves_up = TRUE;
4216 }
4217 } else if (direction == ROW_SEL_NEXT) {
4218 moves_up = TRUE;
4219 }
4220
4221 thr = que_fork_get_first_thr(prebuilt->sel_graph);
4222
4223 que_thr_move_to_run_state_for_mysql(thr, trx);
4224
4225 clust_index = dict_table_get_first_index(index->table);
4226
4227 /* Do some start-of-statement preparations */
4228
4229 if (!prebuilt->sql_stat_start) {
4230 /* No need to set an intention lock or assign a read view */
4231
4232 if (UNIV_UNLIKELY
4233 (trx->read_view == NULL
4234 && prebuilt->select_lock_type == LOCK_NONE)) {
4235
4236 fputs("InnoDB: Error: MySQL is trying to"
4237 " perform a consistent read\n"
4238 "InnoDB: but the read view is not assigned!\n",
4239 stderr);
4240 trx_print(stderr, trx, 600);
4241 fputc('\n', stderr);
4242 ut_error;
4243 }
4244 } else if (prebuilt->select_lock_type == LOCK_NONE) {
4245 /* This is a consistent read */
4246 /* Assign a read view for the query */
4247
4248 trx_assign_read_view(trx);
4249 prebuilt->sql_stat_start = FALSE;
4250 } else {
4251 wait_table_again:
4252 err = lock_table(0, index->table,
4253 prebuilt->select_lock_type == LOCK_S
4254 ? LOCK_IS : LOCK_IX, thr);
4255
4256 if (err != DB_SUCCESS) {
4257
4258 table_lock_waited = TRUE;
4259 goto lock_table_wait;
4260 }
4261 prebuilt->sql_stat_start = FALSE;
4262 }
4263
4264 /* Open or restore index cursor position */
4265
4266 if (UNIV_LIKELY(direction != 0)) {
4267 ibool need_to_process = sel_restore_position_for_mysql(
4268 &same_user_rec, BTR_SEARCH_LEAF,
4269 pcur, moves_up, &mtr);
4270
4271 if (UNIV_UNLIKELY(need_to_process)) {
4272 if (UNIV_UNLIKELY(prebuilt->row_read_type
4273 == ROW_READ_DID_SEMI_CONSISTENT)) {
4274 /* We did a semi-consistent read,
4275 but the record was removed in
4276 the meantime. */
4277 prebuilt->row_read_type
4278 = ROW_READ_TRY_SEMI_CONSISTENT;
4279 }
4280 } else if (UNIV_LIKELY(prebuilt->row_read_type
4281 != ROW_READ_DID_SEMI_CONSISTENT)) {
4282
4283 /* The cursor was positioned on the record
4284 that we returned previously. If we need
4285 to repeat a semi-consistent read as a
4286 pessimistic locking read, the record
4287 cannot be skipped. */
4288
4289 goto next_rec;
4290 }
4291
4292 } else if (dtuple_get_n_fields(search_tuple) > 0) {
4293
4294 btr_pcur_open_with_no_init(index, search_tuple, mode,
4295 BTR_SEARCH_LEAF,
4296 pcur, 0, &mtr);
4297
4298 pcur->trx_if_known = trx;
4299
4300 rec = btr_pcur_get_rec(pcur);
4301
4302 if (!moves_up
4303 && !page_rec_is_supremum(rec)
4304 && set_also_gap_locks
4305 && !(srv_locks_unsafe_for_binlog
4306 || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
4307 && prebuilt->select_lock_type != LOCK_NONE) {
4308
4309 /* Try to place a gap lock on the next index record
4310 to prevent phantoms in ORDER BY ... DESC queries */
4311 const rec_t* next_rec = page_rec_get_next_const(rec);
4312
4313 offsets = rec_get_offsets(next_rec, index, offsets,
4314 ULINT_UNDEFINED, &heap);
4315 err = sel_set_rec_lock(btr_pcur_get_block(pcur),
4316 next_rec, index, offsets,
4317 prebuilt->select_lock_type,
4318 LOCK_GAP, thr);
4319
4320 switch (err) {
4321 case DB_SUCCESS_LOCKED_REC:
4322 err = DB_SUCCESS;
4323 case DB_SUCCESS:
4324 break;
4325 default:
4326 goto lock_wait_or_error;
4327 }
4328 }
4329 } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_L) {
4330 btr_pcur_open_at_index_side(
4331 mode == PAGE_CUR_G, index, BTR_SEARCH_LEAF,
4332 pcur, false, 0, &mtr);
4333 }
4334
4335 rec_loop:
4336 DEBUG_SYNC_C("row_search_rec_loop");
4337 if (trx_is_interrupted(trx)) {
4338 btr_pcur_store_position(pcur, &mtr);
4339 err = DB_INTERRUPTED;
4340 goto normal_return;
4341 }
4342
4343 /*-------------------------------------------------------------*/
4344 /* PHASE 4: Look for matching records in a loop */
4345
4346 rec = btr_pcur_get_rec(pcur);
4347 ut_ad(!!page_rec_is_comp(rec) == comp);
4348 #ifdef UNIV_SEARCH_DEBUG
4349 /*
4350 fputs("Using ", stderr);
4351 dict_index_name_print(stderr, trx, index);
4352 fprintf(stderr, " cnt %lu ; Page no %lu\n", cnt,
4353 page_get_page_no(page_align(rec)));
4354 rec_print(stderr, rec, index);
4355 printf("delete-mark: %lu\n",
4356 rec_get_deleted_flag(rec, page_rec_is_comp(rec)));
4357 */
4358 #endif /* UNIV_SEARCH_DEBUG */
4359
4360 if (page_rec_is_infimum(rec)) {
4361
4362 /* The infimum record on a page cannot be in the result set,
4363 and neither can a record lock be placed on it: we skip such
4364 a record. */
4365
4366 prev_rec = NULL;
4367 goto next_rec;
4368 }
4369
4370 if (page_rec_is_supremum(rec)) {
4371
4372 DBUG_EXECUTE_IF("compare_end_range",
4373 if (end_loop < 100) {
4374 end_loop = 100;
4375 });
4376 /** Compare the last record of the page with end range
4377 passed to InnoDB when there is no ICP and number of loops
4378 in row_search_for_mysql for rows found but not
4379 reporting due to search views etc. */
4380 if (prev_rec != NULL && !prebuilt->innodb_api
4381 && prebuilt->mysql_handler->end_range != NULL
4382 && prebuilt->idx_cond == NULL
4383 && end_loop >= 100) {
4384
4385 dict_index_t* key_index = prebuilt->index;
4386 bool clust_templ_for_sec = false;
4387
4388 if (end_range_cache == NULL) {
4389 end_range_cache = static_cast<byte*>(
4390 ut_malloc(prebuilt->mysql_row_len));
4391 }
4392
4393 if (index != clust_index
4394 && prebuilt->need_to_access_clustered) {
4395 /** Secondary index record but the template
4396 based on PK. */
4397 key_index = clust_index;
4398 clust_templ_for_sec = true;
4399 }
4400
4401 /** Create offsets based on prebuilt index. */
4402 offsets = rec_get_offsets(prev_rec, prebuilt->index,
4403 offsets, ULINT_UNDEFINED, &heap);
4404
4405 if (row_sel_store_mysql_rec(
4406 end_range_cache, prebuilt, prev_rec,
4407 clust_templ_for_sec, key_index, offsets,
4408 clust_templ_for_sec)) {
4409
4410 if (row_search_end_range_check(
4411 end_range_cache,
4412 prebuilt->mysql_handler)) {
4413
4414 /** In case of prebuilt->fetch,
4415 set the error in prebuilt->end_range. */
4416 if (next_buf != NULL) {
4417 prebuilt->end_range = true;
4418 }
4419
4420 err = DB_RECORD_NOT_FOUND;
4421 goto normal_return;
4422 }
4423 }
4424 }
4425
4426 if (set_also_gap_locks
4427 && !(srv_locks_unsafe_for_binlog
4428 || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
4429 && prebuilt->select_lock_type != LOCK_NONE) {
4430
4431 /* Try to place a lock on the index record */
4432
4433 /* If innodb_locks_unsafe_for_binlog option is used
4434 or this session is using a READ COMMITTED isolation
4435 level we do not lock gaps. Supremum record is really
4436 a gap and therefore we do not set locks there. */
4437
4438 offsets = rec_get_offsets(rec, index, offsets,
4439 ULINT_UNDEFINED, &heap);
4440 err = sel_set_rec_lock(btr_pcur_get_block(pcur),
4441 rec, index, offsets,
4442 prebuilt->select_lock_type,
4443 LOCK_ORDINARY, thr);
4444
4445 switch (err) {
4446 case DB_SUCCESS_LOCKED_REC:
4447 err = DB_SUCCESS;
4448 case DB_SUCCESS:
4449 break;
4450 default:
4451 goto lock_wait_or_error;
4452 }
4453 }
4454 /* A page supremum record cannot be in the result set: skip
4455 it now that we have placed a possible lock on it */
4456
4457 prev_rec = NULL;
4458 goto next_rec;
4459 }
4460
4461 /*-------------------------------------------------------------*/
4462 /* Do sanity checks in case our cursor has bumped into page
4463 corruption */
4464
4465 if (comp) {
4466 next_offs = rec_get_next_offs(rec, TRUE);
4467 if (UNIV_UNLIKELY(next_offs < PAGE_NEW_SUPREMUM)) {
4468
4469 goto wrong_offs;
4470 }
4471 } else {
4472 next_offs = rec_get_next_offs(rec, FALSE);
4473 if (UNIV_UNLIKELY(next_offs < PAGE_OLD_SUPREMUM)) {
4474
4475 goto wrong_offs;
4476 }
4477 }
4478
4479 if (UNIV_UNLIKELY(next_offs >= UNIV_PAGE_SIZE - PAGE_DIR)) {
4480
4481 wrong_offs:
4482 if (srv_force_recovery == 0 || moves_up == FALSE) {
4483 ut_print_timestamp(stderr);
4484 buf_page_print(page_align(rec), 0,
4485 BUF_PAGE_PRINT_NO_CRASH);
4486 fprintf(stderr,
4487 "\nInnoDB: rec address %p,"
4488 " buf block fix count %lu\n",
4489 (void*) rec, (ulong)
4490 btr_cur_get_block(btr_pcur_get_btr_cur(pcur))
4491 ->page.buf_fix_count);
4492 fprintf(stderr,
4493 "InnoDB: Index corruption: rec offs %lu"
4494 " next offs %lu, page no %lu,\n"
4495 "InnoDB: ",
4496 (ulong) page_offset(rec),
4497 (ulong) next_offs,
4498 (ulong) page_get_page_no(page_align(rec)));
4499 dict_index_name_print(stderr, trx, index);
4500 fputs(". Run CHECK TABLE. You may need to\n"
4501 "InnoDB: restore from a backup, or"
4502 " dump + drop + reimport the table.\n",
4503 stderr);
4504 ut_ad(0);
4505 err = DB_CORRUPTION;
4506
4507 goto lock_wait_or_error;
4508 } else {
4509 /* The user may be dumping a corrupt table. Jump
4510 over the corruption to recover as much as possible. */
4511
4512 fprintf(stderr,
4513 "InnoDB: Index corruption: rec offs %lu"
4514 " next offs %lu, page no %lu,\n"
4515 "InnoDB: ",
4516 (ulong) page_offset(rec),
4517 (ulong) next_offs,
4518 (ulong) page_get_page_no(page_align(rec)));
4519 dict_index_name_print(stderr, trx, index);
4520 fputs(". We try to skip the rest of the page.\n",
4521 stderr);
4522
4523 btr_pcur_move_to_last_on_page(pcur, &mtr);
4524
4525 prev_rec = NULL;
4526 goto next_rec;
4527 }
4528 }
4529 /*-------------------------------------------------------------*/
4530
4531 /* Calculate the 'offsets' associated with 'rec' */
4532
4533 ut_ad(fil_page_get_type(btr_pcur_get_page(pcur)) == FIL_PAGE_INDEX);
4534 ut_ad(btr_page_get_index_id(btr_pcur_get_page(pcur)) == index->id);
4535
4536 offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
4537
4538 if (UNIV_UNLIKELY(srv_force_recovery > 0)) {
4539 if (!rec_validate(rec, offsets)
4540 || !btr_index_rec_validate(rec, index, FALSE)) {
4541 fprintf(stderr,
4542 "InnoDB: Index corruption: rec offs %lu"
4543 " next offs %lu, page no %lu,\n"
4544 "InnoDB: ",
4545 (ulong) page_offset(rec),
4546 (ulong) next_offs,
4547 (ulong) page_get_page_no(page_align(rec)));
4548 dict_index_name_print(stderr, trx, index);
4549 fputs(". We try to skip the record.\n",
4550 stderr);
4551
4552 prev_rec = NULL;
4553 goto next_rec;
4554 }
4555 }
4556
4557 prev_rec = rec;
4558
4559 /* Note that we cannot trust the up_match value in the cursor at this
4560 place because we can arrive here after moving the cursor! Thus
4561 we have to recompare rec and search_tuple to determine if they
4562 match enough. */
4563
4564 if (match_mode == ROW_SEL_EXACT) {
4565 /* Test if the index record matches completely to search_tuple
4566 in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */
4567
4568 /* fputs("Comparing rec and search tuple\n", stderr); */
4569
4570 if (0 != cmp_dtuple_rec(search_tuple, rec, offsets)) {
4571
4572 if (set_also_gap_locks
4573 && !(srv_locks_unsafe_for_binlog
4574 || trx->isolation_level
4575 <= TRX_ISO_READ_COMMITTED)
4576 && prebuilt->select_lock_type != LOCK_NONE) {
4577
4578 /* Try to place a gap lock on the index
4579 record only if innodb_locks_unsafe_for_binlog
4580 option is not set or this session is not
4581 using a READ COMMITTED isolation level. */
4582
4583 err = sel_set_rec_lock(
4584 btr_pcur_get_block(pcur),
4585 rec, index, offsets,
4586 prebuilt->select_lock_type, LOCK_GAP,
4587 thr);
4588
4589 switch (err) {
4590 case DB_SUCCESS_LOCKED_REC:
4591 case DB_SUCCESS:
4592 break;
4593 default:
4594 goto lock_wait_or_error;
4595 }
4596 }
4597
4598 btr_pcur_store_position(pcur, &mtr);
4599
4600 /* The found record was not a match, but may be used
4601 as NEXT record (index_next). Set the relative position
4602 to BTR_PCUR_BEFORE, to reflect that the position of
4603 the persistent cursor is before the found/stored row
4604 (pcur->old_rec). */
4605 ut_ad(pcur->rel_pos == BTR_PCUR_ON);
4606 pcur->rel_pos = BTR_PCUR_BEFORE;
4607
4608 err = DB_RECORD_NOT_FOUND;
4609 #if 0
4610 ut_print_name(stderr, trx, FALSE, index->name);
4611 fputs(" record not found 3\n", stderr);
4612 #endif
4613
4614 goto normal_return;
4615 }
4616
4617 } else if (match_mode == ROW_SEL_EXACT_PREFIX) {
4618
4619 if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, offsets)) {
4620
4621 if (set_also_gap_locks
4622 && !(srv_locks_unsafe_for_binlog
4623 || trx->isolation_level
4624 <= TRX_ISO_READ_COMMITTED)
4625 && prebuilt->select_lock_type != LOCK_NONE) {
4626
4627 /* Try to place a gap lock on the index
4628 record only if innodb_locks_unsafe_for_binlog
4629 option is not set or this session is not
4630 using a READ COMMITTED isolation level. */
4631
4632 err = sel_set_rec_lock(
4633 btr_pcur_get_block(pcur),
4634 rec, index, offsets,
4635 prebuilt->select_lock_type, LOCK_GAP,
4636 thr);
4637
4638 switch (err) {
4639 case DB_SUCCESS_LOCKED_REC:
4640 case DB_SUCCESS:
4641 break;
4642 default:
4643 goto lock_wait_or_error;
4644 }
4645 }
4646
4647 btr_pcur_store_position(pcur, &mtr);
4648
4649 /* The found record was not a match, but may be used
4650 as NEXT record (index_next). Set the relative position
4651 to BTR_PCUR_BEFORE, to reflect that the position of
4652 the persistent cursor is before the found/stored row
4653 (pcur->old_rec). */
4654 ut_ad(pcur->rel_pos == BTR_PCUR_ON);
4655 pcur->rel_pos = BTR_PCUR_BEFORE;
4656
4657 err = DB_RECORD_NOT_FOUND;
4658 #if 0
4659 ut_print_name(stderr, trx, FALSE, index->name);
4660 fputs(" record not found 4\n", stderr);
4661 #endif
4662
4663 goto normal_return;
4664 }
4665 }
4666
4667 /* We are ready to look at a possible new index entry in the result
4668 set: the cursor is now placed on a user record */
4669
4670 if (prebuilt->select_lock_type != LOCK_NONE) {
4671 /* Try to place a lock on the index record; note that delete
4672 marked records are a special case in a unique search. If there
4673 is a non-delete marked record, then it is enough to lock its
4674 existence with LOCK_REC_NOT_GAP. */
4675
4676 /* If innodb_locks_unsafe_for_binlog option is used
4677 or this session is using a READ COMMITED isolation
4678 level we lock only the record, i.e., next-key locking is
4679 not used. */
4680
4681 ulint lock_type;
4682
4683 if (!set_also_gap_locks
4684 || srv_locks_unsafe_for_binlog
4685 || trx->isolation_level <= TRX_ISO_READ_COMMITTED
4686 || (unique_search && !rec_get_deleted_flag(rec, comp))) {
4687
4688 goto no_gap_lock;
4689 } else {
4690 lock_type = LOCK_ORDINARY;
4691 }
4692
4693 /* If we are doing a 'greater or equal than a primary key
4694 value' search from a clustered index, and we find a record
4695 that has that exact primary key value, then there is no need
4696 to lock the gap before the record, because no insert in the
4697 gap can be in our search range. That is, no phantom row can
4698 appear that way.
4699
4700 An example: if col1 is the primary key, the search is WHERE
4701 col1 >= 100, and we find a record where col1 = 100, then no
4702 need to lock the gap before that record. */
4703
4704 if (index == clust_index
4705 && mode == PAGE_CUR_GE
4706 && direction == 0
4707 && dtuple_get_n_fields_cmp(search_tuple)
4708 == dict_index_get_n_unique(index)
4709 && 0 == cmp_dtuple_rec(search_tuple, rec, offsets)) {
4710 no_gap_lock:
4711 lock_type = LOCK_REC_NOT_GAP;
4712 }
4713
4714 err = sel_set_rec_lock(btr_pcur_get_block(pcur),
4715 rec, index, offsets,
4716 prebuilt->select_lock_type,
4717 lock_type, thr);
4718
4719 switch (err) {
4720 const rec_t* old_vers;
4721 case DB_SUCCESS_LOCKED_REC:
4722 if (srv_locks_unsafe_for_binlog
4723 || trx->isolation_level
4724 <= TRX_ISO_READ_COMMITTED) {
4725 /* Note that a record of
4726 prebuilt->index was locked. */
4727 prebuilt->new_rec_locks = 1;
4728 }
4729 err = DB_SUCCESS;
4730 case DB_SUCCESS:
4731 break;
4732 case DB_LOCK_WAIT:
4733 /* Never unlock rows that were part of a conflict. */
4734 prebuilt->new_rec_locks = 0;
4735
4736 if (UNIV_LIKELY(prebuilt->row_read_type
4737 != ROW_READ_TRY_SEMI_CONSISTENT)
4738 || unique_search
4739 || index != clust_index) {
4740
4741 goto lock_wait_or_error;
4742 }
4743
4744 /* The following call returns 'offsets'
4745 associated with 'old_vers' */
4746 row_sel_build_committed_vers_for_mysql(
4747 clust_index, prebuilt, rec,
4748 &offsets, &heap, &old_vers, &mtr);
4749
4750 /* Check whether it was a deadlock or not, if not
4751 a deadlock and the transaction had to wait then
4752 release the lock it is waiting on. */
4753
4754 err = lock_trx_handle_wait(trx);
4755
4756 switch (err) {
4757 case DB_SUCCESS:
4758 /* The lock was granted while we were
4759 searching for the last committed version.
4760 Do a normal locking read. */
4761
4762 offsets = rec_get_offsets(
4763 rec, index, offsets, ULINT_UNDEFINED,
4764 &heap);
4765 goto locks_ok;
4766 case DB_DEADLOCK:
4767 goto lock_wait_or_error;
4768 case DB_LOCK_WAIT:
4769 err = DB_SUCCESS;
4770 break;
4771 default:
4772 ut_error;
4773 }
4774
4775 if (old_vers == NULL) {
4776 /* The row was not yet committed */
4777
4778 goto next_rec;
4779 }
4780
4781 did_semi_consistent_read = TRUE;
4782 rec = old_vers;
4783 prev_rec = rec;
4784 break;
4785 default:
4786
4787 goto lock_wait_or_error;
4788 }
4789 } else {
4790 /* This is a non-locking consistent read: if necessary, fetch
4791 a previous version of the record */
4792
4793 if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
4794
4795 /* Do nothing: we let a non-locking SELECT read the
4796 latest version of the record */
4797
4798 } else if (index == clust_index) {
4799
4800 /* Fetch a previous version of the row if the current
4801 one is not visible in the snapshot; if we have a very
4802 high force recovery level set, we try to avoid crashes
4803 by skipping this lookup */
4804
4805 if (UNIV_LIKELY(srv_force_recovery < 5)
4806 && !lock_clust_rec_cons_read_sees(
4807 rec, index, offsets, trx->read_view)) {
4808
4809 rec_t* old_vers;
4810 /* The following call returns 'offsets'
4811 associated with 'old_vers' */
4812 err = row_sel_build_prev_vers_for_mysql(
4813 trx->read_view, clust_index,
4814 prebuilt, rec, &offsets, &heap,
4815 &old_vers, &mtr);
4816
4817 if (err != DB_SUCCESS) {
4818
4819 goto lock_wait_or_error;
4820 }
4821
4822 if (old_vers == NULL) {
4823 /* The row did not exist yet in
4824 the read view */
4825
4826 goto next_rec;
4827 }
4828
4829 rec = old_vers;
4830 prev_rec = rec;
4831 }
4832 } else {
4833 /* We are looking into a non-clustered index,
4834 and to get the right version of the record we
4835 have to look also into the clustered index: this
4836 is necessary, because we can only get the undo
4837 information via the clustered index record. */
4838
4839 ut_ad(!dict_index_is_clust(index));
4840
4841 if (!lock_sec_rec_cons_read_sees(
4842 rec, trx->read_view)) {
4843 /* We should look at the clustered index.
4844 However, as this is a non-locking read,
4845 we can skip the clustered index lookup if
4846 the condition does not match the secondary
4847 index entry. */
4848 switch (row_search_idx_cond_check(
4849 buf, prebuilt, rec, offsets)) {
4850 case ICP_NO_MATCH:
4851 goto next_rec;
4852 case ICP_OUT_OF_RANGE:
4853 err = DB_RECORD_NOT_FOUND;
4854 goto idx_cond_failed;
4855 case ICP_MATCH:
4856 goto requires_clust_rec;
4857 }
4858
4859 ut_error;
4860 }
4861 }
4862 }
4863
4864 locks_ok:
4865 /* NOTE that at this point rec can be an old version of a clustered
4866 index record built for a consistent read. We cannot assume after this
4867 point that rec is on a buffer pool page. Functions like
4868 page_rec_is_comp() cannot be used! */
4869
4870 if (rec_get_deleted_flag(rec, comp)) {
4871
4872 /* The record is delete-marked: we can skip it */
4873
4874 if ((srv_locks_unsafe_for_binlog
4875 || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
4876 && prebuilt->select_lock_type != LOCK_NONE
4877 && !did_semi_consistent_read) {
4878
4879 /* No need to keep a lock on a delete-marked record
4880 if we do not want to use next-key locking. */
4881
4882 row_unlock_for_mysql(prebuilt, TRUE);
4883 }
4884
4885 /* This is an optimization to skip setting the next key lock
4886 on the record that follows this delete-marked record. This
4887 optimization works because of the unique search criteria
4888 which precludes the presence of a range lock between this
4889 delete marked record and the record following it.
4890
4891 For now this is applicable only to clustered indexes while
4892 doing a unique search except for HANDLER queries because
4893 HANDLER allows NEXT and PREV even in unique search on
4894 clustered index. There is scope for further optimization
4895 applicable to unique secondary indexes. Current behaviour is
4896 to widen the scope of a lock on an already delete marked record
4897 if the same record is deleted twice by the same transaction */
4898 if (index == clust_index && unique_search
4899 && !prebuilt->used_in_HANDLER) {
4900
4901 err = DB_RECORD_NOT_FOUND;
4902
4903 goto normal_return;
4904 }
4905
4906 goto next_rec;
4907 }
4908
4909 /* Check if the record matches the index condition. */
4910 switch (row_search_idx_cond_check(buf, prebuilt, rec, offsets)) {
4911 case ICP_NO_MATCH:
4912 if (did_semi_consistent_read) {
4913 row_unlock_for_mysql(prebuilt, TRUE);
4914 }
4915 goto next_rec;
4916 case ICP_OUT_OF_RANGE:
4917 err = DB_RECORD_NOT_FOUND;
4918 goto idx_cond_failed;
4919 case ICP_MATCH:
4920 break;
4921 }
4922
4923 /* Get the clustered index record if needed, if we did not do the
4924 search using the clustered index. */
4925
4926 if (index != clust_index && prebuilt->need_to_access_clustered) {
4927
4928 requires_clust_rec:
4929 ut_ad(index != clust_index);
4930 /* We use a 'goto' to the preceding label if a consistent
4931 read of a secondary index record requires us to look up old
4932 versions of the associated clustered index record. */
4933
4934 ut_ad(rec_offs_validate(rec, index, offsets));
4935
4936 /* It was a non-clustered index and we must fetch also the
4937 clustered index record */
4938
4939 mtr_has_extra_clust_latch = TRUE;
4940
4941 /* The following call returns 'offsets' associated with
4942 'clust_rec'. Note that 'clust_rec' can be an old version
4943 built for a consistent read. */
4944
4945 err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec,
4946 thr, &clust_rec,
4947 &offsets, &heap, &mtr);
4948 switch (err) {
4949 case DB_SUCCESS:
4950 if (clust_rec == NULL) {
4951 /* The record did not exist in the read view */
4952 ut_ad(prebuilt->select_lock_type == LOCK_NONE);
4953
4954 goto next_rec;
4955 }
4956 break;
4957 case DB_SUCCESS_LOCKED_REC:
4958 ut_a(clust_rec != NULL);
4959 if (srv_locks_unsafe_for_binlog
4960 || trx->isolation_level
4961 <= TRX_ISO_READ_COMMITTED) {
4962 /* Note that the clustered index record
4963 was locked. */
4964 prebuilt->new_rec_locks = 2;
4965 }
4966 err = DB_SUCCESS;
4967 break;
4968 default:
4969 goto lock_wait_or_error;
4970 }
4971
4972 if (rec_get_deleted_flag(clust_rec, comp)) {
4973
4974 /* The record is delete marked: we can skip it */
4975
4976 if ((srv_locks_unsafe_for_binlog
4977 || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
4978 && prebuilt->select_lock_type != LOCK_NONE) {
4979
4980 /* No need to keep a lock on a delete-marked
4981 record if we do not want to use next-key
4982 locking. */
4983
4984 row_unlock_for_mysql(prebuilt, TRUE);
4985 }
4986
4987 goto next_rec;
4988 }
4989
4990 result_rec = clust_rec;
4991 ut_ad(rec_offs_validate(result_rec, clust_index, offsets));
4992
4993 if (prebuilt->idx_cond) {
4994 /* Convert the record to MySQL format. We were
4995 unable to do this in row_search_idx_cond_check(),
4996 because the condition is on the secondary index
4997 and the requested column is in the clustered index.
4998 We convert all fields, including those that
4999 may have been used in ICP, because the
5000 secondary index may contain a column prefix
5001 rather than the full column. Also, as noted
5002 in Bug #56680, the column in the secondary
5003 index may be in the wrong case, and the
5004 authoritative case is in result_rec, the
5005 appropriate version of the clustered index record. */
5006 if (!row_sel_store_mysql_rec(
5007 buf, prebuilt, result_rec,
5008 TRUE, clust_index, offsets, false)) {
5009 goto next_rec;
5010 }
5011 }
5012 } else {
5013 result_rec = rec;
5014 }
5015
5016 /* We found a qualifying record 'result_rec'. At this point,
5017 'offsets' are associated with 'result_rec'. */
5018
5019 ut_ad(rec_offs_validate(result_rec,
5020 result_rec != rec ? clust_index : index,
5021 offsets));
5022 ut_ad(!rec_get_deleted_flag(result_rec, comp));
5023
5024 /* At this point, the clustered index record is protected
5025 by a page latch that was acquired when pcur was positioned.
5026 The latch will not be released until mtr_commit(&mtr). */
5027
5028 if ((match_mode == ROW_SEL_EXACT
5029 || prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD)
5030 && prebuilt->select_lock_type == LOCK_NONE
5031 && !prebuilt->templ_contains_blob
5032 && !prebuilt->clust_index_was_generated
5033 && !prebuilt->used_in_HANDLER
5034 && !prebuilt->innodb_api
5035 && prebuilt->template_type
5036 != ROW_MYSQL_DUMMY_TEMPLATE
5037 && !prebuilt->in_fts_query) {
5038
5039 /* Inside an update, for example, we do not cache rows,
5040 since we may use the cursor position to do the actual
5041 update, that is why we require ...lock_type == LOCK_NONE.
5042 Since we keep space in prebuilt only for the BLOBs of
5043 a single row, we cannot cache rows in the case there
5044 are BLOBs in the fields to be fetched. In HANDLER we do
5045 not cache rows because there the cursor is a scrollable
5046 cursor. */
5047
5048 ut_a(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
5049
5050 /* We only convert from InnoDB row format to MySQL row
5051 format when ICP is disabled. */
5052
5053 if (!prebuilt->idx_cond) {
5054
5055 /* We use next_buf to track the allocation of buffers
5056 where we store and enqueue the buffers for our
5057 pre-fetch optimisation.
5058
5059 If next_buf == 0 then we store the converted record
5060 directly into the MySQL record buffer (buf). If it is
5061 != 0 then we allocate a pre-fetch buffer and store the
5062 converted record there.
5063
5064 If the conversion fails and the MySQL record buffer
5065 was not written to then we reset next_buf so that
5066 we can re-use the MySQL record buffer in the next
5067 iteration. */
5068
5069 next_buf = next_buf
5070 ? row_sel_fetch_last_buf(prebuilt) : buf;
5071
5072 if (!row_sel_store_mysql_rec(
5073 next_buf, prebuilt, result_rec,
5074 result_rec != rec,
5075 result_rec != rec ? clust_index : index,
5076 offsets, false)) {
5077
5078 if (next_buf == buf) {
5079 ut_a(prebuilt->n_fetch_cached == 0);
5080 next_buf = 0;
5081 }
5082
5083 /* Only fresh inserts may contain incomplete
5084 externally stored columns. Pretend that such
5085 records do not exist. Such records may only be
5086 accessed at the READ UNCOMMITTED isolation
5087 level or when rolling back a recovered
5088 transaction. Rollback happens at a lower
5089 level, not here. */
5090 goto next_rec;
5091 }
5092
5093 if (next_buf != buf) {
5094 row_sel_enqueue_cache_row_for_mysql(
5095 next_buf, prebuilt);
5096 }
5097 } else {
5098 row_sel_enqueue_cache_row_for_mysql(buf, prebuilt);
5099 }
5100
5101 if (prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE) {
5102 goto next_rec;
5103 }
5104
5105 } else {
5106 if (UNIV_UNLIKELY
5107 (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE)) {
5108 /* CHECK TABLE: fetch the row */
5109
5110 if (result_rec != rec
5111 && !prebuilt->need_to_access_clustered) {
5112 /* We used 'offsets' for the clust
5113 rec, recalculate them for 'rec' */
5114 offsets = rec_get_offsets(rec, index, offsets,
5115 ULINT_UNDEFINED,
5116 &heap);
5117 result_rec = rec;
5118 }
5119
5120 memcpy(buf + 4, result_rec
5121 - rec_offs_extra_size(offsets),
5122 rec_offs_size(offsets));
5123 mach_write_to_4(buf,
5124 rec_offs_extra_size(offsets) + 4);
5125 } else if (!prebuilt->idx_cond && !prebuilt->innodb_api) {
5126 /* The record was not yet converted to MySQL format. */
5127 if (!row_sel_store_mysql_rec(
5128 buf, prebuilt, result_rec,
5129 result_rec != rec,
5130 result_rec != rec ? clust_index : index,
5131 offsets, false)) {
5132 /* Only fresh inserts may contain
5133 incomplete externally stored
5134 columns. Pretend that such records do
5135 not exist. Such records may only be
5136 accessed at the READ UNCOMMITTED
5137 isolation level or when rolling back a
5138 recovered transaction. Rollback
5139 happens at a lower level, not here. */
5140 goto next_rec;
5141 }
5142 }
5143
5144 if (prebuilt->clust_index_was_generated) {
5145 row_sel_store_row_id_to_prebuilt(
5146 prebuilt, result_rec,
5147 result_rec == rec ? index : clust_index,
5148 offsets);
5149 }
5150 }
5151
5152 /* From this point on, 'offsets' are invalid. */
5153
5154 /* We have an optimization to save CPU time: if this is a consistent
5155 read on a unique condition on the clustered index, then we do not
5156 store the pcur position, because any fetch next or prev will anyway
5157 return 'end of file'. Exceptions are locking reads and the MySQL
5158 HANDLER command where the user can move the cursor with PREV or NEXT
5159 even after a unique search. */
5160
5161 err = DB_SUCCESS;
5162
5163 idx_cond_failed:
5164 if (!unique_search
5165 || !dict_index_is_clust(index)
5166 || direction != 0
5167 || prebuilt->select_lock_type != LOCK_NONE
5168 || prebuilt->used_in_HANDLER
5169 || prebuilt->innodb_api) {
5170
5171 /* Inside an update always store the cursor position */
5172
5173 btr_pcur_store_position(pcur, &mtr);
5174
5175 if (prebuilt->innodb_api
5176 && (btr_pcur_get_rec(pcur) != result_rec)) {
5177 ulint rec_size = rec_offs_size(offsets);
5178 if (!prebuilt->innodb_api_rec_size ||
5179 (prebuilt->innodb_api_rec_size < rec_size)) {
5180 prebuilt->innodb_api_buf =
5181 static_cast<byte*>
5182 (mem_heap_alloc(prebuilt->cursor_heap,rec_size));
5183 prebuilt->innodb_api_rec_size = rec_size;
5184 }
5185 prebuilt->innodb_api_rec =
5186 rec_copy(
5187 prebuilt->innodb_api_buf, result_rec, offsets);
5188 }
5189 }
5190
5191 goto normal_return;
5192
5193 next_rec:
5194 end_loop++;
5195
5196 /* Reset the old and new "did semi-consistent read" flags. */
5197 if (UNIV_UNLIKELY(prebuilt->row_read_type
5198 == ROW_READ_DID_SEMI_CONSISTENT)) {
5199 prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
5200 }
5201 did_semi_consistent_read = FALSE;
5202 prebuilt->new_rec_locks = 0;
5203
5204 /*-------------------------------------------------------------*/
5205 /* PHASE 5: Move the cursor to the next index record */
5206
5207 /* NOTE: For moves_up==FALSE, the mini-transaction will be
5208 committed and restarted every time when switching b-tree
5209 pages. For moves_up==TRUE in index condition pushdown, we can
5210 scan an entire secondary index tree within a single
5211 mini-transaction. As long as the prebuilt->idx_cond does not
5212 match, we do not need to consult the clustered index or
5213 return records to MySQL, and thus we can avoid repositioning
5214 the cursor. What prevents us from buffer-fixing all leaf pages
5215 within the mini-transaction is the btr_leaf_page_release()
5216 call in btr_pcur_move_to_next_page(). Only the leaf page where
5217 the cursor is positioned will remain buffer-fixed. */
5218
5219 if (UNIV_UNLIKELY(mtr_has_extra_clust_latch)) {
5220 /* We must commit mtr if we are moving to the next
5221 non-clustered index record, because we could break the
5222 latching order if we would access a different clustered
5223 index page right away without releasing the previous. */
5224
5225 btr_pcur_store_position(pcur, &mtr);
5226
5227 mtr_commit(&mtr);
5228 mtr_has_extra_clust_latch = FALSE;
5229
5230 mtr_start(&mtr);
5231 if (sel_restore_position_for_mysql(&same_user_rec,
5232 BTR_SEARCH_LEAF,
5233 pcur, moves_up, &mtr)) {
5234 #ifdef UNIV_SEARCH_DEBUG
5235 cnt++;
5236 #endif /* UNIV_SEARCH_DEBUG */
5237
5238 goto rec_loop;
5239 }
5240 }
5241
5242 if (moves_up) {
5243 if (UNIV_UNLIKELY(!btr_pcur_move_to_next(pcur, &mtr))) {
5244 not_moved:
5245 btr_pcur_store_position(pcur, &mtr);
5246
5247 if (match_mode != 0) {
5248 err = DB_RECORD_NOT_FOUND;
5249 } else {
5250 err = DB_END_OF_INDEX;
5251 }
5252
5253 goto normal_return;
5254 }
5255 } else {
5256 if (UNIV_UNLIKELY(!btr_pcur_move_to_prev(pcur, &mtr))) {
5257 goto not_moved;
5258 }
5259 }
5260
5261 #ifdef UNIV_SEARCH_DEBUG
5262 cnt++;
5263 #endif /* UNIV_SEARCH_DEBUG */
5264
5265 goto rec_loop;
5266
5267 lock_wait_or_error:
5268 /* Reset the old and new "did semi-consistent read" flags. */
5269 if (UNIV_UNLIKELY(prebuilt->row_read_type
5270 == ROW_READ_DID_SEMI_CONSISTENT)) {
5271 prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
5272 }
5273 did_semi_consistent_read = FALSE;
5274
5275 /*-------------------------------------------------------------*/
5276
5277 btr_pcur_store_position(pcur, &mtr);
5278
5279 lock_table_wait:
5280 mtr_commit(&mtr);
5281 mtr_has_extra_clust_latch = FALSE;
5282
5283 trx->error_state = err;
5284
5285 /* The following is a patch for MySQL */
5286
5287 que_thr_stop_for_mysql(thr);
5288
5289 thr->lock_state = QUE_THR_LOCK_ROW;
5290
5291 if (row_mysql_handle_errors(&err, trx, thr, NULL)) {
5292 /* It was a lock wait, and it ended */
5293
5294 thr->lock_state = QUE_THR_LOCK_NOLOCK;
5295 mtr_start(&mtr);
5296
5297 /* Table lock waited, go try to obtain table lock
5298 again */
5299 if (table_lock_waited) {
5300 table_lock_waited = FALSE;
5301
5302 goto wait_table_again;
5303 }
5304
5305 sel_restore_position_for_mysql(&same_user_rec,
5306 BTR_SEARCH_LEAF, pcur,
5307 moves_up, &mtr);
5308
5309 if ((srv_locks_unsafe_for_binlog
5310 || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
5311 && !same_user_rec) {
5312
5313 /* Since we were not able to restore the cursor
5314 on the same user record, we cannot use
5315 row_unlock_for_mysql() to unlock any records, and
5316 we must thus reset the new rec lock info. Since
5317 in lock0lock.cc we have blocked the inheriting of gap
5318 X-locks, we actually do not have any new record locks
5319 set in this case.
5320
5321 Note that if we were able to restore on the 'same'
5322 user record, it is still possible that we were actually
5323 waiting on a delete-marked record, and meanwhile
5324 it was removed by purge and inserted again by some
5325 other user. But that is no problem, because in
5326 rec_loop we will again try to set a lock, and
5327 new_rec_lock_info in trx will be right at the end. */
5328
5329 prebuilt->new_rec_locks = 0;
5330 }
5331
5332 mode = pcur->search_mode;
5333
5334 goto rec_loop;
5335 }
5336
5337 thr->lock_state = QUE_THR_LOCK_NOLOCK;
5338
5339 #ifdef UNIV_SEARCH_DEBUG
5340 /* fputs("Using ", stderr);
5341 dict_index_name_print(stderr, index);
5342 fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
5343 #endif /* UNIV_SEARCH_DEBUG */
5344 goto func_exit;
5345
5346 normal_return:
5347 /*-------------------------------------------------------------*/
5348 que_thr_stop_for_mysql_no_error(thr, trx);
5349
5350 mtr_commit(&mtr);
5351
5352 if (prebuilt->idx_cond != 0) {
5353
5354 /* When ICP is active we don't write to the MySQL buffer
5355 directly, only to buffers that are enqueued in the pre-fetch
5356 queue. We need to dequeue the first buffer and copy the contents
5357 to the record buffer that was passed in by MySQL. */
5358
5359 if (prebuilt->n_fetch_cached > 0) {
5360 row_sel_dequeue_cached_row_for_mysql(buf, prebuilt);
5361 err = DB_SUCCESS;
5362 }
5363
5364 } else if (next_buf != 0) {
5365
5366 /* We may or may not have enqueued some buffers to the
5367 pre-fetch queue, but we definitely wrote to the record
5368 buffer passed to use by MySQL. */
5369
5370 DEBUG_SYNC_C("row_search_cached_row");
5371 err = DB_SUCCESS;
5372 }
5373
5374 #ifdef UNIV_SEARCH_DEBUG
5375 /* fputs("Using ", stderr);
5376 dict_index_name_print(stderr, index);
5377 fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
5378 #endif /* UNIV_SEARCH_DEBUG */
5379
5380 func_exit:
5381 trx->op_info = "";
5382
5383 if (end_range_cache != NULL) {
5384 ut_free(end_range_cache);
5385 }
5386
5387 if (UNIV_LIKELY_NULL(heap)) {
5388 mem_heap_free(heap);
5389 }
5390
5391 /* Set or reset the "did semi-consistent read" flag on return.
5392 The flag did_semi_consistent_read is set if and only if
5393 the record being returned was fetched with a semi-consistent read. */
5394 ut_ad(prebuilt->row_read_type != ROW_READ_WITH_LOCKS
5395 || !did_semi_consistent_read);
5396
5397 if (UNIV_UNLIKELY(prebuilt->row_read_type != ROW_READ_WITH_LOCKS)) {
5398 if (UNIV_UNLIKELY(did_semi_consistent_read)) {
5399 prebuilt->row_read_type = ROW_READ_DID_SEMI_CONSISTENT;
5400 } else {
5401 prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
5402 }
5403 }
5404
5405 #ifdef UNIV_SYNC_DEBUG
5406 ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
5407 #endif /* UNIV_SYNC_DEBUG */
5408
5409 DEBUG_SYNC_C("innodb_row_search_for_mysql_exit");
5410
5411 return(err);
5412 }
5413
5414 /*******************************************************************//**
5415 Checks if MySQL at the moment is allowed for this table to retrieve a
5416 consistent read result, or store it to the query cache.
5417 @return TRUE if storing or retrieving from the query cache is permitted */
5418 UNIV_INTERN
5419 ibool
row_search_check_if_query_cache_permitted(trx_t * trx,const char * norm_name)5420 row_search_check_if_query_cache_permitted(
5421 /*======================================*/
5422 trx_t* trx, /*!< in: transaction object */
5423 const char* norm_name) /*!< in: concatenation of database name,
5424 '/' char, table name */
5425 {
5426 dict_table_t* table;
5427 ibool ret = FALSE;
5428
5429 /* Disable query cache altogether for all tables if recovered XA
5430 transactions in prepared state exist. This is because we do not
5431 restore the table locks for those transactions and we may wrongly
5432 set ret=TRUE above if "lock_table_get_n_locks(table) == 0". See
5433 "Bug#14658648 XA ROLLBACK (DISTRIBUTED DATABASE) NOT WORKING WITH
5434 QUERY CACHE ENABLED".
5435 Read trx_sys->n_prepared_recovered_trx without mutex protection,
5436 not possible to end up with a torn read since n_prepared_recovered_trx
5437 is word size. */
5438 if (trx_sys->n_prepared_recovered_trx > 0) {
5439
5440 return(FALSE);
5441 }
5442
5443 table = dict_table_open_on_name(norm_name, FALSE, FALSE,
5444 DICT_ERR_IGNORE_NONE);
5445
5446 if (table == NULL) {
5447
5448 return(FALSE);
5449 }
5450
5451 /* Start the transaction if it is not started yet */
5452
5453 trx_start_if_not_started(trx);
5454
5455 /* If there are locks on the table or some trx has invalidated the
5456 cache up to our trx id, then ret = FALSE.
5457 We do not check what type locks there are on the table, though only
5458 IX type locks actually would require ret = FALSE. */
5459
5460 if (lock_table_get_n_locks(table) == 0
5461 && trx->id >= table->query_cache_inv_trx_id) {
5462
5463 ret = TRUE;
5464
5465 /* If the isolation level is high, assign a read view for the
5466 transaction if it does not yet have one */
5467
5468 if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ
5469 && !trx->read_view) {
5470
5471 trx->read_view = read_view_open_now(
5472 trx->id, trx->global_read_view_heap);
5473
5474 trx->global_read_view = trx->read_view;
5475 }
5476 }
5477
5478 dict_table_close(table, FALSE, FALSE);
5479
5480 return(ret);
5481 }
5482
5483 /*******************************************************************//**
5484 Read the AUTOINC column from the current row. If the value is less than
5485 0 and the type is not unsigned then we reset the value to 0.
5486 @return value read from the column */
5487 static
5488 ib_uint64_t
row_search_autoinc_read_column(dict_index_t * index,const rec_t * rec,ulint col_no,ulint mtype,ibool unsigned_type)5489 row_search_autoinc_read_column(
5490 /*===========================*/
5491 dict_index_t* index, /*!< in: index to read from */
5492 const rec_t* rec, /*!< in: current rec */
5493 ulint col_no, /*!< in: column number */
5494 ulint mtype, /*!< in: column main type */
5495 ibool unsigned_type) /*!< in: signed or unsigned flag */
5496 {
5497 ulint len;
5498 const byte* data;
5499 ib_uint64_t value;
5500 mem_heap_t* heap = NULL;
5501 ulint offsets_[REC_OFFS_NORMAL_SIZE];
5502 ulint* offsets = offsets_;
5503
5504 rec_offs_init(offsets_);
5505
5506 offsets = rec_get_offsets(rec, index, offsets, col_no + 1, &heap);
5507
5508 if (rec_offs_nth_sql_null(offsets, col_no)) {
5509 /* There is no non-NULL value in the auto-increment column. */
5510 value = 0;
5511 goto func_exit;
5512 }
5513
5514 data = rec_get_nth_field(rec, offsets, col_no, &len);
5515
5516 switch (mtype) {
5517 case DATA_INT:
5518 ut_a(len <= sizeof value);
5519 value = mach_read_int_type(data, len, unsigned_type);
5520 break;
5521
5522 case DATA_FLOAT:
5523 ut_a(len == sizeof(float));
5524 value = (ib_uint64_t) mach_float_read(data);
5525 break;
5526
5527 case DATA_DOUBLE:
5528 ut_a(len == sizeof(double));
5529 value = (ib_uint64_t) mach_double_read(data);
5530 break;
5531
5532 default:
5533 ut_error;
5534 }
5535
5536 if (!unsigned_type && (ib_int64_t) value < 0) {
5537 value = 0;
5538 }
5539
5540 func_exit:
5541 if (UNIV_LIKELY_NULL(heap)) {
5542 mem_heap_free(heap);
5543 }
5544
5545 return(value);
5546 }
5547
5548 /** Get the maximum and non-delete-marked record in an index.
5549 @param[in] index index tree
5550 @param[in,out] mtr mini-transaction (may be committed and restarted)
5551 @return maximum record, page s-latched in mtr
5552 @retval NULL if there are no records, or if all of them are delete-marked */
5553 static
5554 const rec_t*
row_search_get_max_rec(dict_index_t * index,mtr_t * mtr)5555 row_search_get_max_rec(
5556 dict_index_t* index,
5557 mtr_t* mtr)
5558 {
5559 btr_pcur_t pcur;
5560 const rec_t* rec;
5561 /* Open at the high/right end (false), and init cursor */
5562 btr_pcur_open_at_index_side(
5563 false, index, BTR_SEARCH_LEAF, &pcur, true, 0, mtr);
5564
5565 do {
5566 const page_t* page;
5567
5568 page = btr_pcur_get_page(&pcur);
5569 rec = page_find_rec_max_not_deleted(page);
5570
5571 if (page_rec_is_user_rec(rec)) {
5572 break;
5573 } else {
5574 rec = NULL;
5575 }
5576 btr_pcur_move_before_first_on_page(&pcur);
5577 } while (btr_pcur_move_to_prev(&pcur, mtr));
5578
5579 btr_pcur_close(&pcur);
5580
5581 return(rec);
5582 }
5583
5584 /*******************************************************************//**
5585 Read the max AUTOINC value from an index.
5586 @return DB_SUCCESS if all OK else error code, DB_RECORD_NOT_FOUND if
5587 column name can't be found in index */
5588 UNIV_INTERN
5589 dberr_t
row_search_max_autoinc(dict_index_t * index,const char * col_name,ib_uint64_t * value)5590 row_search_max_autoinc(
5591 /*===================*/
5592 dict_index_t* index, /*!< in: index to search */
5593 const char* col_name, /*!< in: name of autoinc column */
5594 ib_uint64_t* value) /*!< out: AUTOINC value read */
5595 {
5596 dict_field_t* dfield = dict_index_get_nth_field(index, 0);
5597 dberr_t error = DB_SUCCESS;
5598 *value = 0;
5599
5600 if (strcmp(col_name, dfield->name) != 0) {
5601 error = DB_RECORD_NOT_FOUND;
5602 } else {
5603 mtr_t mtr;
5604 const rec_t* rec;
5605
5606 mtr_start(&mtr);
5607
5608 rec = row_search_get_max_rec(index, &mtr);
5609
5610 if (rec != NULL) {
5611 ibool unsigned_type = (
5612 dfield->col->prtype & DATA_UNSIGNED);
5613
5614 *value = row_search_autoinc_read_column(
5615 index, rec, 0,
5616 dfield->col->mtype, unsigned_type);
5617 }
5618
5619 mtr_commit(&mtr);
5620 }
5621
5622 return(error);
5623 }
5624