1 /*****************************************************************************
2 
3 Copyright (c) 1997, 2020, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2008, Google Inc.
5 
6 Portions of this file contain modifications contributed and copyrighted by
7 Google, Inc. Those modifications are gratefully acknowledged and are described
8 briefly in the InnoDB documentation. The contributions by Google are
9 incorporated with their permission, and subject to the conditions contained in
10 the file COPYING.Google.
11 
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License, version 2.0,
14 as published by the Free Software Foundation.
15 
16 This program is also distributed with certain software (including
17 but not limited to OpenSSL) that is licensed under separate terms,
18 as designated in a particular file or component or in included license
19 documentation.  The authors of MySQL hereby grant you an additional
20 permission to link the program and your derivative works with the
21 separately licensed software that they have included with MySQL.
22 
23 This program is distributed in the hope that it will be useful,
24 but WITHOUT ANY WARRANTY; without even the implied warranty of
25 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26 GNU General Public License, version 2.0, for more details.
27 
28 You should have received a copy of the GNU General Public License along with
29 this program; if not, write to the Free Software Foundation, Inc.,
30 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
31 
32 *****************************************************************************/
33 
34 /***************************************************//**
35 @file row/row0sel.cc
36 Select
37 
38 Created 12/19/1997 Heikki Tuuri
39 *******************************************************/
40 
41 #include "row0sel.h"
42 
43 #ifdef UNIV_NONINL
44 #include "row0sel.ic"
45 #endif
46 
47 #include "dict0dict.h"
48 #include "dict0boot.h"
49 #include "trx0undo.h"
50 #include "trx0trx.h"
51 #include "btr0btr.h"
52 #include "btr0cur.h"
53 #include "btr0sea.h"
54 #include "mach0data.h"
55 #include "que0que.h"
56 #include "row0upd.h"
57 #include "row0row.h"
58 #include "row0vers.h"
59 #include "rem0cmp.h"
60 #include "lock0lock.h"
61 #include "eval0eval.h"
62 #include "pars0sym.h"
63 #include "pars0pars.h"
64 #include "row0mysql.h"
65 #include "read0read.h"
66 #include "buf0lru.h"
67 #include "ha_prototypes.h"
68 #include "m_string.h" /* for my_sys.h */
69 #include "my_sys.h" /* DEBUG_SYNC_C */
70 
71 #include "my_compare.h" /* enum icp_result */
72 #include "thr_lock.h"
73 #include "handler.h"
74 #include "ha_innodb.h"
75 
76 /* Maximum number of rows to prefetch; MySQL interface has another parameter */
77 #define SEL_MAX_N_PREFETCH	16
78 
79 /* Number of rows fetched, after which to start prefetching; MySQL interface
80 has another parameter */
81 #define SEL_PREFETCH_LIMIT	1
82 
83 /* When a select has accessed about this many pages, it returns control back
84 to que_run_threads: this is to allow canceling runaway queries */
85 
86 #define SEL_COST_LIMIT	100
87 
88 /* Flags for search shortcut */
89 #define SEL_FOUND	0
90 #define	SEL_EXHAUSTED	1
91 #define SEL_RETRY	2
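/* These flags are returned by row_sel_try_search_shortcut(): SEL_FOUND
means a qualifying row was found, SEL_EXHAUSTED that the unique search
cannot match any row, and SEL_RETRY that the shortcut could not decide
and the caller must fall back to the ordinary cursor-based search. */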
92 
93 /********************************************************************//**
94 Returns TRUE if the user-defined column in a secondary index record
95 is alphabetically the same as the corresponding BLOB column in the clustered
96 index record.
97 NOTE: the comparison is NOT done as a binary comparison, but character
98 fields are compared with collation!
99 @return	TRUE if the columns are equal */
100 static
101 ibool
102 row_sel_sec_rec_is_for_blob(
103 /*========================*/
104 	ulint		mtype,		/*!< in: main type */
105 	ulint		prtype,		/*!< in: precise type */
106 	ulint		mbminmaxlen,	/*!< in: minimum and maximum length of
107 					a multi-byte character */
108 	const byte*	clust_field,	/*!< in: the locally stored part of
109 					the clustered index column, including
110 					the BLOB pointer; the clustered
111 					index record must be covered by
112 					a lock or a page latch to protect it
113 					against deletion (rollback or purge) */
114 	ulint		clust_len,	/*!< in: length of clust_field */
115 	const byte*	sec_field,	/*!< in: column in secondary index */
116 	ulint		sec_len,	/*!< in: length of sec_field */
117 	ulint		prefix_len,	/*!< in: index column prefix length
118 					in bytes */
119 	dict_table_t*	table)		/*!< in: table */
120 {
121 	ulint	len;
122 	byte	buf[REC_VERSION_56_MAX_INDEX_COL_LEN];
123 	ulint	zip_size = dict_tf_get_zip_size(table->flags);
124 
125 	/* This function should never be invoked on an Antelope format
126 	table, because they should always contain enough prefix in the
127 	clustered index record. */
128 	ut_ad(dict_table_get_format(table) >= UNIV_FORMAT_B);
129 	ut_a(clust_len >= BTR_EXTERN_FIELD_REF_SIZE);
130 	ut_ad(prefix_len >= sec_len);
131 	ut_ad(prefix_len > 0);
132 	ut_a(prefix_len <= sizeof buf);
133 
134 	if (UNIV_UNLIKELY
135 	    (!memcmp(clust_field + clust_len - BTR_EXTERN_FIELD_REF_SIZE,
136 		     field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) {
137 		/* The externally stored field was not written yet.
138 		This record should only be seen by
139 		recv_recovery_rollback_active() or any
140 		TRX_ISO_READ_UNCOMMITTED transactions. */
141 		return(FALSE);
142 	}
143 
144 	len = btr_copy_externally_stored_field_prefix(buf, prefix_len,
145 						      zip_size,
146 						      clust_field, clust_len);
147 
148 	if (UNIV_UNLIKELY(len == 0)) {
149 		/* The BLOB was being deleted as the server crashed.
150 		There should not be any secondary index records
151 		referring to this clustered index record, because
152 		btr_free_externally_stored_field() is called after all
153 		secondary index entries of the row have been purged. */
154 		return(FALSE);
155 	}
156 
157 	len = dtype_get_at_most_n_mbchars(prtype, mbminmaxlen,
158 					  prefix_len, len, (const char*) buf);
159 
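	/* cmp_data_data() returns 0 when the values compare equal under the
	column's collation, so negating it yields TRUE exactly when the
	secondary index field matches the BLOB prefix copied above. */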
160 	return(!cmp_data_data(mtype, prtype, buf, len, sec_field, sec_len));
161 }
162 
163 /********************************************************************//**
164 Returns TRUE if the user-defined column values in a secondary index record
165 are alphabetically the same as the corresponding columns in the clustered
166 index record.
167 NOTE: the comparison is NOT done as a binary comparison, but character
168 fields are compared with collation!
169 @return TRUE if the secondary record is equal to the corresponding
170 fields in the clustered record, when compared with collation;
171 FALSE if not equal or if the clustered record has been marked for deletion */
172 static
173 ibool
174 row_sel_sec_rec_is_for_clust_rec(
175 /*=============================*/
176 	const rec_t*	sec_rec,	/*!< in: secondary index record */
177 	dict_index_t*	sec_index,	/*!< in: secondary index */
178 	const rec_t*	clust_rec,	/*!< in: clustered index record;
179 					must be protected by a lock or
180 					a page latch against deletion
181 					in rollback or purge */
182 	dict_index_t*	clust_index)	/*!< in: clustered index */
183 {
184 	const byte*	sec_field;
185 	ulint		sec_len;
186 	const byte*	clust_field;
187 	ulint		n;
188 	ulint		i;
189 	mem_heap_t*	heap		= NULL;
190 	ulint		clust_offsets_[REC_OFFS_NORMAL_SIZE];
191 	ulint		sec_offsets_[REC_OFFS_SMALL_SIZE];
192 	ulint*		clust_offs	= clust_offsets_;
193 	ulint*		sec_offs	= sec_offsets_;
194 	ibool		is_equal	= TRUE;
195 
196 	rec_offs_init(clust_offsets_);
197 	rec_offs_init(sec_offsets_);
198 
199 	if (rec_get_deleted_flag(clust_rec,
200 				 dict_table_is_comp(clust_index->table))) {
201 
202 		/* The clustered index record is delete-marked;
203 		it is not visible in the read view.  Besides,
204 		if there are any externally stored columns,
205 		some of them may have already been purged. */
206 		return(FALSE);
207 	}
208 
209 	clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs,
210 				     ULINT_UNDEFINED, &heap);
211 	sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs,
212 				   ULINT_UNDEFINED, &heap);
213 
214 	n = dict_index_get_n_ordering_defined_by_user(sec_index);
215 
216 	for (i = 0; i < n; i++) {
217 		const dict_field_t*	ifield;
218 		const dict_col_t*	col;
219 		ulint			clust_pos;
220 		ulint			clust_len;
221 		ulint			len;
222 
223 		ifield = dict_index_get_nth_field(sec_index, i);
224 		col = dict_field_get_col(ifield);
225 		clust_pos = dict_col_get_clust_pos(col, clust_index);
226 
227 		clust_field = rec_get_nth_field(
228 			clust_rec, clust_offs, clust_pos, &clust_len);
229 		sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len);
230 
231 		len = clust_len;
232 
233 		if (ifield->prefix_len > 0 && len != UNIV_SQL_NULL
234 		    && sec_len != UNIV_SQL_NULL) {
235 
236 			if (rec_offs_nth_extern(clust_offs, clust_pos)) {
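				/* The locally stored part of an externally
				stored column ends with a BLOB pointer of
				BTR_EXTERN_FIELD_REF_SIZE bytes, which is not
				column data: exclude it from the prefix. */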
237 				len -= BTR_EXTERN_FIELD_REF_SIZE;
238 			}
239 
240 			len = dtype_get_at_most_n_mbchars(
241 				col->prtype, col->mbminmaxlen,
242 				ifield->prefix_len, len, (char*) clust_field);
243 
244 			if (rec_offs_nth_extern(clust_offs, clust_pos)
245 			    && len < sec_len) {
246 				if (!row_sel_sec_rec_is_for_blob(
247 					    col->mtype, col->prtype,
248 					    col->mbminmaxlen,
249 					    clust_field, clust_len,
250 					    sec_field, sec_len,
251 					    ifield->prefix_len,
252 					    clust_index->table)) {
253 					goto inequal;
254 				}
255 
256 				continue;
257 			}
258 		}
259 
260 		if (0 != cmp_data_data(col->mtype, col->prtype,
261 				       clust_field, len,
262 				       sec_field, sec_len)) {
263 inequal:
264 			is_equal = FALSE;
265 			goto func_exit;
266 		}
267 	}
268 
269 func_exit:
270 	if (UNIV_LIKELY_NULL(heap)) {
271 		mem_heap_free(heap);
272 	}
273 	return(is_equal);
274 }
275 
276 /*********************************************************************//**
277 Creates a select node struct.
278 @return	own: select node struct */
279 UNIV_INTERN
280 sel_node_t*
281 sel_node_create(
282 /*============*/
283 	mem_heap_t*	heap)	/*!< in: memory heap where created */
284 {
285 	sel_node_t*	node;
286 
287 	node = static_cast<sel_node_t*>(
288 		mem_heap_alloc(heap, sizeof(sel_node_t)));
289 
290 	node->common.type = QUE_NODE_SELECT;
291 	node->state = SEL_NODE_OPEN;
292 
293 	node->plans = NULL;
294 
295 	return(node);
296 }
297 
298 /*********************************************************************//**
299 Frees the memory private to a select node when a query graph is freed,
300 does not free the heap where the node was originally created. */
301 UNIV_INTERN
302 void
303 sel_node_free_private(
304 /*==================*/
305 	sel_node_t*	node)	/*!< in: select node struct */
306 {
307 	ulint	i;
308 	plan_t*	plan;
309 
310 	if (node->plans != NULL) {
311 		for (i = 0; i < node->n_tables; i++) {
312 			plan = sel_node_get_nth_plan(node, i);
313 
314 			btr_pcur_close(&(plan->pcur));
315 			btr_pcur_close(&(plan->clust_pcur));
316 
317 			if (plan->old_vers_heap) {
318 				mem_heap_free(plan->old_vers_heap);
319 			}
320 		}
321 	}
322 }
323 
324 /*********************************************************************//**
325 Evaluates the values in a select list. If there are aggregate functions,
326 their argument value is added to the aggregate total. */
327 UNIV_INLINE
328 void
329 sel_eval_select_list(
330 /*=================*/
331 	sel_node_t*	node)	/*!< in: select node */
332 {
333 	que_node_t*	exp;
334 
335 	exp = node->select_list;
336 
337 	while (exp) {
338 		eval_exp(exp);
339 
340 		exp = que_node_get_next(exp);
341 	}
342 }
343 
344 /*********************************************************************//**
345 Assigns the values in the select list to the possible into-variables in
346 SELECT ... INTO ... */
347 UNIV_INLINE
348 void
349 sel_assign_into_var_values(
350 /*=======================*/
351 	sym_node_t*	var,	/*!< in: first variable in a list of
352 				variables */
353 	sel_node_t*	node)	/*!< in: select node */
354 {
355 	que_node_t*	exp;
356 
357 	if (var == NULL) {
358 
359 		return;
360 	}
361 
362 	for (exp = node->select_list;
363 	     var != 0;
364 	     var = static_cast<sym_node_t*>(que_node_get_next(var))) {
365 
366 		ut_ad(exp);
367 
368 		eval_node_copy_val(var->alias, exp);
369 
370 		exp = que_node_get_next(exp);
371 	}
372 }
373 
374 /*********************************************************************//**
375 Resets the aggregate value totals in the select list of an aggregate type
376 query. */
377 UNIV_INLINE
378 void
379 sel_reset_aggregate_vals(
380 /*=====================*/
381 	sel_node_t*	node)	/*!< in: select node */
382 {
383 	func_node_t*	func_node;
384 
385 	ut_ad(node->is_aggregate);
386 
387 	for (func_node = static_cast<func_node_t*>(node->select_list);
388 	     func_node != 0;
389 	     func_node = static_cast<func_node_t*>(
390 		     	que_node_get_next(func_node))) {
391 
392 		eval_node_set_int_val(func_node, 0);
393 	}
394 
395 	node->aggregate_already_fetched = FALSE;
396 }
397 
398 /*********************************************************************//**
399 Copies the input variable values when an explicit cursor is opened. */
400 UNIV_INLINE
401 void
402 row_sel_copy_input_variable_vals(
403 /*=============================*/
404 	sel_node_t*	node)	/*!< in: select node */
405 {
406 	sym_node_t*	var;
407 
408 	var = UT_LIST_GET_FIRST(node->copy_variables);
409 
410 	while (var) {
411 		eval_node_copy_val(var, var->alias);
412 
413 		var->indirection = NULL;
414 
415 		var = UT_LIST_GET_NEXT(col_var_list, var);
416 	}
417 }
418 
419 /*********************************************************************//**
420 Fetches the column values from a record. */
421 static
422 void
423 row_sel_fetch_columns(
424 /*==================*/
425 	dict_index_t*	index,	/*!< in: record index */
426 	const rec_t*	rec,	/*!< in: record in a clustered or non-clustered
427 				index; must be protected by a page latch */
428 	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
429 	sym_node_t*	column)	/*!< in: first column in a column list, or
430 				NULL */
431 {
432 	dfield_t*	val;
433 	ulint		index_type;
434 	ulint		field_no;
435 	const byte*	data;
436 	ulint		len;
437 
438 	ut_ad(rec_offs_validate(rec, index, offsets));
439 
440 	if (dict_index_is_clust(index)) {
441 		index_type = SYM_CLUST_FIELD_NO;
442 	} else {
443 		index_type = SYM_SEC_FIELD_NO;
444 	}
445 
446 	while (column) {
447 		mem_heap_t*	heap = NULL;
448 		ibool		needs_copy;
449 
450 		field_no = column->field_nos[index_type];
451 
452 		if (field_no != ULINT_UNDEFINED) {
453 
454 			if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets,
455 							      field_no))) {
456 
457 				/* Copy an externally stored field to the
458 				temporary heap, if possible. */
459 
460 				heap = mem_heap_create(1);
461 
462 				data = btr_rec_copy_externally_stored_field(
463 					rec, offsets,
464 					dict_table_zip_size(index->table),
465 					field_no, &len, heap);
466 
467 				/* data == NULL means that the
468 				externally stored field was not
469 				written yet. This record
470 				should only be seen by
471 				recv_recovery_rollback_active() or any
472 				TRX_ISO_READ_UNCOMMITTED
473 				transactions. The InnoDB SQL parser
474 				(the sole caller of this function)
475 				does not implement READ UNCOMMITTED,
476 				and it is not involved during rollback. */
477 				ut_a(data);
478 				ut_a(len != UNIV_SQL_NULL);
479 
480 				needs_copy = TRUE;
481 			} else {
482 				data = rec_get_nth_field(rec, offsets,
483 							 field_no, &len);
484 
485 				needs_copy = column->copy_val;
486 			}
487 
488 			if (needs_copy) {
489 				eval_node_copy_and_alloc_val(column, data,
490 							     len);
491 			} else {
492 				val = que_node_get_val(column);
493 				dfield_set_data(val, data, len);
494 			}
495 
496 			if (UNIV_LIKELY_NULL(heap)) {
497 				mem_heap_free(heap);
498 			}
499 		}
500 
501 		column = UT_LIST_GET_NEXT(col_var_list, column);
502 	}
503 }
504 
505 /*********************************************************************//**
506 Allocates a prefetch buffer for a column when prefetch is first time done. */
507 static
508 void
509 sel_col_prefetch_buf_alloc(
510 /*=======================*/
511 	sym_node_t*	column)	/*!< in: symbol table node for a column */
512 {
513 	sel_buf_t*	sel_buf;
514 	ulint		i;
515 
516 	ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL);
517 
518 	column->prefetch_buf = static_cast<sel_buf_t*>(
519 		mem_alloc(SEL_MAX_N_PREFETCH * sizeof(sel_buf_t)));
520 
521 	for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
522 		sel_buf = column->prefetch_buf + i;
523 
524 		sel_buf->data = NULL;
525 		sel_buf->len = 0;
526 		sel_buf->val_buf_size = 0;
527 	}
528 }
529 
530 /*********************************************************************//**
531 Frees a prefetch buffer for a column, including the dynamically allocated
532 memory for data stored there. */
533 UNIV_INTERN
534 void
535 sel_col_prefetch_buf_free(
536 /*======================*/
537 	sel_buf_t*	prefetch_buf)	/*!< in, own: prefetch buffer */
538 {
539 	sel_buf_t*	sel_buf;
540 	ulint		i;
541 
542 	for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
543 		sel_buf = prefetch_buf + i;
544 
545 		if (sel_buf->val_buf_size > 0) {
546 
547 			mem_free(sel_buf->data);
548 		}
549 	}
550 
551 	mem_free(prefetch_buf);
552 }
553 
554 /*********************************************************************//**
555 Pops the column values for a prefetched, cached row from the column prefetch
556 buffers and places them to the val fields in the column nodes. */
557 static
558 void
559 sel_dequeue_prefetched_row(
560 /*=======================*/
561 	plan_t*	plan)	/*!< in: plan node for a table */
562 {
563 	sym_node_t*	column;
564 	sel_buf_t*	sel_buf;
565 	dfield_t*	val;
566 	byte*		data;
567 	ulint		len;
568 	ulint		val_buf_size;
569 
570 	ut_ad(plan->n_rows_prefetched > 0);
571 
572 	column = UT_LIST_GET_FIRST(plan->columns);
573 
574 	while (column) {
575 		val = que_node_get_val(column);
576 
577 		if (!column->copy_val) {
578 			/* We did not really push any value for the
579 			column */
580 
581 			ut_ad(!column->prefetch_buf);
582 			ut_ad(que_node_get_val_buf_size(column) == 0);
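			/* In debug builds, poison the value so that an
			accidental read of a column that was never pushed
			is easier to catch. */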
583 			ut_d(dfield_set_null(val));
584 
585 			goto next_col;
586 		}
587 
588 		ut_ad(column->prefetch_buf);
589 		ut_ad(!dfield_is_ext(val));
590 
591 		sel_buf = column->prefetch_buf + plan->first_prefetched;
592 
593 		data = sel_buf->data;
594 		len = sel_buf->len;
595 		val_buf_size = sel_buf->val_buf_size;
596 
597 		/* We must keep track of the allocated memory for
598 		column values to be able to free it later: therefore
599 		we swap the values for sel_buf and val */
600 
601 		sel_buf->data = static_cast<byte*>(dfield_get_data(val));
602 		sel_buf->len = dfield_get_len(val);
603 		sel_buf->val_buf_size = que_node_get_val_buf_size(column);
604 
605 		dfield_set_data(val, data, len);
606 		que_node_set_val_buf_size(column, val_buf_size);
607 next_col:
608 		column = UT_LIST_GET_NEXT(col_var_list, column);
609 	}
610 
611 	plan->n_rows_prefetched--;
612 
613 	plan->first_prefetched++;
614 }
615 
616 /*********************************************************************//**
617 Pushes the column values for a prefetched, cached row to the column prefetch
618 buffers from the val fields in the column nodes. */
619 UNIV_INLINE
620 void
621 sel_enqueue_prefetched_row(
622 /*=======================*/
623 	plan_t*	plan)	/*!< in: plan node for a table */
624 {
625 	sym_node_t*	column;
626 	sel_buf_t*	sel_buf;
627 	dfield_t*	val;
628 	byte*		data;
629 	ulint		len;
630 	ulint		pos;
631 	ulint		val_buf_size;
632 
633 	if (plan->n_rows_prefetched == 0) {
634 		pos = 0;
635 		plan->first_prefetched = 0;
636 	} else {
637 		pos = plan->n_rows_prefetched;
638 
639 		/* We have the convention that pushing new rows starts only
640 		after the prefetch stack has been emptied: */
641 
642 		ut_ad(plan->first_prefetched == 0);
643 	}
644 
645 	plan->n_rows_prefetched++;
646 
647 	ut_ad(pos < SEL_MAX_N_PREFETCH);
648 
649 	for (column = UT_LIST_GET_FIRST(plan->columns);
650 	     column != 0;
651 	     column = UT_LIST_GET_NEXT(col_var_list, column)) {
652 
653 		if (!column->copy_val) {
654 			/* It makes no sense to push pointers to database
655 			page fields when we do not keep a latch on the page! */
656 			continue;
657 		}
658 
659 		if (!column->prefetch_buf) {
660 			/* Allocate a new prefetch buffer */
661 
662 			sel_col_prefetch_buf_alloc(column);
663 		}
664 
665 		sel_buf = column->prefetch_buf + pos;
666 
667 		val = que_node_get_val(column);
668 
669 		data = static_cast<byte*>(dfield_get_data(val));
670 		len = dfield_get_len(val);
671 		val_buf_size = que_node_get_val_buf_size(column);
672 
673 		/* We must keep track of the allocated memory for
674 		column values to be able to free it later: therefore
675 		we swap the values for sel_buf and val */
676 
677 		dfield_set_data(val, sel_buf->data, sel_buf->len);
678 		que_node_set_val_buf_size(column, sel_buf->val_buf_size);
679 
680 		sel_buf->data = data;
681 		sel_buf->len = len;
682 		sel_buf->val_buf_size = val_buf_size;
683 	}
684 }
685 
686 /*********************************************************************//**
687 Builds a previous version of a clustered index record for a consistent read
688 @return	DB_SUCCESS or error code */
689 static MY_ATTRIBUTE((nonnull, warn_unused_result))
690 dberr_t
691 row_sel_build_prev_vers(
692 /*====================*/
693 	read_view_t*	read_view,	/*!< in: read view */
694 	dict_index_t*	index,		/*!< in: clustered index of rec */
695 	rec_t*		rec,		/*!< in: record in a clustered index */
696 	ulint**		offsets,	/*!< in/out: offsets returned by
697 					rec_get_offsets(rec, plan->index) */
698 	mem_heap_t**	offset_heap,	/*!< in/out: memory heap from which
699 					the offsets are allocated */
700 	mem_heap_t**    old_vers_heap,  /*!< out: old version heap to use */
701 	rec_t**		old_vers,	/*!< out: old version, or NULL if the
702 					record does not exist in the view:
703 					i.e., it was freshly inserted
704 					afterwards */
705 	mtr_t*		mtr)		/*!< in: mtr */
706 {
707 	dberr_t	err;
708 
709 	if (*old_vers_heap) {
710 		mem_heap_empty(*old_vers_heap);
711 	} else {
712 		*old_vers_heap = mem_heap_create(512);
713 	}
714 
715 	err = row_vers_build_for_consistent_read(
716 		rec, mtr, index, offsets, read_view, offset_heap,
717 		*old_vers_heap, old_vers);
718 	return(err);
719 }
720 
721 /*********************************************************************//**
722 Builds the last committed version of a clustered index record for a
723 semi-consistent read. */
724 static MY_ATTRIBUTE((nonnull))
725 void
726 row_sel_build_committed_vers_for_mysql(
727 /*===================================*/
728 	dict_index_t*	clust_index,	/*!< in: clustered index */
729 	row_prebuilt_t*	prebuilt,	/*!< in: prebuilt struct */
730 	const rec_t*	rec,		/*!< in: record in a clustered index */
731 	ulint**		offsets,	/*!< in/out: offsets returned by
732 					rec_get_offsets(rec, clust_index) */
733 	mem_heap_t**	offset_heap,	/*!< in/out: memory heap from which
734 					the offsets are allocated */
735 	const rec_t**	old_vers,	/*!< out: old version, or NULL if the
736 					record does not exist in the view:
737 					i.e., it was freshly inserted
738 					afterwards */
739 	mtr_t*		mtr)		/*!< in: mtr */
740 {
741 	if (prebuilt->old_vers_heap) {
742 		mem_heap_empty(prebuilt->old_vers_heap);
743 	} else {
744 		prebuilt->old_vers_heap = mem_heap_create(
745 			rec_offs_size(*offsets));
746 	}
747 
748 	row_vers_build_for_semi_consistent_read(
749 		rec, mtr, clust_index, offsets, offset_heap,
750 		prebuilt->old_vers_heap, old_vers);
751 }
752 
753 /*********************************************************************//**
754 Tests the conditions which determine when the index segment we are searching
755 through has been exhausted.
756 @return	TRUE if row passed the tests */
757 UNIV_INLINE
758 ibool
759 row_sel_test_end_conds(
760 /*===================*/
761 	plan_t*	plan)	/*!< in: plan for the table; the column values must
762 			already have been retrieved and the right sides of
763 			comparisons evaluated */
764 {
765 	func_node_t*	cond;
766 
767 	/* All conditions in end_conds are comparisons of a column to an
768 	expression */
769 
770 	for (cond = UT_LIST_GET_FIRST(plan->end_conds);
771 	     cond != 0;
772 	     cond = UT_LIST_GET_NEXT(cond_list, cond)) {
773 
774 		/* Evaluate the left side of the comparison, i.e., get the
775 		column value if there is an indirection */
776 
777 		eval_sym(static_cast<sym_node_t*>(cond->args));
778 
779 		/* Do the comparison */
780 
781 		if (!eval_cmp(cond)) {
782 
783 			return(FALSE);
784 		}
785 	}
786 
787 	return(TRUE);
788 }
789 
790 /*********************************************************************//**
791 Tests the other conditions.
792 @return	TRUE if row passed the tests */
793 UNIV_INLINE
794 ibool
795 row_sel_test_other_conds(
796 /*=====================*/
797 	plan_t*	plan)	/*!< in: plan for the table; the column values must
798 			already have been retrieved */
799 {
800 	func_node_t*	cond;
801 
802 	cond = UT_LIST_GET_FIRST(plan->other_conds);
803 
804 	while (cond) {
805 		eval_exp(cond);
806 
807 		if (!eval_node_get_ibool_val(cond)) {
808 
809 			return(FALSE);
810 		}
811 
812 		cond = UT_LIST_GET_NEXT(cond_list, cond);
813 	}
814 
815 	return(TRUE);
816 }
817 
818 /*********************************************************************//**
819 Retrieves the clustered index record corresponding to a record in a
820 non-clustered index. Does the necessary locking.
821 @return	DB_SUCCESS or error code */
822 static MY_ATTRIBUTE((nonnull, warn_unused_result))
823 dberr_t
824 row_sel_get_clust_rec(
825 /*==================*/
826 	sel_node_t*	node,	/*!< in: select_node */
827 	plan_t*		plan,	/*!< in: plan node for table */
828 	rec_t*		rec,	/*!< in: record in a non-clustered index */
829 	que_thr_t*	thr,	/*!< in: query thread */
830 	rec_t**		out_rec,/*!< out: clustered record or an old version of
831 				it, NULL if the old version did not exist
832 				in the read view, i.e., it was a fresh
833 				inserted version */
834 	mtr_t*		mtr)	/*!< in: mtr used to get access to the
835 				non-clustered record; the same mtr is used to
836 				access the clustered index */
837 {
838 	dict_index_t*	index;
839 	rec_t*		clust_rec;
840 	rec_t*		old_vers;
841 	dberr_t		err;
842 	mem_heap_t*	heap		= NULL;
843 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
844 	ulint*		offsets		= offsets_;
845 	rec_offs_init(offsets_);
846 
847 	*out_rec = NULL;
848 
849 	offsets = rec_get_offsets(rec,
850 				  btr_pcur_get_btr_cur(&plan->pcur)->index,
851 				  offsets, ULINT_UNDEFINED, &heap);
852 
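	/* Build the row reference (the clustered index search key) from the
	secondary index record, using the column map precomputed in the
	plan. */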
853 	row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets);
854 
855 	index = dict_table_get_first_index(plan->table);
856 
857 	btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE,
858 				   BTR_SEARCH_LEAF, &plan->clust_pcur,
859 				   0, mtr);
860 
861 	clust_rec = btr_pcur_get_rec(&(plan->clust_pcur));
862 
863 	/* Note: only if the search ends up on a non-infimum record is the
864 	low_match value the real match to the search tuple */
865 
866 	if (!page_rec_is_user_rec(clust_rec)
867 	    || btr_pcur_get_low_match(&(plan->clust_pcur))
868 	    < dict_index_get_n_unique(index)) {
869 
870 		ut_a(rec_get_deleted_flag(rec,
871 					  dict_table_is_comp(plan->table)));
872 		ut_a(node->read_view);
873 
874 		/* In a rare case it is possible that no clust rec is found
875 		for a delete-marked secondary index record: if in row0umod.cc
876 		in row_undo_mod_remove_clust_low() we have already removed
877 		the clust rec, while purge is still cleaning and removing
878 		secondary index records associated with earlier versions of
879 		the clustered index record. In that case we know that the
880 		clustered index record did not exist in the read view of
881 		trx. */
882 
883 		goto func_exit;
884 	}
885 
886 	offsets = rec_get_offsets(clust_rec, index, offsets,
887 				  ULINT_UNDEFINED, &heap);
888 
889 	if (!node->read_view) {
890 		/* Try to place a lock on the index record */
891 
892 		/* If innodb_locks_unsafe_for_binlog option is used
893 		or this session is using READ COMMITTED isolation level
894 		we lock only the record, i.e., next-key locking is
895 		not used. */
896 		ulint	lock_type;
897 		trx_t*	trx;
898 
899 		trx = thr_get_trx(thr);
900 
901 		if (srv_locks_unsafe_for_binlog
902 		    || trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
903 			lock_type = LOCK_REC_NOT_GAP;
904 		} else {
905 			lock_type = LOCK_ORDINARY;
906 		}
907 
908 		err = lock_clust_rec_read_check_and_lock(
909 			0, btr_pcur_get_block(&plan->clust_pcur),
910 			clust_rec, index, offsets,
911 			static_cast<enum lock_mode>(node->row_lock_mode),
912 			lock_type,
913 			thr);
914 
915 		switch (err) {
916 		case DB_SUCCESS:
917 		case DB_SUCCESS_LOCKED_REC:
918 			/* Declare the variable uninitialized in Valgrind.
919 			It should be set to DB_SUCCESS at func_exit. */
920 			UNIV_MEM_INVALID(&err, sizeof err);
921 			break;
922 		default:
923 			goto err_exit;
924 		}
925 	} else {
926 		/* This is a non-locking consistent read: if necessary, fetch
927 		a previous version of the record */
928 
929 		old_vers = NULL;
930 
931 		if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets,
932 						   node->read_view)) {
933 
934 			err = row_sel_build_prev_vers(
935 				node->read_view, index, clust_rec,
936 				&offsets, &heap, &plan->old_vers_heap,
937 				&old_vers, mtr);
938 
939 			if (err != DB_SUCCESS) {
940 
941 				goto err_exit;
942 			}
943 
944 			clust_rec = old_vers;
945 
946 			if (clust_rec == NULL) {
947 				goto func_exit;
948 			}
949 		}
950 
951 		/* If we had to go to an earlier version of the row, or the
952 		secondary index record is delete-marked, then it may be that
953 		the secondary index record corresponding to clust_rec
954 		(or old_vers) is not rec; in that case we must ignore such a
955 		row, because in our snapshot rec would not have existed.
956 		Remember that from rec we cannot see directly which transaction
957 		id corresponds to it: we have to go to the clustered index
958 		record. A query that fetches all rows where the secondary
959 		index value is in some interval would return a wrong result
960 		if we did not drop rows that we reach through secondary
961 		index records which do not really exist in our
962 		snapshot. */
963 
964 		if ((old_vers
965 		     || rec_get_deleted_flag(rec, dict_table_is_comp(
966 						     plan->table)))
967 		    && !row_sel_sec_rec_is_for_clust_rec(rec, plan->index,
968 							 clust_rec, index)) {
969 			goto func_exit;
970 		}
971 	}
972 
973 	/* Fetch the columns needed in test conditions.  The clustered
974 	index record is protected by a page latch that was acquired
975 	when plan->clust_pcur was positioned.  The latch will not be
976 	released until mtr_commit(mtr). */
977 
978 	ut_ad(!rec_get_deleted_flag(clust_rec, rec_offs_comp(offsets)));
979 	row_sel_fetch_columns(index, clust_rec, offsets,
980 			      UT_LIST_GET_FIRST(plan->columns));
981 	*out_rec = clust_rec;
982 func_exit:
983 	err = DB_SUCCESS;
984 err_exit:
985 	if (UNIV_LIKELY_NULL(heap)) {
986 		mem_heap_free(heap);
987 	}
988 	return(err);
989 }
990 
991 /*********************************************************************//**
992 Sets a lock on a record.
993 @return	DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
994 UNIV_INLINE
995 dberr_t
996 sel_set_rec_lock(
997 /*=============*/
998 	const buf_block_t*	block,	/*!< in: buffer block of rec */
999 	const rec_t*		rec,	/*!< in: record */
1000 	dict_index_t*		index,	/*!< in: index */
1001 	const ulint*		offsets,/*!< in: rec_get_offsets(rec, index) */
1002 	ulint			mode,	/*!< in: lock mode */
1003 	ulint			type,	/*!< in: LOCK_ORDINARY, LOCK_GAP, or
1004 					LOCK_REC_NOT_GAP */
1005 	que_thr_t*		thr)	/*!< in: query thread */
1006 {
1007 	trx_t*		trx;
1008 	dberr_t		err;
1009 
1010 	trx = thr_get_trx(thr);
1011 
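	/* Guard against runaway scans: if this transaction already holds a
	very large number of record locks and the buffer pool is running out
	of pages, give up with DB_LOCK_TABLE_FULL instead of allocating yet
	more lock structs. */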
1012 	if (UT_LIST_GET_LEN(trx->lock.trx_locks) > 10000) {
1013 		if (buf_LRU_buf_pool_running_out()) {
1014 
1015 			return(DB_LOCK_TABLE_FULL);
1016 		}
1017 	}
1018 
1019 	if (dict_index_is_clust(index)) {
1020 		err = lock_clust_rec_read_check_and_lock(
1021 			0, block, rec, index, offsets,
1022 			static_cast<enum lock_mode>(mode), type, thr);
1023 	} else {
1024 		err = lock_sec_rec_read_check_and_lock(
1025 			0, block, rec, index, offsets,
1026 			static_cast<enum lock_mode>(mode), type, thr);
1027 	}
1028 
1029 	return(err);
1030 }
1031 
1032 /*********************************************************************//**
1033 Opens a pcur to a table index. */
1034 static
1035 void
1036 row_sel_open_pcur(
1037 /*==============*/
1038 	plan_t*		plan,		/*!< in: table plan */
1039 	ibool		search_latch_locked,
1040 					/*!< in: TRUE if the thread currently
1041 					has the search latch locked in
1042 					s-mode */
1043 	mtr_t*		mtr)		/*!< in: mtr */
1044 {
1045 	dict_index_t*	index;
1046 	func_node_t*	cond;
1047 	que_node_t*	exp;
1048 	ulint		n_fields;
1049 	ulint		has_search_latch = 0;	/* RW_S_LATCH or 0 */
1050 	ulint		i;
1051 
1052 	if (search_latch_locked) {
1053 		has_search_latch = RW_S_LATCH;
1054 	}
1055 
1056 	index = plan->index;
1057 
1058 	/* Calculate the value of the search tuple: the exact match columns
1059 	get their expressions evaluated when we evaluate the right sides of
1060 	end_conds */
1061 
1062 	cond = UT_LIST_GET_FIRST(plan->end_conds);
1063 
1064 	while (cond) {
1065 		eval_exp(que_node_get_next(cond->args));
1066 
1067 		cond = UT_LIST_GET_NEXT(cond_list, cond);
1068 	}
1069 
1070 	if (plan->tuple) {
1071 		n_fields = dtuple_get_n_fields(plan->tuple);
1072 
1073 		if (plan->n_exact_match < n_fields) {
1074 			/* There is a non-exact match field which must be
1075 			evaluated separately */
1076 
1077 			eval_exp(plan->tuple_exps[n_fields - 1]);
1078 		}
1079 
1080 		for (i = 0; i < n_fields; i++) {
1081 			exp = plan->tuple_exps[i];
1082 
1083 			dfield_copy_data(dtuple_get_nth_field(plan->tuple, i),
1084 					 que_node_get_val(exp));
1085 		}
1086 
1087 		/* Open pcur to the index */
1088 
1089 		btr_pcur_open_with_no_init(index, plan->tuple, plan->mode,
1090 					   BTR_SEARCH_LEAF, &plan->pcur,
1091 					   has_search_latch, mtr);
1092 	} else {
1093 		/* Open the cursor to the start or the end of the index
1094 		(FALSE: no init) */
1095 
1096 		btr_pcur_open_at_index_side(plan->asc, index, BTR_SEARCH_LEAF,
1097 					    &(plan->pcur), false, 0, mtr);
1098 	}
1099 
1100 	ut_ad(plan->n_rows_prefetched == 0);
1101 	ut_ad(plan->n_rows_fetched == 0);
1102 	ut_ad(plan->cursor_at_end == FALSE);
1103 
1104 	plan->pcur_is_open = TRUE;
1105 }
1106 
1107 /*********************************************************************//**
1108 Restores a stored pcur position to a table index.
1109 @return TRUE if the cursor should be moved to the next record after we
1110 return from this function (moved to the previous, in the case of a
1111 descending cursor) without processing the current cursor record
1112 again */
1113 static
1114 ibool
1115 row_sel_restore_pcur_pos(
1116 /*=====================*/
1117 	plan_t*		plan,	/*!< in: table plan */
1118 	mtr_t*		mtr)	/*!< in: mtr */
1119 {
1120 	ibool	equal_position;
1121 	ulint	relative_position;
1122 
1123 	ut_ad(!plan->cursor_at_end);
1124 
1125 	relative_position = btr_pcur_get_rel_pos(&(plan->pcur));
1126 
1127 	equal_position = btr_pcur_restore_position(BTR_SEARCH_LEAF,
1128 						   &(plan->pcur), mtr);
1129 
1130 	/* If the cursor is traveling upwards, and relative_position is
1131 
1132 	(1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock
1133 	yet on the successor of the page infimum;
1134 	(2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
1135 	first record GREATER than the predecessor of a page supremum; we have
1136 	not yet processed the cursor record: no need to move the cursor to the
1137 	next record;
1138 	(3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
1139 	last record LESS or EQUAL to the old stored user record; (a) if
1140 	equal_position is FALSE, this means that the cursor is now on a record
1141 	less than the old user record, and we must move to the next record;
1142 	(b) if equal_position is TRUE, then if
1143 	plan->stored_cursor_rec_processed is TRUE, we must move to the next
1144 	record, else there is no need to move the cursor. */
1145 
1146 	if (plan->asc) {
1147 		if (relative_position == BTR_PCUR_ON) {
1148 
1149 			if (equal_position) {
1150 
1151 				return(plan->stored_cursor_rec_processed);
1152 			}
1153 
1154 			return(TRUE);
1155 		}
1156 
1157 		ut_ad(relative_position == BTR_PCUR_AFTER
1158 		      || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
1159 
1160 		return(FALSE);
1161 	}
1162 
1163 	/* If the cursor is traveling downwards, and relative_position is
1164 
1165 	(1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on
1166 	the last record LESS than the successor of a page infimum; we have not
1167 	processed the cursor record: no need to move the cursor;
1168 	(2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
1169 	first record GREATER than the predecessor of a page supremum; we have
1170 	processed the cursor record: we should move the cursor to the previous
1171 	record;
1172 	(3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
1173 	last record LESS or EQUAL to the old stored user record; (a) if
1174 	equal_position is FALSE, this means that the cursor is now on a record
1175 	less than the old user record, and we need not move to the previous
1176 	record; (b) if equal_position is TRUE, then if
1177 	plan->stored_cursor_rec_processed is TRUE, we must move to the previous
1178 	record, else there is no need to move the cursor. */
1179 
1180 	if (relative_position == BTR_PCUR_BEFORE
1181 	    || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE) {
1182 
1183 		return(FALSE);
1184 	}
1185 
1186 	if (relative_position == BTR_PCUR_ON) {
1187 
1188 		if (equal_position) {
1189 
1190 			return(plan->stored_cursor_rec_processed);
1191 		}
1192 
1193 		return(FALSE);
1194 	}
1195 
1196 	ut_ad(relative_position == BTR_PCUR_AFTER
1197 	      || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
1198 
1199 	return(TRUE);
1200 }
1201 
1202 /*********************************************************************//**
1203 Resets a plan cursor to a closed state. */
1204 UNIV_INLINE
1205 void
1206 plan_reset_cursor(
1207 /*==============*/
1208 	plan_t*	plan)	/*!< in: plan */
1209 {
1210 	plan->pcur_is_open = FALSE;
1211 	plan->cursor_at_end = FALSE;
1212 	plan->n_rows_fetched = 0;
1213 	plan->n_rows_prefetched = 0;
1214 }
1215 
1216 /*********************************************************************//**
1217 Tries to do a shortcut to fetch a clustered index record with a unique key,
1218 using the hash index if possible (not always).
1219 @return	SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
1220 static
1221 ulint
1222 row_sel_try_search_shortcut(
1223 /*========================*/
1224 	sel_node_t*	node,	/*!< in: select node for a consistent read */
1225 	plan_t*		plan,	/*!< in: plan for a unique search in clustered
1226 				index */
1227 	ibool		search_latch_locked,
1228 				/*!< in: whether the search holds
1229 				btr_search_latch */
1230 	mtr_t*		mtr)	/*!< in: mtr */
1231 {
1232 	dict_index_t*	index;
1233 	rec_t*		rec;
1234 	mem_heap_t*	heap		= NULL;
1235 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
1236 	ulint*		offsets		= offsets_;
1237 	ulint		ret;
1238 	rec_offs_init(offsets_);
1239 
1240 	index = plan->index;
1241 
1242 	ut_ad(node->read_view);
1243 	ut_ad(plan->unique_search);
1244 	ut_ad(!plan->must_get_clust);
1245 #ifdef UNIV_SYNC_DEBUG
1246 	if (search_latch_locked) {
1247 		ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
1248 	}
1249 #endif /* UNIV_SYNC_DEBUG */
1250 
1251 	row_sel_open_pcur(plan, search_latch_locked, mtr);
1252 
1253 	rec = btr_pcur_get_rec(&(plan->pcur));
1254 
1255 	if (!page_rec_is_user_rec(rec)) {
1256 
1257 		return(SEL_RETRY);
1258 	}
1259 
1260 	ut_ad(plan->mode == PAGE_CUR_GE);
1261 
1262 	/* As the cursor is now placed on a user record after a search with
1263 	the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
1264 	fields in the user record matched to the search tuple */
1265 
1266 	if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) {
1267 
1268 		return(SEL_EXHAUSTED);
1269 	}
1270 
1271 	/* This is a non-locking consistent read: if necessary, fetch
1272 	a previous version of the record */
1273 
1274 	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
1275 
1276 	if (dict_index_is_clust(index)) {
1277 		if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
1278 						   node->read_view)) {
1279 			ret = SEL_RETRY;
1280 			goto func_exit;
1281 		}
1282 	} else if (!lock_sec_rec_cons_read_sees(rec, node->read_view)) {
1283 
1284 		ret = SEL_RETRY;
1285 		goto func_exit;
1286 	}
1287 
1288 	/* Test the deleted flag. */
1289 
1290 	if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))) {
1291 
1292 		ret = SEL_EXHAUSTED;
1293 		goto func_exit;
1294 	}
1295 
1296 	/* Fetch the columns needed in test conditions.  The index
1297 	record is protected by a page latch that was acquired when
1298 	plan->pcur was positioned.  The latch will not be released
1299 	until mtr_commit(mtr). */
1300 
1301 	row_sel_fetch_columns(index, rec, offsets,
1302 			      UT_LIST_GET_FIRST(plan->columns));
1303 
1304 	/* Test the rest of search conditions */
1305 
1306 	if (!row_sel_test_other_conds(plan)) {
1307 
1308 		ret = SEL_EXHAUSTED;
1309 		goto func_exit;
1310 	}
1311 
1312 	ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
1313 
1314 	plan->n_rows_fetched++;
1315 	ret = SEL_FOUND;
1316 func_exit:
1317 	if (UNIV_LIKELY_NULL(heap)) {
1318 		mem_heap_free(heap);
1319 	}
1320 	return(ret);
1321 }
1322 
1323 /*********************************************************************//**
1324 Performs a select step.
1325 @return	DB_SUCCESS or error code */
1326 static MY_ATTRIBUTE((nonnull, warn_unused_result))
1327 dberr_t
1328 row_sel(
1329 /*====*/
1330 	sel_node_t*	node,	/*!< in: select node */
1331 	que_thr_t*	thr)	/*!< in: query thread */
1332 {
1333 	dict_index_t*	index;
1334 	plan_t*		plan;
1335 	mtr_t		mtr;
1336 	ibool		moved;
1337 	rec_t*		rec;
1338 	rec_t*		old_vers;
1339 	rec_t*		clust_rec;
1340 	ibool		search_latch_locked;
1341 	ibool		consistent_read;
1342 
1343 	/* The following flag becomes TRUE when we are doing a
1344 	consistent read from a non-clustered index and we must look
1345 	at the clustered index to find out the previous delete mark
1346 	state of the non-clustered record: */
1347 
1348 	ibool		cons_read_requires_clust_rec	= FALSE;
1349 	ulint		cost_counter			= 0;
1350 	ibool		cursor_just_opened;
1351 	ibool		must_go_to_next;
1352 	ibool		mtr_has_extra_clust_latch	= FALSE;
1353 	/* TRUE if the search was made using
1354 	a non-clustered index, and we had to
1355 	access the clustered record: now &mtr
1356 	contains a clustered index latch, and
1357 	&mtr must be committed before we move
1358 	to the next non-clustered record */
1359 	ulint		found_flag;
1360 	dberr_t		err;
1361 	mem_heap_t*	heap				= NULL;
1362 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
1363 	ulint*		offsets				= offsets_;
1364 	rec_offs_init(offsets_);
1365 
1366 	ut_ad(thr->run_node == node);
1367 
1368 	search_latch_locked = FALSE;
1369 
1370 	if (node->read_view) {
1371 		/* In consistent reads, we try to do with the hash index and
1372 		not to use the buffer page get. This is to reduce memory bus
1373 		load resulting from semaphore operations. The search latch
1374 		will be s-locked when we access an index with a unique search
1375 		condition, but not locked when we access an index with a
1376 		less selective search condition. */
1377 
1378 		consistent_read = TRUE;
1379 	} else {
1380 		consistent_read = FALSE;
1381 	}
1382 
1383 table_loop:
1384 	/* TABLE LOOP
1385 	----------
1386 	This is the outer major loop in calculating a join. We come here when
1387 	node->fetch_table changes, and after adding a row to aggregate totals
1388 	and, of course, when this function is called. */
1389 
1390 	ut_ad(mtr_has_extra_clust_latch == FALSE);
1391 
1392 	plan = sel_node_get_nth_plan(node, node->fetch_table);
1393 	index = plan->index;
1394 
1395 	if (plan->n_rows_prefetched > 0) {
1396 		sel_dequeue_prefetched_row(plan);
1397 
1398 		goto next_table_no_mtr;
1399 	}
1400 
1401 	if (plan->cursor_at_end) {
1402 		/* The cursor has already reached the result set end: no more
1403 		rows to process for this table cursor, as also the prefetch
1404 		stack was empty */
1405 
1406 		ut_ad(plan->pcur_is_open);
1407 
1408 		goto table_exhausted_no_mtr;
1409 	}
1410 
1411 	/* Open a cursor to index, or restore an open cursor position */
1412 
1413 	mtr_start(&mtr);
1414 
1415 	if (consistent_read && plan->unique_search && !plan->pcur_is_open
1416 	    && !plan->must_get_clust
1417 	    && !plan->table->big_rows) {
1418 		if (!search_latch_locked) {
1419 			rw_lock_s_lock(&btr_search_latch);
1420 
1421 			search_latch_locked = TRUE;
1422 		} else if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_WAIT_EX) {
1423 
1424 			/* There is an x-latch request waiting: release the
1425 			s-latch for a moment; as an s-latch here is often
1426 			kept for some 10 searches before being released,
1427 			a waiting x-latch request would block other threads
1428 			from acquiring an s-latch for a long time, lowering
1429 			performance significantly in multiprocessors. */
1430 
1431 			rw_lock_s_unlock(&btr_search_latch);
1432 			rw_lock_s_lock(&btr_search_latch);
1433 		}
1434 
1435 		found_flag = row_sel_try_search_shortcut(node, plan,
1436 							 search_latch_locked,
1437 							 &mtr);
1438 
1439 		if (found_flag == SEL_FOUND) {
1440 
1441 			goto next_table;
1442 
1443 		} else if (found_flag == SEL_EXHAUSTED) {
1444 
1445 			goto table_exhausted;
1446 		}
1447 
1448 		ut_ad(found_flag == SEL_RETRY);
1449 
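		/* The shortcut could not decide: reset the cursor state and
		retry below with the ordinary search path, in a fresh
		mini-transaction. */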
1450 		plan_reset_cursor(plan);
1451 
1452 		mtr_commit(&mtr);
1453 		mtr_start(&mtr);
1454 	}
1455 
1456 	if (search_latch_locked) {
1457 		rw_lock_s_unlock(&btr_search_latch);
1458 
1459 		search_latch_locked = FALSE;
1460 	}
1461 
1462 	if (!plan->pcur_is_open) {
1463 		/* Evaluate the expressions to build the search tuple and
1464 		open the cursor */
1465 
1466 		row_sel_open_pcur(plan, search_latch_locked, &mtr);
1467 
1468 		cursor_just_opened = TRUE;
1469 
1470 		/* A new search was made: increment the cost counter */
1471 		cost_counter++;
1472 	} else {
1473 		/* Restore pcur position to the index */
1474 
1475 		must_go_to_next = row_sel_restore_pcur_pos(plan, &mtr);
1476 
1477 		cursor_just_opened = FALSE;
1478 
1479 		if (must_go_to_next) {
1480 			/* We have already processed the cursor record: move
1481 			to the next */
1482 
1483 			goto next_rec;
1484 		}
1485 	}
1486 
1487 rec_loop:
1488 	/* RECORD LOOP
1489 	-----------
1490 	In this loop we use pcur and try to fetch a qualifying row, and
1491 	also fill the prefetch buffer for this table if n_rows_fetched has
1492 	exceeded a threshold. While we are inside this loop, the following
1493 	holds:
1494 	(1) &mtr is started,
1495 	(2) pcur is positioned and open.
1496 
1497 	NOTE that if cursor_just_opened is TRUE here, it means that we came
1498 	to this point right after row_sel_open_pcur. */
1499 
1500 	ut_ad(mtr_has_extra_clust_latch == FALSE);
1501 
1502 	rec = btr_pcur_get_rec(&(plan->pcur));
1503 
1504 	/* PHASE 1: Set a lock if specified */
1505 
1506 	if (!node->asc && cursor_just_opened
1507 	    && !page_rec_is_supremum(rec)) {
1508 
1509 		/* When we open a cursor for a descending search, we must set
1510 		a next-key lock on the successor record: otherwise it would
1511 		be possible to insert new records next to the cursor position,
1512 		and it might be that these new records should appear in the
1513 		search result set, resulting in the phantom problem. */
1514 
1515 		if (!consistent_read) {
1516 
1517 			/* If innodb_locks_unsafe_for_binlog option is used
1518 			or this session is using READ COMMITTED isolation
1519 			level, we lock only the record, i.e., next-key
1520 			locking is not used. */
1521 
1522 			rec_t*	next_rec = page_rec_get_next(rec);
1523 			ulint	lock_type;
1524 			trx_t*	trx;
1525 
1526 			trx = thr_get_trx(thr);
1527 
1528 			offsets = rec_get_offsets(next_rec, index, offsets,
1529 						  ULINT_UNDEFINED, &heap);
1530 
1531 			if (srv_locks_unsafe_for_binlog
1532 			    || trx->isolation_level
1533 			    <= TRX_ISO_READ_COMMITTED) {
1534 
1535 				if (page_rec_is_supremum(next_rec)) {
1536 
1537 					goto skip_lock;
1538 				}
1539 
1540 				lock_type = LOCK_REC_NOT_GAP;
1541 			} else {
1542 				lock_type = LOCK_ORDINARY;
1543 			}
1544 
1545 			err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
1546 					       next_rec, index, offsets,
1547 					       node->row_lock_mode,
1548 					       lock_type, thr);
1549 
1550 			switch (err) {
1551 			case DB_SUCCESS_LOCKED_REC:
1552 				err = DB_SUCCESS;
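				/* fall through */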
1553 			case DB_SUCCESS:
1554 				break;
1555 			default:
1556 				/* Note that in this case we will store in pcur
1557 				the PREDECESSOR of the record we are waiting
1558 				the lock for */
1559 				goto lock_wait_or_error;
1560 			}
1561 		}
1562 	}
1563 
1564 skip_lock:
1565 	if (page_rec_is_infimum(rec)) {
1566 
1567 		/* The infimum record on a page cannot be in the result set,
1568 		and neither can a record lock be placed on it: we skip such
1569 		a record. We also increment the cost counter as we may have
1570 		processed yet another page of index. */
1571 
1572 		cost_counter++;
1573 
1574 		goto next_rec;
1575 	}
1576 
1577 	if (!consistent_read) {
1578 		/* Try to place a lock on the index record */
1579 
1580 		/* If innodb_locks_unsafe_for_binlog option is used
1581 		or this session is using READ COMMITTED isolation level,
1582 		we lock only the record, i.e., next-key locking is
1583 		not used. */
1584 
1585 		ulint	lock_type;
1586 		trx_t*	trx;
1587 
1588 		offsets = rec_get_offsets(rec, index, offsets,
1589 					  ULINT_UNDEFINED, &heap);
1590 
1591 		trx = thr_get_trx(thr);
1592 
1593 		if (srv_locks_unsafe_for_binlog
1594 		    || trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
1595 
1596 			if (page_rec_is_supremum(rec)) {
1597 
1598 				goto next_rec;
1599 			}
1600 
1601 			lock_type = LOCK_REC_NOT_GAP;
1602 		} else {
1603 			lock_type = LOCK_ORDINARY;
1604 		}
1605 
1606 		err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
1607 				       rec, index, offsets,
1608 				       node->row_lock_mode, lock_type, thr);
1609 
1610 		switch (err) {
1611 		case DB_SUCCESS_LOCKED_REC:
1612 			err = DB_SUCCESS;
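			/* fall through */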
1613 		case DB_SUCCESS:
1614 			break;
1615 		default:
1616 			goto lock_wait_or_error;
1617 		}
1618 	}
1619 
1620 	if (page_rec_is_supremum(rec)) {
1621 
1622 		/* A page supremum record cannot be in the result set: skip
1623 		it now when we have placed a possible lock on it */
1624 
1625 		goto next_rec;
1626 	}
1627 
1628 	ut_ad(page_rec_is_user_rec(rec));
1629 
1630 	if (cost_counter > SEL_COST_LIMIT) {
1631 
1632 		/* Now that we have placed the necessary locks, we can stop
1633 		for a while and store the cursor position; NOTE that if we
1634 		would store the cursor position BEFORE placing a record lock,
1635 		it might happen that the cursor would jump over some records
1636 		that another transaction could meanwhile insert adjacent to
1637 		the cursor: this would result in the phantom problem. */
1638 
1639 		goto stop_for_a_while;
1640 	}
1641 
1642 	/* PHASE 2: Check a mixed index mix id if needed */
1643 
1644 	if (plan->unique_search && cursor_just_opened) {
1645 
1646 		ut_ad(plan->mode == PAGE_CUR_GE);
1647 
1648 		/* As the cursor is now placed on a user record after a search
1649 		with the mode PAGE_CUR_GE, the up_match field in the cursor
1650 		tells how many fields in the user record matched to the search
1651 		tuple */
1652 
1653 		if (btr_pcur_get_up_match(&(plan->pcur))
1654 		    < plan->n_exact_match) {
1655 			goto table_exhausted;
1656 		}
1657 
1658 		/* Ok, no need to test end_conds or mix id */
1659 
1660 	}
1661 
1662 	/* We are ready to look at a possible new index entry in the result
1663 	set: the cursor is now placed on a user record */
1664 
1665 	/* PHASE 3: Get previous version in a consistent read */
1666 
1667 	cons_read_requires_clust_rec = FALSE;
1668 	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
1669 
1670 	if (consistent_read) {
1671 		/* This is a non-locking consistent read: if necessary, fetch
1672 		a previous version of the record */
1673 
1674 		if (dict_index_is_clust(index)) {
1675 
1676 			if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
1677 							   node->read_view)) {
1678 
1679 				err = row_sel_build_prev_vers(
1680 					node->read_view, index, rec,
1681 					&offsets, &heap, &plan->old_vers_heap,
1682 					&old_vers, &mtr);
1683 
1684 				if (err != DB_SUCCESS) {
1685 
1686 					goto lock_wait_or_error;
1687 				}
1688 
1689 				if (old_vers == NULL) {
1690 					/* The record does not exist
1691 					in our read view. Skip it, but
1692 					first attempt to determine
1693 					whether the index segment we
1694 					are searching through has been
1695 					exhausted. */
1696 
1697 					offsets = rec_get_offsets(
1698 						rec, index, offsets,
1699 						ULINT_UNDEFINED, &heap);
1700 
1701 					/* Fetch the columns needed in
1702 					test conditions. The clustered
1703 					index record is protected by a
1704 					page latch that was acquired
1705 					by row_sel_open_pcur() or
1706 					row_sel_restore_pcur_pos().
1707 					The latch will not be released
1708 					until mtr_commit(mtr). */
1709 
1710 					row_sel_fetch_columns(
1711 						index, rec, offsets,
1712 						UT_LIST_GET_FIRST(
1713 							plan->columns));
1714 
1715 					if (!row_sel_test_end_conds(plan)) {
1716 
1717 						goto table_exhausted;
1718 					}
1719 
1720 					goto next_rec;
1721 				}
1722 
1723 				rec = old_vers;
1724 			}
1725 		} else if (!lock_sec_rec_cons_read_sees(rec,
1726 							node->read_view)) {
1727 			cons_read_requires_clust_rec = TRUE;
1728 		}
1729 	}
1730 
1731 	/* PHASE 4: Test search end conditions and deleted flag */
1732 
1733 	/* Fetch the columns needed in test conditions.  The record is
1734 	protected by a page latch that was acquired by
1735 	row_sel_open_pcur() or row_sel_restore_pcur_pos().  The latch
1736 	will not be released until mtr_commit(mtr). */
1737 
1738 	row_sel_fetch_columns(index, rec, offsets,
1739 			      UT_LIST_GET_FIRST(plan->columns));
1740 
1741 	/* Test the selection end conditions: these can only contain columns
1742 	which already are found in the index, even though the index might be
1743 	non-clustered */
1744 
1745 	if (plan->unique_search && cursor_just_opened) {
1746 
1747 		/* No test necessary: the test was already made above */
1748 
1749 	} else if (!row_sel_test_end_conds(plan)) {
1750 
1751 		goto table_exhausted;
1752 	}
1753 
1754 	if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))
1755 	    && !cons_read_requires_clust_rec) {
1756 
1757 		/* The record is delete marked: we can skip it if this is
1758 		not a consistent read which might see an earlier version
1759 		of a non-clustered index record */
1760 
1761 		if (plan->unique_search) {
1762 
1763 			goto table_exhausted;
1764 		}
1765 
1766 		goto next_rec;
1767 	}
1768 
1769 	/* PHASE 5: Get the clustered index record, if needed and if we did
1770 	not do the search using the clustered index */
1771 
1772 	if (plan->must_get_clust || cons_read_requires_clust_rec) {
1773 
1774 		/* It was a non-clustered index and we must fetch also the
1775 		clustered index record */
1776 
1777 		err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec,
1778 					    &mtr);
1779 		mtr_has_extra_clust_latch = TRUE;
1780 
1781 		if (err != DB_SUCCESS) {
1782 
1783 			goto lock_wait_or_error;
1784 		}
1785 
1786 		/* Retrieving the clustered record required a search:
1787 		increment the cost counter */
1788 
1789 		cost_counter++;
1790 
1791 		if (clust_rec == NULL) {
1792 			/* The record did not exist in the read view */
1793 			ut_ad(consistent_read);
1794 
1795 			goto next_rec;
1796 		}
1797 
1798 		if (rec_get_deleted_flag(clust_rec,
1799 					 dict_table_is_comp(plan->table))) {
1800 
1801 			/* The record is delete marked: we can skip it */
1802 
1803 			goto next_rec;
1804 		}
1805 
1806 		if (node->can_get_updated) {
1807 
1808 			btr_pcur_store_position(&(plan->clust_pcur), &mtr);
1809 		}
1810 	}
1811 
1812 	/* PHASE 6: Test the rest of search conditions */
1813 
1814 	if (!row_sel_test_other_conds(plan)) {
1815 
1816 		if (plan->unique_search) {
1817 
1818 			goto table_exhausted;
1819 		}
1820 
1821 		goto next_rec;
1822 	}
1823 
1824 	/* PHASE 7: We found a new qualifying row for the current table; push
1825 	the row if prefetch is on, or move to the next table in the join */
1826 
1827 	plan->n_rows_fetched++;
1828 
1829 	ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
1830 
1831 	if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT)
1832 	    || plan->unique_search || plan->no_prefetch
1833 	    || plan->table->big_rows) {
1834 
1835 		/* No prefetch in operation: go to the next table */
1836 
1837 		goto next_table;
1838 	}
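	/* Otherwise prefetch is in use: the row just found is queued below
	into the plan's prefetch buffer instead of being returned
	immediately; once SEL_MAX_N_PREFETCH rows have accumulated, the
	oldest one is popped and passed on to the next table. */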
1839 
1840 	sel_enqueue_prefetched_row(plan);
1841 
1842 	if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) {
1843 
1844 		/* The prefetch buffer is now full */
1845 
1846 		sel_dequeue_prefetched_row(plan);
1847 
1848 		goto next_table;
1849 	}
1850 
1851 next_rec:
1852 	ut_ad(!search_latch_locked);
1853 
1854 	if (mtr_has_extra_clust_latch) {
1855 
1856 		/* We must commit &mtr if we are moving to the next
1857 		non-clustered index record, because we could break the
1858 		latching order if we would access a different clustered
1859 		index page right away without releasing the previous. */
1860 
1861 		goto commit_mtr_for_a_while;
1862 	}
1863 
1864 	if (node->asc) {
1865 		moved = btr_pcur_move_to_next(&(plan->pcur), &mtr);
1866 	} else {
1867 		moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr);
1868 	}
1869 
1870 	if (!moved) {
1871 
1872 		goto table_exhausted;
1873 	}
1874 
1875 	cursor_just_opened = FALSE;
1876 
1877 	/* END OF RECORD LOOP
1878 	------------------ */
1879 	goto rec_loop;
1880 
1881 next_table:
1882 	/* We found a record which satisfies the conditions: we can move to
1883 	the next table or return a row in the result set */
1884 
1885 	ut_ad(btr_pcur_is_on_user_rec(&plan->pcur));
1886 
1887 	if (plan->unique_search && !node->can_get_updated) {
1888 
1889 		plan->cursor_at_end = TRUE;
1890 	} else {
1891 		ut_ad(!search_latch_locked);
1892 
1893 		plan->stored_cursor_rec_processed = TRUE;
1894 
1895 		btr_pcur_store_position(&(plan->pcur), &mtr);
1896 	}
1897 
1898 	mtr_commit(&mtr);
1899 
1900 	mtr_has_extra_clust_latch = FALSE;
1901 
1902 next_table_no_mtr:
1903 	/* If we use 'goto' to this label, it means that the row was popped
1904 	from the prefetched rows stack, and &mtr is already committed */
1905 
1906 	if (node->fetch_table + 1 == node->n_tables) {
1907 
1908 		sel_eval_select_list(node);
1909 
1910 		if (node->is_aggregate) {
1911 
1912 			goto table_loop;
1913 		}
1914 
1915 		sel_assign_into_var_values(node->into_list, node);
1916 
1917 		thr->run_node = que_node_get_parent(node);
1918 
1919 		err = DB_SUCCESS;
1920 		goto func_exit;
1921 	}
1922 
1923 	node->fetch_table++;
1924 
1925 	/* When we move to the next table, we first reset the plan cursor:
1926 	we do not care about resetting it when we backtrack from a table */
1927 
1928 	plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table));
1929 
1930 	goto table_loop;
1931 
1932 table_exhausted:
1933 	/* The table cursor pcur reached the result set end: backtrack to the
1934 	previous table in the join if we do not have cached prefetched rows */
1935 
1936 	plan->cursor_at_end = TRUE;
1937 
1938 	mtr_commit(&mtr);
1939 
1940 	mtr_has_extra_clust_latch = FALSE;
1941 
1942 	if (plan->n_rows_prefetched > 0) {
1943 		/* The table became exhausted during a prefetch */
1944 
1945 		sel_dequeue_prefetched_row(plan);
1946 
1947 		goto next_table_no_mtr;
1948 	}
1949 
1950 table_exhausted_no_mtr:
1951 	if (node->fetch_table == 0) {
1952 		err = DB_SUCCESS;
1953 
1954 		if (node->is_aggregate && !node->aggregate_already_fetched) {
1955 
1956 			node->aggregate_already_fetched = TRUE;
1957 
1958 			sel_assign_into_var_values(node->into_list, node);
1959 
1960 			thr->run_node = que_node_get_parent(node);
1961 		} else {
1962 			node->state = SEL_NODE_NO_MORE_ROWS;
1963 
1964 			thr->run_node = que_node_get_parent(node);
1965 		}
1966 
1967 		goto func_exit;
1968 	}
1969 
1970 	node->fetch_table--;
1971 
1972 	goto table_loop;
1973 
1974 stop_for_a_while:
1975 	/* Return control for a while to que_run_threads, so that runaway
1976 	queries can be canceled. NOTE that when we come here, we must, in a
1977 	locking read, have placed the necessary (possibly waiting request)
1978 	record lock on the cursor record or its successor: when we reposition
1979 	the cursor, this record lock guarantees that nobody can meanwhile have
1980 	inserted new records which should have appeared in the result set,
1981 	which would result in the phantom problem. */
1982 
1983 	ut_ad(!search_latch_locked);
1984 
1985 	plan->stored_cursor_rec_processed = FALSE;
1986 	btr_pcur_store_position(&(plan->pcur), &mtr);
1987 
1988 	mtr_commit(&mtr);
1989 
1990 #ifdef UNIV_SYNC_DEBUG
1991 	ut_ad(sync_thread_levels_empty_except_dict());
1992 #endif /* UNIV_SYNC_DEBUG */
1993 	err = DB_SUCCESS;
1994 	goto func_exit;
1995 
1996 commit_mtr_for_a_while:
1997 	/* Stores the cursor position and commits &mtr; this is used if
1998 	&mtr may contain latches which would break the latching order if
1999 	&mtr would not be committed and the latches released. */
2000 
2001 	plan->stored_cursor_rec_processed = TRUE;
2002 
2003 	ut_ad(!search_latch_locked);
2004 	btr_pcur_store_position(&(plan->pcur), &mtr);
2005 
2006 	mtr_commit(&mtr);
2007 
2008 	mtr_has_extra_clust_latch = FALSE;
2009 
2010 #ifdef UNIV_SYNC_DEBUG
2011 	ut_ad(sync_thread_levels_empty_except_dict());
2012 #endif /* UNIV_SYNC_DEBUG */
2013 
2014 	goto table_loop;
2015 
2016 lock_wait_or_error:
2017 	/* See the note at stop_for_a_while: the same holds for this case */
2018 
2019 	ut_ad(!btr_pcur_is_before_first_on_page(&plan->pcur) || !node->asc);
2020 	ut_ad(!search_latch_locked);
2021 
2022 	plan->stored_cursor_rec_processed = FALSE;
2023 	btr_pcur_store_position(&(plan->pcur), &mtr);
2024 
2025 	mtr_commit(&mtr);
2026 
2027 #ifdef UNIV_SYNC_DEBUG
2028 	ut_ad(sync_thread_levels_empty_except_dict());
2029 #endif /* UNIV_SYNC_DEBUG */
2030 
2031 func_exit:
2032 	if (search_latch_locked) {
2033 		rw_lock_s_unlock(&btr_search_latch);
2034 	}
2035 	if (UNIV_LIKELY_NULL(heap)) {
2036 		mem_heap_free(heap);
2037 	}
2038 	return(err);
2039 }
2040 
2041 /**********************************************************************//**
2042 Performs a select step. This is a high-level function used in SQL execution
2043 graphs.
2044 @return	query thread to run next or NULL */
2045 UNIV_INTERN
2046 que_thr_t*
2047 row_sel_step(
2048 /*=========*/
2049 	que_thr_t*	thr)	/*!< in: query thread */
2050 {
2051 	sel_node_t*	node;
2052 
2053 	ut_ad(thr);
2054 
2055 	node = static_cast<sel_node_t*>(thr->run_node);
2056 
2057 	ut_ad(que_node_get_type(node) == QUE_NODE_SELECT);
2058 
2059 	/* If this is the first time this node is executed (or when execution
2060 	resumes after a wait for a table intention lock), set intention locks
2061 	on the tables, or assign a read view */
2062 
2063 	if (node->into_list && (thr->prev_node == que_node_get_parent(node))) {
2064 
2065 		node->state = SEL_NODE_OPEN;
2066 	}
2067 
2068 	if (node->state == SEL_NODE_OPEN) {
2069 
2070 		/* It may be that the current session has not yet started
2071 		its transaction, or it has been committed: */
2072 
2073 		trx_start_if_not_started_xa(thr_get_trx(thr));
2074 
2075 		plan_reset_cursor(sel_node_get_nth_plan(node, 0));
2076 
2077 		if (node->consistent_read) {
2078 			/* Assign a read view for the query */
2079 			node->read_view = trx_assign_read_view(
2080 				thr_get_trx(thr));
2081 		} else {
2082 			sym_node_t*	table_node;
2083 			enum lock_mode	i_lock_mode;
2084 
2085 			if (node->set_x_locks) {
2086 				i_lock_mode = LOCK_IX;
2087 			} else {
2088 				i_lock_mode = LOCK_IS;
2089 			}
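			/* set_x_locks is set when the select must acquire
			exclusive row locks (e.g. a SELECT ... FOR UPDATE in
			the internal SQL parser); then the tables below get
			IX intention locks, otherwise IS suffices. */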
2090 
2091 			for (table_node = node->table_list;
2092 			     table_node != 0;
2093 			     table_node = static_cast<sym_node_t*>(
2094 					que_node_get_next(table_node))) {
2095 
2096 				dberr_t	err = lock_table(
2097 					0, table_node->table, i_lock_mode,
2098 					thr);
2099 
2100 				if (err != DB_SUCCESS) {
2101 					trx_t*	trx;
2102 
2103 					trx = thr_get_trx(thr);
2104 					trx->error_state = err;
2105 
2106 					return(NULL);
2107 				}
2108 			}
2109 		}
2110 
2111 		/* If this is an explicit cursor, copy stored procedure
2112 		variable values, so that the values cannot change between
2113 		fetches (currently, we copy them also for non-explicit
2114 		cursors) */
2115 
2116 		if (node->explicit_cursor
2117 		    && UT_LIST_GET_FIRST(node->copy_variables)) {
2118 
2119 			row_sel_copy_input_variable_vals(node);
2120 		}
2121 
2122 		node->state = SEL_NODE_FETCH;
2123 		node->fetch_table = 0;
2124 
2125 		if (node->is_aggregate) {
2126 			/* Reset the aggregate total values */
2127 			sel_reset_aggregate_vals(node);
2128 		}
2129 	}
2130 
2131 	dberr_t	err = row_sel(node, thr);
2132 
2133 	/* NOTE! if queries are parallelized, the following assignment may
2134 	have problems; the assignment should be made only if thr is the
2135 	only top-level thr in the graph: */
2136 
2137 	thr->graph->last_sel_node = node;
2138 
2139 	if (err != DB_SUCCESS) {
2140 		thr_get_trx(thr)->error_state = err;
2141 
2142 		return(NULL);
2143 	}
2144 
2145 	return(thr);
2146 }
2147 
2148 /**********************************************************************//**
2149 Performs a fetch for a cursor.
2150 @return	query thread to run next or NULL */
2151 UNIV_INTERN
2152 que_thr_t*
2153 fetch_step(
2154 /*=======*/
2155 	que_thr_t*	thr)	/*!< in: query thread */
2156 {
2157 	sel_node_t*	sel_node;
2158 	fetch_node_t*	node;
2159 
2160 	ut_ad(thr);
2161 
2162 	node = static_cast<fetch_node_t*>(thr->run_node);
2163 	sel_node = node->cursor_def;
2164 
2165 	ut_ad(que_node_get_type(node) == QUE_NODE_FETCH);
2166 
2167 	if (thr->prev_node != que_node_get_parent(node)) {
2168 
2169 		if (sel_node->state != SEL_NODE_NO_MORE_ROWS) {
2170 
2171 			if (node->into_list) {
2172 				sel_assign_into_var_values(node->into_list,
2173 							   sel_node);
2174 			} else {
2175 				ibool ret = (*node->func->func)(
2176 					sel_node, node->func->arg);
2177 
2178 				if (!ret) {
2179 					sel_node->state
2180 						 = SEL_NODE_NO_MORE_ROWS;
2181 				}
2182 			}
2183 		}
2184 
2185 		thr->run_node = que_node_get_parent(node);
2186 
2187 		return(thr);
2188 	}
2189 
2190 	/* Make the fetch node the parent of the cursor definition for
2191 	the time of the fetch, so that execution knows to return to this
2192 	fetch node after a row has been selected or we know that there is
2193 	no row left */
2194 
2195 	sel_node->common.parent = node;
2196 
2197 	if (sel_node->state == SEL_NODE_CLOSED) {
2198 		fprintf(stderr,
2199 			"InnoDB: Error: fetch called on a closed cursor\n");
2200 
2201 		thr_get_trx(thr)->error_state = DB_ERROR;
2202 
2203 		return(NULL);
2204 	}
2205 
2206 	thr->run_node = sel_node;
2207 
2208 	return(thr);
2209 }
2210 
2211 /****************************************************************//**
2212 Sample callback function for fetch that prints each row.
2213 @return	always returns non-NULL */
2214 UNIV_INTERN
2215 void*
2216 row_fetch_print(
2217 /*============*/
2218 	void*	row,		/*!< in:  sel_node_t* */
2219 	void*	user_arg)	/*!< in:  not used */
2220 {
2221 	que_node_t*	exp;
2222 	ulint		i = 0;
2223 	sel_node_t*	node = static_cast<sel_node_t*>(row);
2224 
2225 	UT_NOT_USED(user_arg);
2226 
2227 	fprintf(stderr, "row_fetch_print: row %p\n", row);
2228 
2229 	for (exp = node->select_list;
2230 	     exp != 0;
2231 	     exp = que_node_get_next(exp), i++) {
2232 
2233 		dfield_t*	dfield = que_node_get_val(exp);
2234 		const dtype_t*	type = dfield_get_type(dfield);
2235 
2236 		fprintf(stderr, " column %lu:\n", (ulong) i);
2237 
2238 		dtype_print(type);
2239 		putc('\n', stderr);
2240 
2241 		if (dfield_get_len(dfield) != UNIV_SQL_NULL) {
2242 			ut_print_buf(stderr, dfield_get_data(dfield),
2243 				     dfield_get_len(dfield));
2244 			putc('\n', stderr);
2245 		} else {
2246 			fputs(" <NULL>;\n", stderr);
2247 		}
2248 	}
2249 
2250 	return((void*)42);
2251 }
2252 
2253 /***********************************************************//**
2254 Prints a row in a select result.
2255 @return	query thread to run next or NULL */
2256 UNIV_INTERN
2257 que_thr_t*
2258 row_printf_step(
2259 /*============*/
2260 	que_thr_t*	thr)	/*!< in: query thread */
2261 {
2262 	row_printf_node_t*	node;
2263 	sel_node_t*		sel_node;
2264 	que_node_t*		arg;
2265 
2266 	ut_ad(thr);
2267 
2268 	node = static_cast<row_printf_node_t*>(thr->run_node);
2269 
2270 	sel_node = node->sel_node;
2271 
2272 	ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF);
2273 
2274 	if (thr->prev_node == que_node_get_parent(node)) {
2275 
2276 		/* Reset the cursor */
2277 		sel_node->state = SEL_NODE_OPEN;
2278 
2279 		/* Fetch next row to print */
2280 
2281 		thr->run_node = sel_node;
2282 
2283 		return(thr);
2284 	}
2285 
2286 	if (sel_node->state != SEL_NODE_FETCH) {
2287 
2288 		ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
2289 
2290 		/* No more rows to print */
2291 
2292 		thr->run_node = que_node_get_parent(node);
2293 
2294 		return(thr);
2295 	}
2296 
2297 	arg = sel_node->select_list;
2298 
2299 	while (arg) {
2300 		dfield_print_also_hex(que_node_get_val(arg));
2301 
2302 		fputs(" ::: ", stderr);
2303 
2304 		arg = que_node_get_next(arg);
2305 	}
2306 
2307 	putc('\n', stderr);
2308 
2309 	/* Fetch next row to print */
2310 
2311 	thr->run_node = sel_node;
2312 
2313 	return(thr);
2314 }
2315 
2316 /****************************************************************//**
2317 Converts a key value stored in MySQL format to an Innobase dtuple. The last
2318 field of the key value may be just a prefix of a fixed length field: hence
2319 the parameter key_len. But currently we do not allow search keys where the
2320 last field is only a prefix of the full key field length, and we print a
2321 warning if such a key appears. A counterpart of this function is
2322 ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
2323 UNIV_INTERN
2324 void
2325 row_sel_convert_mysql_key_to_innobase(
2326 /*==================================*/
2327 	dtuple_t*	tuple,		/*!< in/out: tuple where to build;
2328 					NOTE: we assume that the type info
2329 					in the tuple is already according
2330 					to index! */
2331 	byte*		buf,		/*!< in: buffer to use in field
2332 					conversions; NOTE that dtuple->data
2333 					may end up pointing inside buf so
2334 					do not discard that buffer while
2335 					the tuple is being used. See
2336 					row_mysql_store_col_in_innobase_format()
2337 					in the case of DATA_INT */
2338 	ulint		buf_len,	/*!< in: buffer length */
2339 	dict_index_t*	index,		/*!< in: index of the key value */
2340 	const byte*	key_ptr,	/*!< in: MySQL key value */
2341 	ulint		key_len,	/*!< in: MySQL key value length */
2342 	trx_t*		trx)		/*!< in: transaction */
2343 {
2344 	byte*		original_buf	= buf;
2345 	const byte*	original_key_ptr = key_ptr;
2346 	dict_field_t*	field;
2347 	dfield_t*	dfield;
2348 	ulint		data_offset;
2349 	ulint		data_len;
2350 	ulint		data_field_len;
2351 	ibool		is_null;
2352 	const byte*	key_end;
2353 	ulint		n_fields = 0;
2354 
2355 	/* For documentation of the key value storage format in MySQL, see
2356 	ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
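	/* Roughly, each key part in that buffer is: an optional 1-byte
	SQL NULL marker (only for nullable columns; nonzero means NULL),
	then for BLOB/TEXT prefixes and true VARCHARs a 2-byte
	little-endian length, and then the column value padded out to its
	maximum (or prefix) length. The loop below decodes the parts one
	by one. */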
2357 
2358 	key_end = key_ptr + key_len;
2359 
2360 	/* Permit us to access any field in the tuple (ULINT_MAX): */
2361 
2362 	dtuple_set_n_fields(tuple, ULINT_MAX);
2363 
2364 	dfield = dtuple_get_nth_field(tuple, 0);
2365 	field = dict_index_get_nth_field(index, 0);
2366 
2367 	if (UNIV_UNLIKELY(dfield_get_type(dfield)->mtype == DATA_SYS)) {
2368 		/* A special case: we are looking for a position in the
2369 		generated clustered index which InnoDB automatically added
2370 		to a table with no primary key: the first and the only
2371 		ordering column is ROW_ID which InnoDB stored to the key_ptr
2372 		buffer. */
2373 
2374 		ut_a(key_len == DATA_ROW_ID_LEN);
2375 
2376 		dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN);
2377 
2378 		dtuple_set_n_fields(tuple, 1);
2379 
2380 		return;
2381 	}
2382 
2383 	while (key_ptr < key_end) {
2384 
2385 		ulint	type = dfield_get_type(dfield)->mtype;
2386 		ut_a(field->col->mtype == type);
2387 
2388 		data_offset = 0;
2389 		is_null = FALSE;
2390 
2391 		if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) {
2392 			/* The first byte in the field tells if this is
2393 			an SQL NULL value */
2394 
2395 			data_offset = 1;
2396 
2397 			if (*key_ptr != 0) {
2398 				dfield_set_null(dfield);
2399 
2400 				is_null = TRUE;
2401 			}
2402 		}
2403 
2404 		/* Calculate data length and data field total length */
2405 
2406 		if (type == DATA_BLOB) {
2407 			/* The key field is a column prefix of a BLOB or
2408 			TEXT */
2409 
2410 			ut_a(field->prefix_len > 0);
2411 
2412 			/* MySQL stores the actual data length to the first 2
2413 			bytes after the optional SQL NULL marker byte. The
2414 			storage format is little-endian, that is, the most
2415 			significant byte at a higher address. In UTF-8, MySQL
2416 			seems to reserve field->prefix_len bytes for
2417 			storing this field in the key value buffer, even
2418 			though the actual value only takes data_len bytes
2419 			from the start. */
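			/* For instance, a 255-byte prefix index on a TEXT
			column occupies 2 length bytes plus 255 reserved
			bytes here (plus the NULL marker byte if the column
			is nullable), even when the actual value is
			shorter. */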
2420 
2421 			data_len = key_ptr[data_offset]
2422 				+ 256 * key_ptr[data_offset + 1];
2423 			data_field_len = data_offset + 2 + field->prefix_len;
2424 
2425 			data_offset += 2;
2426 
2427 			/* Now that we know the length, we store the column
2428 			value like it would be a fixed char field */
2429 
2430 		} else if (field->prefix_len > 0) {
2431 			/* Looks like MySQL pads unused end bytes in the
2432 			prefix with space. Therefore, also in UTF-8, it is ok
2433 			to compare with a prefix containing full prefix_len
2434 			bytes, and no need to take at most prefix_len / 3
2435 			UTF-8 characters from the start.
2436 			If the prefix is used as the upper end of a LIKE
2437 			'abc%' query, then MySQL pads the end with chars
2438 			0xff. TODO: in that case, does it do any harm to compare
2439 			with the full prefix_len bytes? How do characters
2440 			0xff in UTF-8 behave? */
2441 
2442 			data_len = field->prefix_len;
2443 			data_field_len = data_offset + data_len;
2444 		} else {
2445 			data_len = dfield_get_type(dfield)->len;
2446 			data_field_len = data_offset + data_len;
2447 		}
2448 
2449 		if (UNIV_UNLIKELY
2450 		    (dtype_get_mysql_type(dfield_get_type(dfield))
2451 		     == DATA_MYSQL_TRUE_VARCHAR)
2452 		    && UNIV_LIKELY(type != DATA_INT)) {
2453 			/* In a MySQL key value format, a true VARCHAR is
2454 			always preceded by 2 bytes of a length field.
2455 			dfield_get_type(dfield)->len returns the maximum
2456 			'payload' len in bytes. That does not include the
2457 			2 bytes that tell the actual data length.
2458 
2459 			We added the check != DATA_INT to make sure we do
2460 			not treat MySQL ENUM or SET as a true VARCHAR! */
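			/* For example, a VARCHAR(100) in a single-byte
			charset contributes 2 length bytes plus 100 payload
			bytes, so data_field_len becomes 102 (plus a
			possible NULL marker byte). */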
2461 
2462 			data_len += 2;
2463 			data_field_len += 2;
2464 		}
2465 
2466 		/* Storing may use at most data_len bytes of buf */
2467 
2468 		if (UNIV_LIKELY(!is_null)) {
2469 			buf = row_mysql_store_col_in_innobase_format(
2470 					dfield, buf,
2471 					FALSE, /* MySQL key value format col */
2472 					key_ptr + data_offset, data_len,
2473 					dict_table_is_comp(index->table));
2474 			ut_a(buf <= original_buf + buf_len);
2475 		}
2476 
2477 		key_ptr += data_field_len;
2478 
2479 		if (UNIV_UNLIKELY(key_ptr > key_end)) {
2480 			/* The last field in key was not a complete key field
2481 			but a prefix of it.
2482 
2483 			Print a warning about this! HA_READ_PREFIX_LAST does
2484 			not currently work in InnoDB with partial-field key
2485 			value prefixes. Since MySQL currently uses a padding
2486 			trick to calculate LIKE 'abc%' type queries there
2487 			should never be partial-field prefixes in searches. */
2488 
2489 			ut_print_timestamp(stderr);
2490 
2491 			fputs("  InnoDB: Warning: using a partial-field"
2492 			      " key prefix in search.\n"
2493 			      "InnoDB: ", stderr);
2494 			dict_index_name_print(stderr, trx, index);
2495 			fprintf(stderr, ". Last data field length %lu bytes,\n"
2496 				"InnoDB: key ptr now exceeds"
2497 				" key end by %lu bytes.\n"
2498 				"InnoDB: Key value in the MySQL format:\n",
2499 				(ulong) data_field_len,
2500 				(ulong) (key_ptr - key_end));
2501 			fflush(stderr);
2502 			ut_print_buf(stderr, original_key_ptr, key_len);
2503 			putc('\n', stderr);
2504 
2505 			if (!is_null) {
2506 				ulint	len = dfield_get_len(dfield);
2507 				dfield_set_len(dfield, len
2508 					       - (ulint) (key_ptr - key_end));
2509 			}
2510 			ut_ad(0);
2511 		}
2512 
2513 		n_fields++;
2514 		field++;
2515 		dfield++;
2516 	}
2517 
2518 	ut_a(buf <= original_buf + buf_len);
2519 
2520 	/* We set the length of tuple to n_fields: we assume that the memory
2521 	area allocated for it is big enough (usually bigger than n_fields). */
2522 
2523 	dtuple_set_n_fields(tuple, n_fields);
2524 }
2525 
2526 /**************************************************************//**
2527 Stores the row id to the prebuilt struct. */
2528 static
2529 void
2530 row_sel_store_row_id_to_prebuilt(
2531 /*=============================*/
2532 	row_prebuilt_t*		prebuilt,	/*!< in/out: prebuilt */
2533 	const rec_t*		index_rec,	/*!< in: record */
2534 	const dict_index_t*	index,		/*!< in: index of the record */
2535 	const ulint*		offsets)	/*!< in: rec_get_offsets
2536 						(index_rec, index) */
2537 {
2538 	const byte*	data;
2539 	ulint		len;
2540 
2541 	ut_ad(rec_offs_validate(index_rec, index, offsets));
2542 
2543 	data = rec_get_nth_field(
2544 		index_rec, offsets,
2545 		dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len);
2546 
2547 	if (UNIV_UNLIKELY(len != DATA_ROW_ID_LEN)) {
2548 		fprintf(stderr,
2549 			"InnoDB: Error: Row id field is"
2550 			" wrong length %lu in ", (ulong) len);
2551 		dict_index_name_print(stderr, prebuilt->trx, index);
2552 		fprintf(stderr, "\n"
2553 			"InnoDB: Field number %lu, record:\n",
2554 			(ulong) dict_index_get_sys_col_pos(index,
2555 							   DATA_ROW_ID));
2556 		rec_print_new(stderr, index_rec, offsets);
2557 		putc('\n', stderr);
2558 		ut_error;
2559 	}
2560 
2561 	ut_memcpy(prebuilt->row_id, data, len);
2562 }
2563 
2564 #ifdef UNIV_DEBUG
2565 /** Convert a non-SQL-NULL field from Innobase format to MySQL format. */
2566 # define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len,sec) \
2567 	row_sel_field_store_in_mysql_format_func(dest,templ,idx,field,src,len,sec)
2568 #else /* UNIV_DEBUG */
2569 /** Convert a non-SQL-NULL field from Innobase format to MySQL format. */
2570 # define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len,sec) \
2571 	row_sel_field_store_in_mysql_format_func(dest,templ,src,len,sec)
2572 #endif /* UNIV_DEBUG */
2573 
2574 /** Stores a non-SQL-NULL field in the MySQL format. The counterpart of this
2575 function is row_mysql_store_col_in_innobase_format() in row0mysql.cc.
2576 @param[in,out]	dest		buffer where to store; NOTE
2577 				that BLOBs are not in themselves stored
2578 				here: the caller must allocate and copy
2579 				the BLOB into buffer before, and pass
2580 				the pointer to the BLOB in 'data'
2581 @param[in]	templ		MySQL column template. Its following fields
2582 				are referenced: type, is_unsigned, mysql_col_len,
2583 				mbminlen, mbmaxlen
2584 @param[in]	index		InnoDB index
2585 @param[in]	field_no	templ->rec_field_no or templ->clust_rec_field_no
2586 				or templ->icp_rec_field_no
2587 @param[in]	data		data to store
2588 @param[in]	len		length of the data
2589 @param[in]	sec_field	secondary index field no if the secondary index
2590 				record but the prebuilt template is in
2591 				clustered index format and used only for end
2592 				range comparison. */
2593 static MY_ATTRIBUTE((nonnull))
2594 void
2595 row_sel_field_store_in_mysql_format_func(
2596 	byte*		dest,
2597 	const mysql_row_templ_t* templ,
2598 #ifdef UNIV_DEBUG
2599 	const dict_index_t* index,
2600 	ulint		field_no,
2601 #endif /* UNIV_DEBUG */
2602 	const byte*	data,
2603 	ulint		len,
2604 	ulint		sec_field)
2605 {
2606 	byte*			ptr;
2607 #ifdef UNIV_DEBUG
2608 	const dict_field_t*	field
2609 		= dict_index_get_nth_field(index, field_no);
2610 	bool	clust_templ_for_sec = (sec_field != ULINT_UNDEFINED);
2611 #endif /* UNIV_DEBUG */
2612 
2613 	ut_ad(len != UNIV_SQL_NULL);
2614 	UNIV_MEM_ASSERT_RW(data, len);
2615 	UNIV_MEM_ASSERT_W(dest, templ->mysql_col_len);
2616 	UNIV_MEM_INVALID(dest, templ->mysql_col_len);
2617 
2618 	switch (templ->type) {
2619 		const byte*	field_end;
2620 		byte*		pad;
2621 	case DATA_INT:
2622 		/* Convert integer data from Innobase to a little-endian
2623 		format, sign bit restored to normal */
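		/* InnoDB stores integers big-endian with the sign bit
		inverted so that memcmp() ordering is correct. E.g. the
		32-bit value 5 is stored as 80 00 00 05; reversing the bytes
		gives 05 00 00 80, and flipping the top bit of the last byte
		yields 05 00 00 00, i.e. 5 in the little-endian format that
		MySQL expects. */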
2624 
2625 		ptr = dest + len;
2626 
2627 		for (;;) {
2628 			ptr--;
2629 			*ptr = *data;
2630 			if (ptr == dest) {
2631 				break;
2632 			}
2633 			data++;
2634 		}
2635 
2636 		if (!templ->is_unsigned) {
2637 			dest[len - 1] = (byte) (dest[len - 1] ^ 128);
2638 		}
2639 
2640 		ut_ad(templ->mysql_col_len == len);
2641 		break;
2642 
2643 	case DATA_VARCHAR:
2644 	case DATA_VARMYSQL:
2645 	case DATA_BINARY:
2646 		field_end = dest + templ->mysql_col_len;
2647 
2648 		if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
2649 			/* This is a >= 5.0.3 type true VARCHAR. Store the
2650 			length of the data to the first byte or the first
2651 			two bytes of dest. */
2652 
2653 			dest = row_mysql_store_true_var_len(
2654 				dest, len, templ->mysql_length_bytes);
2655 			/* Copy the actual data. Leave the rest of the
2656 			buffer uninitialized. */
2657 			memcpy(dest, data, len);
2658 			break;
2659 		}
2660 
2661 		/* Copy the actual data */
2662 		ut_memcpy(dest, data, len);
2663 
2664 		/* Pad with trailing spaces. */
2665 
2666 		pad = dest + len;
2667 
2668 		ut_ad(templ->mbminlen <= templ->mbmaxlen);
2669 
2670 		/* We treat some Unicode charset strings specially. */
2671 		switch (templ->mbminlen) {
2672 		case 4:
2673 			/* InnoDB should never have stripped partial
2674 			UTF-32 characters. */
2675 			ut_a(!(len & 3));
2676 			break;
2677 		case 2:
2678 			/* A space char is two bytes,
2679 			0x0020 in UCS2 and UTF-16 */
2680 
2681 			if (UNIV_UNLIKELY(len & 1)) {
2682 				/* A 0x20 has been stripped from the column.
2683 				Pad it back. */
2684 
2685 				if (pad < field_end) {
2686 					*pad++ = 0x20;
2687 				}
2688 			}
2689 		}
2690 
2691 		row_mysql_pad_col(templ->mbminlen, pad, field_end - pad);
2692 		break;
2693 
2694 	case DATA_BLOB:
2695 		/* Store a pointer to the BLOB buffer to dest: the BLOB was
2696 		already copied to the buffer in row_sel_store_mysql_rec */
2697 
2698 		row_mysql_store_blob_ref(dest, templ->mysql_col_len, data,
2699 					 len);
2700 		break;
2701 
2702 	case DATA_MYSQL:
2703 		memcpy(dest, data, len);
2704 
2705 		ut_ad(templ->mysql_col_len >= len);
2706 		ut_ad(templ->mbmaxlen >= templ->mbminlen);
2707 
2708 		/* If field_no equals templ->icp_rec_field_no,
2709 		we are examining a row pointed to by "icp_rec_field_no".
2710 		There is a possibility that icp_rec_field_no refers to
2711 		a field in a secondary index while templ->rec_field_no
2712 		points to a field in the primary index. The lengths
2713 		should still be equal, unless the field pointed to
2714 		by icp_rec_field_no has a prefix */
2715 		ut_ad(templ->mbmaxlen > templ->mbminlen
2716 		      || templ->mysql_col_len == len
2717 		      || (field_no == templ->icp_rec_field_no
2718 			  && field->prefix_len > 0));
2719 
2720 		/* The following assertion would fail for old tables
2721 		containing UTF-8 ENUM columns due to Bug #9526. */
2722 		ut_ad(!templ->mbmaxlen
2723 		      || !(templ->mysql_col_len % templ->mbmaxlen));
2724 		ut_ad(clust_templ_for_sec
2725 		      || len * templ->mbmaxlen >= templ->mysql_col_len
2726 		      || (field_no == templ->icp_rec_field_no
2727 			  && field->prefix_len > 0));
2728 		ut_ad(!(field->prefix_len % templ->mbmaxlen));
2729 
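		/* Example: a CHAR(10) column in utf8 has mbminlen 1,
		mbmaxlen 3 and mysql_col_len 30, but InnoDB may have stored
		only the trimmed value; the memset below pads the rest of
		the MySQL buffer with 0x20 spaces. */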
2730 		if (templ->mbminlen == 1 && templ->mbmaxlen != 1) {
2731 			/* Pad with spaces. This undoes the stripping
2732 			done in row0mysql.cc, function
2733 			row_mysql_store_col_in_innobase_format(). */
2734 
2735 			memset(dest + len, 0x20, templ->mysql_col_len - len);
2736 		}
2737 		break;
2738 
2739 	default:
2740 #ifdef UNIV_DEBUG
2741 	case DATA_SYS_CHILD:
2742 	case DATA_SYS:
2743 		/* These column types should never be shipped to MySQL. */
2744 		ut_ad(0);
2745 
2746 	case DATA_CHAR:
2747 	case DATA_FIXBINARY:
2748 	case DATA_FLOAT:
2749 	case DATA_DOUBLE:
2750 	case DATA_DECIMAL:
2751 		/* Above are the valid column types for MySQL data. */
2752 #endif /* UNIV_DEBUG */
2753 		/* If sec_field value is present then mapping of
2754 		secondary index records to clustered index template
2755 		happens for end range comparison. So length can
2756 		vary according to secondary index record length. */
2757 		ut_ad(field->prefix_len
2758 		      ? field->prefix_len == len
2759 		      : (clust_templ_for_sec ?
2760 				1 : (templ->mysql_col_len == len)));
2761 		memcpy(dest, data, len);
2762 	}
2763 }
2764 
2765 #ifdef UNIV_DEBUG
2766 /** Convert a field from Innobase format to MySQL format. */
2767 # define row_sel_store_mysql_field(m,p,r,i,o,f,t,s) \
2768 	row_sel_store_mysql_field_func(m,p,r,i,o,f,t,s)
2769 #else /* UNIV_DEBUG */
2770 /** Convert a field from Innobase format to MySQL format. */
2771 # define row_sel_store_mysql_field(m,p,r,i,o,f,t,s) \
2772 	row_sel_store_mysql_field_func(m,p,r,o,f,t,s)
2773 #endif /* UNIV_DEBUG */
2774 /** Convert a field in the Innobase format to a field in the MySQL format.
2775 @param[out]	mysql_rec		record in the MySQL format
2776 @param[in,out]	prebuilt		prebuilt struct
2777 @param[in]	rec			InnoDB record; must be protected
2778 					by a page latch
2779 @param[in]	index			index of rec
2780 @param[in]	offsets			array returned by rec_get_offsets()
2781 @param[in]	field_no		templ->rec_field_no or
2782 					templ->clust_rec_field_no
2783 					or templ->icp_rec_field_no
2784 					or sec field no if clust_templ_for_sec
2785 					is TRUE
2786 @param[in]	templ			row template
2787 @param[in]	sec_field_no		field_no if rec belongs to secondary index
2788 					but prebuilt template is in clustered
2789 					index format and used only for end
2790 					range comparison. */
2791 static MY_ATTRIBUTE((warn_unused_result))
2792 ibool
2793 row_sel_store_mysql_field_func(
2794 	byte*			mysql_rec,
2795 	row_prebuilt_t*		prebuilt,
2796 	const rec_t*		rec,
2797 #ifdef UNIV_DEBUG
2798 	const dict_index_t*	index,
2799 #endif
2800 	const ulint*		offsets,
2801 	ulint			field_no,
2802 	const mysql_row_templ_t*templ,
2803 	ulint			sec_field_no)
2804 {
2805 	const byte*	data;
2806 	ulint		len;
2807 	ulint		clust_field_no = 0;
2808 	bool		clust_templ_for_sec = (sec_field_no != ULINT_UNDEFINED);
2809 
2810 	ut_ad(prebuilt->default_rec);
2811 	ut_ad(templ);
2812 	ut_ad(templ >= prebuilt->mysql_template);
2813 	ut_ad(templ < &prebuilt->mysql_template[prebuilt->n_template]);
2814 	ut_ad(clust_templ_for_sec
2815 	      || field_no == templ->clust_rec_field_no
2816 	      || field_no == templ->rec_field_no
2817 	      || field_no == templ->icp_rec_field_no);
2818 	ut_ad(rec_offs_validate(rec,
2819 		clust_templ_for_sec ? prebuilt->index : index, offsets));
2820 
2821 	/* If sec_field_no is present, then extract the data from the record
2822 	using the secondary index field no. */
2823 	if (clust_templ_for_sec) {
2824 		clust_field_no = field_no;
2825 		field_no = sec_field_no;
2826 	}
2827 
2828 	if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets, field_no))) {
2829 
2830 		mem_heap_t*	heap;
2831 		/* Copy an externally stored field to a temporary heap */
2832 
2833 		ut_a(!prebuilt->trx->has_search_latch);
2834 		ut_ad(field_no == templ->clust_rec_field_no);
2835 
2836 		if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) {
2837 			if (prebuilt->blob_heap == NULL) {
2838 				prebuilt->blob_heap = mem_heap_create(
2839 					UNIV_PAGE_SIZE);
2840 			}
2841 
2842 			heap = prebuilt->blob_heap;
2843 		} else {
2844 			heap = mem_heap_create(UNIV_PAGE_SIZE);
2845 		}
2846 
2847 		/* NOTE: if we are retrieving a big BLOB, we may
2848 		already run out of memory in the next call, which
2849 		causes an assert */
2850 
2851 		data = btr_rec_copy_externally_stored_field(
2852 			rec, offsets,
2853 			dict_table_zip_size(prebuilt->table),
2854 			field_no, &len, heap);
2855 
2856 		if (UNIV_UNLIKELY(!data)) {
2857 			/* The externally stored field was not written
2858 			yet. This record should only be seen by
2859 			recv_recovery_rollback_active() or any
2860 			TRX_ISO_READ_UNCOMMITTED transactions. */
2861 
2862 			if (heap != prebuilt->blob_heap) {
2863 				mem_heap_free(heap);
2864 			}
2865 
2866 			ut_a(prebuilt->trx->isolation_level
2867 			     == TRX_ISO_READ_UNCOMMITTED);
2868 			return(FALSE);
2869 		}
2870 
2871 		ut_a(len != UNIV_SQL_NULL);
2872 
2873 		row_sel_field_store_in_mysql_format(
2874 			mysql_rec + templ->mysql_col_offset,
2875 			templ, index, field_no, data, len,
2876 			ULINT_UNDEFINED);
2877 
2878 		if (heap != prebuilt->blob_heap) {
2879 			mem_heap_free(heap);
2880 		}
2881 	} else {
2882 		/* Field is stored in the row. */
2883 
2884 		data = rec_get_nth_field(rec, offsets, field_no, &len);
2885 
2886 		if (len == UNIV_SQL_NULL) {
2887 			/* MySQL assumes that the field for an SQL
2888 			NULL value is set to the default value. */
2889 			ut_ad(templ->mysql_null_bit_mask);
2890 
2891 			UNIV_MEM_ASSERT_RW(prebuilt->default_rec
2892 					   + templ->mysql_col_offset,
2893 					   templ->mysql_col_len);
2894 			mysql_rec[templ->mysql_null_byte_offset]
2895 				|= (byte) templ->mysql_null_bit_mask;
2896 			memcpy(mysql_rec + templ->mysql_col_offset,
2897 			       (const byte*) prebuilt->default_rec
2898 			       + templ->mysql_col_offset,
2899 			       templ->mysql_col_len);
2900 			return(TRUE);
2901 		}
2902 
2903 		if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) {
2904 
2905 			/* It is a BLOB field locally stored in the
2906 			InnoDB record: we MUST copy its contents to
2907 			prebuilt->blob_heap here because
2908 			row_sel_field_store_in_mysql_format() stores a
2909 			pointer to the data, and the data passed to us
2910 			will be invalid as soon as the
2911 			mini-transaction is committed and the page
2912 			latch on the clustered index page is
2913 			released. */
2914 
2915 			if (prebuilt->blob_heap == NULL) {
2916 				prebuilt->blob_heap = mem_heap_create(
2917 					UNIV_PAGE_SIZE);
2918 			}
2919 
2920 			data = static_cast<byte*>(
2921 				mem_heap_dup(prebuilt->blob_heap, data, len));
2922 		}
2923 
2924 		/* Reassign the clustered index field no. */
2925 		if (clust_templ_for_sec) {
2926 			field_no = clust_field_no;
2927 		}
2928 
2929 		row_sel_field_store_in_mysql_format(
2930 			mysql_rec + templ->mysql_col_offset,
2931 			templ, index, field_no, data, len, sec_field_no);
2932 	}
2933 
2934 	ut_ad(len != UNIV_SQL_NULL);
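	/* Each nullable column owns one bit in the null bitmap at the
	start of the MySQL row buffer; templ->mysql_null_byte_offset and
	templ->mysql_null_bit_mask locate that bit. It is set above for
	SQL NULL values and cleared below for stored values. */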
2935 
2936 	if (templ->mysql_null_bit_mask) {
2937 		/* It is a nullable column with a non-NULL
2938 		value */
2939 		mysql_rec[templ->mysql_null_byte_offset]
2940 			&= ~(byte) templ->mysql_null_bit_mask;
2941 	}
2942 
2943 	return(TRUE);
2944 }
2945 
2946 /** Convert a row in the Innobase format to a row in the MySQL format.
2947 Note that the template in prebuilt may advise us to copy only a few
2948 columns to mysql_rec; the other columns are left blank. Not all columns
2949 may be needed in the query.
2950 @param[out]	mysql_rec		row in the MySQL format
2951 @param[in]	prebuilt		prebuilt structure
2952 @param[in]	rec			Innobase record in the index
2953 					which was described in prebuilt's
2954 					template, or in the clustered index;
2955 					must be protected by a page latch
2956 @param[in]	rec_clust		TRUE if the rec in the clustered index
2957 @param[in]	index			index of rec
2958 @param[in]	offsets			array returned by rec_get_offsets(rec)
2959 @param[in]	clust_templ_for_sec	TRUE if rec belongs to secondary index
2960 					but the prebuilt->template is in
2961 					clustered index format and it is
2962 					used only for end range comparison
2963 @return TRUE on success, FALSE if not all columns could be retrieved */
2964 static MY_ATTRIBUTE((warn_unused_result))
2965 ibool
2966 row_sel_store_mysql_rec(
2967 	byte*		mysql_rec,
2968 	row_prebuilt_t*	prebuilt,
2969 	const rec_t*	rec,
2970 	ibool		rec_clust,
2971 	const dict_index_t* index,
2972 	const ulint*	offsets,
2973 	bool		clust_templ_for_sec)
2974 {
2975 	ulint				i;
2976 	std::vector<const dict_col_t*>	template_col;
2977 
2978 	ut_ad(rec_clust || index == prebuilt->index);
2979 	ut_ad(!rec_clust || dict_index_is_clust(index));
2980 
2981 	if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
2982 		mem_heap_free(prebuilt->blob_heap);
2983 		prebuilt->blob_heap = NULL;
2984 	}
2985 
2986 	if (clust_templ_for_sec) {
2987 		/* Store all clustered index columns of the
2988 		secondary index record. */
2989 		for (i = 0; i < dict_index_get_n_fields(
2990 				prebuilt->index); i++) {
2991 			ulint   sec_field = dict_index_get_nth_field_pos(
2992 				index, prebuilt->index, i);
2993 
2994 			if (sec_field == ULINT_UNDEFINED) {
2995 				template_col.push_back(NULL);
2996 				continue;
2997 			}
2998 
2999 			const dict_field_t*	field =
3000 				dict_index_get_nth_field(index, sec_field);
3001 			const dict_col_t*	col =
3002 				dict_field_get_col(field);
3003 
3004 			template_col.push_back(col);
3005 		}
3006 	}
3007 
3008 	for (i = 0; i < prebuilt->n_template; i++) {
3009 		const mysql_row_templ_t*templ = &prebuilt->mysql_template[i];
3010 		ulint		field_no
3011 			= rec_clust
3012 			? templ->clust_rec_field_no
3013 			: templ->rec_field_no;
3014 		ulint		sec_field_no = ULINT_UNDEFINED;
3015 
3016 		/* We should never deliver column prefixes to MySQL,
3017 		except for evaluating innobase_index_cond(). */
3018 		ut_ad(dict_index_get_nth_field(index, field_no)->prefix_len
3019 		      == 0);
3020 
3021 		if (clust_templ_for_sec) {
3022 			std::vector<const dict_col_t*>::iterator it;
3023 			const dict_field_t*	field =
3024 					dict_index_get_nth_field(index, field_no);
3025 			const dict_col_t*	col = dict_field_get_col(
3026 								field);
3027 			it = std::find(template_col.begin(),
3028 				       template_col.end(), col);
3029 
3030 			if (it == template_col.end()) {
3031 				continue;
3032 			}
3033 
3034 			ut_ad(templ->rec_field_no == templ->clust_rec_field_no);
3035 
3036 			sec_field_no = it - template_col.begin();
3037 		}
3038 
3039 		if (!row_sel_store_mysql_field(mysql_rec, prebuilt,
3040 					       rec, index, offsets,
3041 					       field_no, templ,
3042 					       sec_field_no)) {
3043 			return(FALSE);
3044 		}
3045 	}
3046 
3047 	/* FIXME: We only need to read the doc_id if an FTS indexed
3048 	column is being updated.
3049 	NOTE: the record must be a clustered index record. A secondary index
3050 	might not have the Doc ID */
3051 	if (dict_table_has_fts_index(prebuilt->table)
3052 	    && dict_index_is_clust(index)
3053 	    && !clust_templ_for_sec) {
3054 
3055 		prebuilt->fts_doc_id = fts_get_doc_id_from_rec(
3056 			prebuilt->table, rec, NULL);
3057 	}
3058 
3059 	return(TRUE);
3060 }
3061 
3062 /*********************************************************************//**
3063 Builds a previous version of a clustered index record for a consistent read
3064 @return	DB_SUCCESS or error code */
3065 static MY_ATTRIBUTE((nonnull, warn_unused_result))
3066 dberr_t
3067 row_sel_build_prev_vers_for_mysql(
3068 /*==============================*/
3069 	read_view_t*	read_view,	/*!< in: read view */
3070 	dict_index_t*	clust_index,	/*!< in: clustered index */
3071 	row_prebuilt_t*	prebuilt,	/*!< in: prebuilt struct */
3072 	const rec_t*	rec,		/*!< in: record in a clustered index */
3073 	ulint**		offsets,	/*!< in/out: offsets returned by
3074 					rec_get_offsets(rec, clust_index) */
3075 	mem_heap_t**	offset_heap,	/*!< in/out: memory heap from which
3076 					the offsets are allocated */
3077 	rec_t**		old_vers,	/*!< out: old version, or NULL if the
3078 					record does not exist in the view:
3079 					i.e., it was freshly inserted
3080 					afterwards */
3081 	mtr_t*		mtr)		/*!< in: mtr */
3082 {
3083 	dberr_t	err;
3084 
3085 	if (prebuilt->old_vers_heap) {
3086 		mem_heap_empty(prebuilt->old_vers_heap);
3087 	} else {
3088 		prebuilt->old_vers_heap = mem_heap_create(200);
3089 	}
3090 
3091 	err = row_vers_build_for_consistent_read(
3092 		rec, mtr, clust_index, offsets, read_view, offset_heap,
3093 		prebuilt->old_vers_heap, old_vers);
3094 	return(err);
3095 }
3096 
3097 /*********************************************************************//**
3098 Retrieves the clustered index record corresponding to a record in a
3099 non-clustered index. Does the necessary locking. Used in the MySQL
3100 interface.
3101 @return	DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
3102 static MY_ATTRIBUTE((nonnull, warn_unused_result))
3103 dberr_t
3104 row_sel_get_clust_rec_for_mysql(
3105 /*============================*/
3106 	row_prebuilt_t*	prebuilt,/*!< in: prebuilt struct in the handle */
3107 	dict_index_t*	sec_index,/*!< in: secondary index where rec resides */
3108 	const rec_t*	rec,	/*!< in: record in a non-clustered index; if
3109 				this is a locking read, then rec is not
3110 				allowed to be delete-marked, and that would
3111 				not make sense either */
3112 	que_thr_t*	thr,	/*!< in: query thread */
3113 	const rec_t**	out_rec,/*!< out: clustered record or an old version of
3114 				it, NULL if the old version did not exist
3115 				in the read view, i.e., it was a fresh
3116 				inserted version */
3117 	ulint**		offsets,/*!< in: offsets returned by
3118 				rec_get_offsets(rec, sec_index);
3119 				out: offsets returned by
3120 				rec_get_offsets(out_rec, clust_index) */
3121 	mem_heap_t**	offset_heap,/*!< in/out: memory heap from which
3122 				the offsets are allocated */
3123 	mtr_t*		mtr)	/*!< in: mtr used to get access to the
3124 				non-clustered record; the same mtr is used to
3125 				access the clustered index */
3126 {
3127 	dict_index_t*	clust_index;
3128 	const rec_t*	clust_rec;
3129 	rec_t*		old_vers;
3130 	dberr_t		err;
3131 	trx_t*		trx;
3132 
3133 	*out_rec = NULL;
3134 	trx = thr_get_trx(thr);
3135 
3136 	row_build_row_ref_in_tuple(prebuilt->clust_ref, rec,
3137 				   sec_index, *offsets, trx);
3138 
3139 	clust_index = dict_table_get_first_index(sec_index->table);
3140 
3141 	btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref,
3142 				   PAGE_CUR_LE, BTR_SEARCH_LEAF,
3143 				   &prebuilt->clust_pcur, 0, mtr);
3144 
3145 	clust_rec = btr_pcur_get_rec(&prebuilt->clust_pcur);
3146 
3147 	prebuilt->clust_pcur.trx_if_known = trx;
3148 
3149 	/* Note: only if the search ends up on a non-infimum record is the
3150 	low_match value the real match to the search tuple */
3151 
3152 	if (!page_rec_is_user_rec(clust_rec)
3153 	    || btr_pcur_get_low_match(&prebuilt->clust_pcur)
3154 	    < dict_index_get_n_unique(clust_index)) {
3155 
3156 		/* In a rare case it is possible that no clust rec is found
3157 		for a delete-marked secondary index record: if in row0umod.cc
3158 		in row_undo_mod_remove_clust_low() we have already removed
3159 		the clust rec, while purge is still cleaning and removing
3160 		secondary index records associated with earlier versions of
3161 		the clustered index record. In that case we know that the
3162 		clustered index record did not exist in the read view of
3163 		trx. */
3164 
3165 		if (!rec_get_deleted_flag(rec,
3166 					  dict_table_is_comp(sec_index->table))
3167 		    || prebuilt->select_lock_type != LOCK_NONE) {
3168 			ut_print_timestamp(stderr);
3169 			fputs("  InnoDB: error clustered record"
3170 			      " for sec rec not found\n"
3171 			      "InnoDB: ", stderr);
3172 			dict_index_name_print(stderr, trx, sec_index);
3173 			fputs("\n"
3174 			      "InnoDB: sec index record ", stderr);
3175 			rec_print(stderr, rec, sec_index);
3176 			fputs("\n"
3177 			      "InnoDB: clust index record ", stderr);
3178 			rec_print(stderr, clust_rec, clust_index);
3179 			putc('\n', stderr);
3180 			trx_print(stderr, trx, 600);
3181 			fputs("\n"
3182 			      "InnoDB: Submit a detailed bug report"
3183 			      " to http://bugs.mysql.com\n", stderr);
3184 			ut_ad(0);
3185 		}
3186 
3187 		clust_rec = NULL;
3188 
3189 		err = DB_SUCCESS;
3190 		goto func_exit;
3191 	}
3192 
3193 	*offsets = rec_get_offsets(clust_rec, clust_index, *offsets,
3194 				   ULINT_UNDEFINED, offset_heap);
3195 
3196 	if (prebuilt->select_lock_type != LOCK_NONE) {
3197 		/* Try to place a lock on the index record; we are searching
3198 		the clust rec with a unique condition, hence
3199 		we set a LOCK_REC_NOT_GAP type lock */
3200 
3201 		err = lock_clust_rec_read_check_and_lock(
3202 			0, btr_pcur_get_block(&prebuilt->clust_pcur),
3203 			clust_rec, clust_index, *offsets,
3204 			static_cast<enum lock_mode>(prebuilt->select_lock_type),
3205 			LOCK_REC_NOT_GAP,
3206 			thr);
3207 
3208 		switch (err) {
3209 		case DB_SUCCESS:
3210 		case DB_SUCCESS_LOCKED_REC:
3211 			break;
3212 		default:
3213 			goto err_exit;
3214 		}
3215 	} else {
3216 		/* This is a non-locking consistent read: if necessary, fetch
3217 		a previous version of the record */
3218 
3219 		old_vers = NULL;
3220 
3221 		/* If the isolation level allows reading of uncommitted data,
3222 		then we never look for an earlier version */
3223 
3224 		if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
3225 		    && !lock_clust_rec_cons_read_sees(
3226 			    clust_rec, clust_index, *offsets,
3227 			    trx->read_view)) {
3228 
3229 			/* The following call returns 'offsets' associated with
3230 			'old_vers' */
3231 			err = row_sel_build_prev_vers_for_mysql(
3232 				trx->read_view, clust_index, prebuilt,
3233 				clust_rec, offsets, offset_heap, &old_vers,
3234 				mtr);
3235 
3236 			if (err != DB_SUCCESS || old_vers == NULL) {
3237 
3238 				goto err_exit;
3239 			}
3240 
3241 			clust_rec = old_vers;
3242 		}
3243 
3244 		/* If we had to go to an earlier version of row or the
3245 		secondary index record is delete marked, then it may be that
3246 		the secondary index record corresponding to clust_rec
3247 		(or old_vers) is not rec; in that case we must ignore
3248 		such row because in our snapshot rec would not have existed.
3249 		Remember that from rec we cannot see directly which transaction
3250 		id corresponds to it: we have to go to the clustered index
3251 		record. A query where we want to fetch all rows where
3252 		the secondary index value is in some interval would return
3253 		a wrong result if we would not drop rows which we come to
3254 		visit through secondary index records that would not really
3255 		exist in our snapshot. */
3256 
3257 		if (clust_rec
3258 		    && (old_vers
3259 			|| trx->isolation_level <= TRX_ISO_READ_UNCOMMITTED
3260 			|| rec_get_deleted_flag(rec, dict_table_is_comp(
3261 							sec_index->table)))
3262 		    && !row_sel_sec_rec_is_for_clust_rec(
3263 			    rec, sec_index, clust_rec, clust_index)) {
3264 			clust_rec = NULL;
3265 #ifdef UNIV_SEARCH_DEBUG
3266 		} else {
3267 			ut_a(clust_rec == NULL
3268 			     || row_sel_sec_rec_is_for_clust_rec(
3269 				     rec, sec_index, clust_rec, clust_index));
3270 #endif
3271 		}
3272 
3273 		err = DB_SUCCESS;
3274 	}
3275 
3276 func_exit:
3277 	*out_rec = clust_rec;
3278 
3279 	/* Store the current position if select_lock_type is not
3280 	LOCK_NONE or if we are scanning using InnoDB APIs */
3281 	if (prebuilt->select_lock_type != LOCK_NONE
3282 	    || prebuilt->innodb_api) {
3283 		/* We may use the cursor in update or in unlock_row():
3284 		store its position */
3285 
3286 		btr_pcur_store_position(&prebuilt->clust_pcur, mtr);
3287 	}
3288 
3289 err_exit:
3290 	return(err);
3291 }
3292 
3293 /********************************************************************//**
3294 Restores the cursor position after it has been stored. We have to take into
3295 account that the record the cursor was positioned on may have been deleted.
3296 In that case we may have to move the cursor one step up or down.
3297 @return TRUE if we may need to process the record the cursor is now
3298 positioned on (i.e. we should not go to the next record yet) */
3299 static
3300 ibool
3301 sel_restore_position_for_mysql(
3302 /*===========================*/
3303 	ibool*		same_user_rec,	/*!< out: TRUE if we were able to restore
3304 					the cursor on a user record with the
3305 					same ordering prefix in the
3306 					B-tree index */
3307 	ulint		latch_mode,	/*!< in: latch mode wished in
3308 					restoration */
3309 	btr_pcur_t*	pcur,		/*!< in: cursor whose position
3310 					has been stored */
3311 	ibool		moves_up,	/*!< in: TRUE if the cursor moves up
3312 					in the index */
3313 	mtr_t*		mtr)		/*!< in: mtr; CAUTION: may commit
3314 					mtr temporarily! */
3315 {
3316 	ibool		success;
3317 
3318 	success = btr_pcur_restore_position(latch_mode, pcur, mtr);
3319 
3320 	*same_user_rec = success;
3321 
3322 	ut_ad(!success || pcur->rel_pos == BTR_PCUR_ON);
3323 #ifdef UNIV_DEBUG
3324 	if (pcur->pos_state == BTR_PCUR_IS_POSITIONED_OPTIMISTIC) {
3325 		ut_ad(pcur->rel_pos == BTR_PCUR_BEFORE
3326 		      || pcur->rel_pos == BTR_PCUR_AFTER);
3327 	} else {
3328 		ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
3329 		ut_ad((pcur->rel_pos == BTR_PCUR_ON)
3330 		      == btr_pcur_is_on_user_rec(pcur));
3331 	}
3332 #endif
3333 
3334 	/* The position may need to be adjusted for rel_pos and moves_up. */
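	/* For example, if the position was stored as BTR_PCUR_ON but that
	record has since been purged, the restore leaves the cursor just
	before the old position; for an ascending scan we then move to the
	next record and return TRUE so that the caller processes it instead
	of skipping a row. */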
3335 
3336 	switch (pcur->rel_pos) {
3337 	case BTR_PCUR_ON:
3338 		if (!success && moves_up) {
3339 next:
3340 			btr_pcur_move_to_next(pcur, mtr);
3341 			return(TRUE);
3342 		}
3343 		return(!success);
3344 	case BTR_PCUR_AFTER_LAST_IN_TREE:
3345 	case BTR_PCUR_BEFORE_FIRST_IN_TREE:
3346 		return(TRUE);
3347 	case BTR_PCUR_AFTER:
3348 		/* positioned to record after pcur->old_rec. */
3349 		pcur->pos_state = BTR_PCUR_IS_POSITIONED;
3350 prev:
3351 		if (btr_pcur_is_on_user_rec(pcur) && !moves_up) {
3352 			btr_pcur_move_to_prev(pcur, mtr);
3353 		}
3354 		return(TRUE);
3355 	case BTR_PCUR_BEFORE:
3356 		/* For non optimistic restoration:
3357 		The position is now set to the record before pcur->old_rec.
3358 
3359 		For optimistic restoration:
3360 		The position also needs to take the previous search_mode into
3361 		consideration. */
3362 
3363 		switch (pcur->pos_state) {
3364 		case BTR_PCUR_IS_POSITIONED_OPTIMISTIC:
3365 			pcur->pos_state = BTR_PCUR_IS_POSITIONED;
3366 			if (pcur->search_mode == PAGE_CUR_GE) {
3367 				/* Positioned during Greater or Equal search
3368 				with BTR_PCUR_BEFORE. Optimistic restore to
3369 				the same record. If scanning for lower then
3370 				we must move to previous record.
3371 				This can happen with:
3372 				HANDLER READ idx a = (const);
3373 				HANDLER READ idx PREV; */
3374 				goto prev;
3375 			}
3376 			return(TRUE);
3377 		case BTR_PCUR_IS_POSITIONED:
3378 			if (moves_up && btr_pcur_is_on_user_rec(pcur)) {
3379 				goto next;
3380 			}
3381 			return(TRUE);
3382 		case BTR_PCUR_WAS_POSITIONED:
3383 		case BTR_PCUR_NOT_POSITIONED:
3384 			break;
3385 		}
3386 	}
3387 	ut_ad(0);
3388 	return(TRUE);
3389 }
3390 
3391 /********************************************************************//**
3392 Copies a cached field for MySQL from the fetch cache. */
3393 static
3394 void
3395 row_sel_copy_cached_field_for_mysql(
3396 /*================================*/
3397 	byte*			buf,	/*!< in/out: row buffer */
3398 	const byte*		cache,	/*!< in: cached row */
3399 	const mysql_row_templ_t*templ)	/*!< in: column template */
3400 {
3401 	ulint	len;
3402 
3403 	buf += templ->mysql_col_offset;
3404 	cache += templ->mysql_col_offset;
3405 
3406 	UNIV_MEM_ASSERT_W(buf, templ->mysql_col_len);
3407 
3408 	if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR
3409 	    && templ->type != DATA_INT) {
3410 		/* Check for != DATA_INT to make sure we do
3411 		not treat MySQL ENUM or SET as a true VARCHAR!
3412 		Find the actual length of the true VARCHAR field. */
3413 		row_mysql_read_true_varchar(
3414 			&len, cache, templ->mysql_length_bytes);
3415 		len += templ->mysql_length_bytes;
3416 		UNIV_MEM_INVALID(buf, templ->mysql_col_len);
3417 	} else {
3418 		len = templ->mysql_col_len;
3419 	}
3420 
3421 	ut_memcpy(buf, cache, len);
3422 }
3423 
3424 /** Copy used fields from cached row.
3425 Copy the cached record field by field; do not touch fields that
3426 are not covered by the current key.
3427 @param[out]     buf             Where to copy the MySQL row.
3428 @param[in]      cached_rec      What to copy (in MySQL row format).
3429 @param[in]      prebuilt        prebuilt struct. */
3430 void
3431 row_sel_copy_cached_fields_for_mysql(
3432         byte*           buf,
3433         const byte*     cached_rec,
3434         row_prebuilt_t* prebuilt)
3435 {
3436         const mysql_row_templ_t*templ;
3437         ulint                   i;
3438         for (i = 0; i < prebuilt->n_template; i++) {
3439                 templ = prebuilt->mysql_template + i;
3440 
3441                 row_sel_copy_cached_field_for_mysql(
3442                         buf, cached_rec, templ);
3443                 /* Copy NULL bit of the current field from cached_rec
3444                 to buf */
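                /* The bit operation below clears the target bit in buf
                and replaces it with the corresponding bit from
                cached_rec, leaving the other bits of the NULL byte
                untouched. */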
3445                 if (templ->mysql_null_bit_mask) {
3446                         buf[templ->mysql_null_byte_offset]
3447                                 ^= (buf[templ->mysql_null_byte_offset]
3448                                     ^ cached_rec[templ->mysql_null_byte_offset])
3449                                 & (byte) templ->mysql_null_bit_mask;
3450                 }
3451         }
3452 }
3453 
3454 /********************************************************************//**
3455 Pops a cached row for MySQL from the fetch cache. */
3456 UNIV_INLINE
3457 void
row_sel_dequeue_cached_row_for_mysql(
3459 /*=================================*/
3460 	byte*		buf,		/*!< in/out: buffer where to copy the
3461 					row */
3462 	row_prebuilt_t*	prebuilt)	/*!< in: prebuilt struct */
3463 {
3464 	ulint			i;
3465 	const mysql_row_templ_t*templ;
3466 	const byte*		cached_rec;
3467 	ut_ad(prebuilt->n_fetch_cached > 0);
3468 	ut_ad(prebuilt->mysql_prefix_len <= prebuilt->mysql_row_len);
3469 
3470 	UNIV_MEM_ASSERT_W(buf, prebuilt->mysql_row_len);
3471 
3472 	cached_rec = prebuilt->fetch_cache[prebuilt->fetch_cache_first];
3473 
3474 	if (UNIV_UNLIKELY(prebuilt->keep_other_fields_on_keyread)) {
3475 		/* Copy cache record field by field, don't touch fields that
3476 		are not covered by current key */
3477 
3478 		for (i = 0; i < prebuilt->n_template; i++) {
3479 			templ = prebuilt->mysql_template + i;
3480 			row_sel_copy_cached_field_for_mysql(
3481 				buf, cached_rec, templ);
3482 			/* Copy NULL bit of the current field from cached_rec
3483 			to buf */
3484 			if (templ->mysql_null_bit_mask) {
3485 				buf[templ->mysql_null_byte_offset]
3486 					^= (buf[templ->mysql_null_byte_offset]
3487 					    ^ cached_rec[templ->mysql_null_byte_offset])
3488 					& (byte) templ->mysql_null_bit_mask;
3489 			}
3490 		}
	} else if (prebuilt->mysql_prefix_len > 63) {
		/* The record is long. Copy it field by field, in case
		there are some long VARCHAR columns of which only a
		small prefix of the reserved length is actually used. */
3495 		UNIV_MEM_INVALID(buf, prebuilt->mysql_prefix_len);
3496 
3497 		/* First copy the NULL bits. */
3498 		ut_memcpy(buf, cached_rec, prebuilt->null_bitmap_len);
3499 		/* Then copy the requested fields. */
3500 
3501 		for (i = 0; i < prebuilt->n_template; i++) {
3502 			row_sel_copy_cached_field_for_mysql(
3503 				buf, cached_rec, prebuilt->mysql_template + i);
3504 		}
3505 	} else {
3506 		ut_memcpy(buf, cached_rec, prebuilt->mysql_prefix_len);
3507 	}
3508 
3509 	prebuilt->n_fetch_cached--;
3510 	prebuilt->fetch_cache_first++;
3511 
3512 	if (prebuilt->n_fetch_cached == 0) {
3513 		prebuilt->fetch_cache_first = 0;
3514 	}
3515 }
3516 
3517 /********************************************************************//**
3518 Initialise the prefetch cache. */
3519 UNIV_INLINE
3520 void
row_sel_prefetch_cache_init(
3522 /*========================*/
3523 	row_prebuilt_t*	prebuilt)	/*!< in/out: prebuilt struct */
3524 {
3525 	ulint	i;
3526 	ulint	sz;
3527 	byte*	ptr;
3528 
	/* Reserve space for two 4-byte magic numbers per row, one
	before and one after each cached row. */
	sz = UT_ARR_SIZE(prebuilt->fetch_cache) * (prebuilt->mysql_row_len + 8);
3531 	ptr = static_cast<byte*>(mem_alloc(sz));
3532 
3533 	for (i = 0; i < UT_ARR_SIZE(prebuilt->fetch_cache); i++) {
3534 
3535 		/* A user has reported memory corruption in these
3536 		buffers in Linux. Put magic numbers there to help
3537 		to track a possible bug. */
3538 
3539 		mach_write_to_4(ptr, ROW_PREBUILT_FETCH_MAGIC_N);
3540 		ptr += 4;
3541 
3542 		prebuilt->fetch_cache[i] = ptr;
3543 		ptr += prebuilt->mysql_row_len;
3544 
3545 		mach_write_to_4(ptr, ROW_PREBUILT_FETCH_MAGIC_N);
3546 		ptr += 4;
3547 	}
3548 }
3549 
3550 /********************************************************************//**
3551 Get the last fetch cache buffer from the queue.
3552 @return pointer to buffer. */
3553 UNIV_INLINE
3554 byte*
row_sel_fetch_last_buf(
3556 /*===================*/
3557 	row_prebuilt_t*	prebuilt)	/*!< in/out: prebuilt struct */
3558 {
3559 	ut_ad(!prebuilt->templ_contains_blob);
3560 	ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
3561 
3562 	if (prebuilt->fetch_cache[0] == NULL) {
3563 		/* Allocate memory for the fetch cache */
3564 		ut_ad(prebuilt->n_fetch_cached == 0);
3565 
3566 		row_sel_prefetch_cache_init(prebuilt);
3567 	}
3568 
3569 	ut_ad(prebuilt->fetch_cache_first == 0);
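	/* Return the next free slot; the caller fills it in and
	row_sel_enqueue_cache_row_for_mysql() then accounts for it by
	incrementing n_fetch_cached. */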
3570 	UNIV_MEM_INVALID(prebuilt->fetch_cache[prebuilt->n_fetch_cached],
3571 			 prebuilt->mysql_row_len);
3572 
3573 	return(prebuilt->fetch_cache[prebuilt->n_fetch_cached]);
3574 }
3575 
3576 /********************************************************************//**
3577 Pushes a row for MySQL to the fetch cache. */
3578 UNIV_INLINE
3579 void
row_sel_enqueue_cache_row_for_mysql(
3581 /*================================*/
3582 	byte*		mysql_rec,	/*!< in/out: MySQL record */
3583 	row_prebuilt_t*	prebuilt)	/*!< in/out: prebuilt struct */
3584 {
	/* For the non-ICP code path the row already exists in the
	next fetch cache slot. */
3587 
3588 	if (prebuilt->idx_cond != NULL) {
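		/* With ICP the row was built directly in the MySQL
		record buffer, so copy it into the next free cache
		slot here. */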
3589 		byte*	dest = row_sel_fetch_last_buf(prebuilt);
3590 
3591 		ut_memcpy(dest, mysql_rec, prebuilt->mysql_row_len);
3592 	}
3593 
3594 	++prebuilt->n_fetch_cached;
3595 }
3596 
/*********************************************************************//**
Tries to do a shortcut to fetch a clustered index record with a unique key,
using the adaptive hash index if possible (not always). We assume that the
search mode is PAGE_CUR_GE, that this is a consistent read, that there is a
read view in trx, and that the btr search latch has been locked in S-mode if
AHI is enabled.
@return	SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
3603 static
3604 ulint
row_sel_try_search_shortcut_for_mysql(
3606 /*==================================*/
3607 	const rec_t**	out_rec,/*!< out: record if found */
3608 	row_prebuilt_t*	prebuilt,/*!< in: prebuilt struct */
3609 	ulint**		offsets,/*!< in/out: for rec_get_offsets(*out_rec) */
3610 	mem_heap_t**	heap,	/*!< in/out: heap for rec_get_offsets() */
3611 	mtr_t*		mtr)	/*!< in: started mtr */
3612 {
3613 	dict_index_t*	index		= prebuilt->index;
3614 	const dtuple_t*	search_tuple	= prebuilt->search_tuple;
3615 	btr_pcur_t*	pcur		= &prebuilt->pcur;
3616 	trx_t*		trx		= prebuilt->trx;
3617 	const rec_t*	rec;
3618 
3619 	ut_ad(dict_index_is_clust(index));
3620 	ut_ad(!prebuilt->templ_contains_blob);
3621 
3622 #ifndef UNIV_SEARCH_DEBUG
3623 	btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
3624 				   BTR_SEARCH_LEAF, pcur,
3625 				   (trx->has_search_latch)
3626 				    ? RW_S_LATCH
3627 				    : 0,
3628 				   mtr);
3629 #else /* UNIV_SEARCH_DEBUG */
3630 	btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
3631 				   BTR_SEARCH_LEAF, pcur,
3632 				   0,
3633 				   mtr);
3634 #endif /* UNIV_SEARCH_DEBUG */
3635 	rec = btr_pcur_get_rec(pcur);
3636 
3637 	if (!page_rec_is_user_rec(rec)) {
3638 
3639 		return(SEL_RETRY);
3640 	}
3641 
3642 	/* As the cursor is now placed on a user record after a search with
3643 	the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
3644 	fields in the user record matched to the search tuple */
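	/* For this unique search, fewer matched fields than there are
	fields in the search tuple means that no matching record exists. */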
3645 
3646 	if (btr_pcur_get_up_match(pcur) < dtuple_get_n_fields(search_tuple)) {
3647 
3648 		return(SEL_EXHAUSTED);
3649 	}
3650 
3651 	/* This is a non-locking consistent read: if necessary, fetch
3652 	a previous version of the record */
3653 
3654 	*offsets = rec_get_offsets(rec, index, *offsets,
3655 				   ULINT_UNDEFINED, heap);
3656 
3657 	if (!lock_clust_rec_cons_read_sees(rec, index,
3658 					   *offsets, trx->read_view)) {
3659 
3660 		return(SEL_RETRY);
3661 	}
3662 
3663 	if (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))) {
3664 
3665 		return(SEL_EXHAUSTED);
3666 	}
3667 
3668 	*out_rec = rec;
3669 
3670 	return(SEL_FOUND);
3671 }
3672 
3673 /*********************************************************************//**
3674 Check a pushed-down index condition.
3675 @return ICP_NO_MATCH, ICP_MATCH, or ICP_OUT_OF_RANGE */
3676 static
3677 enum icp_result
row_search_idx_cond_check(
3679 /*======================*/
3680 	byte*			mysql_rec,	/*!< out: record
3681 						in MySQL format (invalid unless
3682 						prebuilt->idx_cond!=NULL and
3683 						we return ICP_MATCH) */
3684 	row_prebuilt_t*		prebuilt,	/*!< in/out: prebuilt struct
3685 						for the table handle */
3686 	const rec_t*		rec,		/*!< in: InnoDB record */
3687 	const ulint*		offsets)	/*!< in: rec_get_offsets() */
3688 {
3689 	enum icp_result result;
3690 	ulint		i;
3691 
3692 	ut_ad(rec_offs_validate(rec, prebuilt->index, offsets));
3693 
3694 	if (!prebuilt->idx_cond) {
3695 		return(ICP_MATCH);
3696 	}
3697 
3698 	MONITOR_INC(MONITOR_ICP_ATTEMPTS);
3699 
3700 	/* Convert to MySQL format those fields that are needed for
3701 	evaluating the index condition. */
3702 
3703 	if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
3704 		mem_heap_empty(prebuilt->blob_heap);
3705 	}
3706 
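	/* Only the columns referenced by the pushed-down condition
	(the first idx_cond_n_cols templates) are converted here; the
	remaining fields are converted only after the condition matches. */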
3707 	for (i = 0; i < prebuilt->idx_cond_n_cols; i++) {
3708 		const mysql_row_templ_t*templ = &prebuilt->mysql_template[i];
3709 
3710 		if (!row_sel_store_mysql_field(mysql_rec, prebuilt,
3711 					       rec, prebuilt->index, offsets,
3712 					       templ->icp_rec_field_no,
3713 					       templ, ULINT_UNDEFINED)) {
3714 			return(ICP_NO_MATCH);
3715 		}
3716 	}
3717 
3718 	/* We assume that the index conditions on
3719 	case-insensitive columns are case-insensitive. The
3720 	case of such columns may be wrong in a secondary
3721 	index, if the case of the column has been updated in
3722 	the past, or a record has been deleted and a record
3723 	inserted in a different case. */
3724 	result = innobase_index_cond(prebuilt->idx_cond);
3725 	switch (result) {
3726 	case ICP_MATCH:
3727 		/* Convert the remaining fields to MySQL format.
3728 		If this is a secondary index record, we must defer
3729 		this until we have fetched the clustered index record. */
3730 		if (!prebuilt->need_to_access_clustered
3731 		    || dict_index_is_clust(prebuilt->index)) {
3732 			if (!row_sel_store_mysql_rec(
3733 				    mysql_rec, prebuilt, rec, FALSE,
3734 				    prebuilt->index, offsets, false)) {
3735 				ut_ad(dict_index_is_clust(prebuilt->index));
3736 				return(ICP_NO_MATCH);
3737 			}
3738 		}
3739 		MONITOR_INC(MONITOR_ICP_MATCH);
3740 		return(result);
3741 	case ICP_NO_MATCH:
3742 		MONITOR_INC(MONITOR_ICP_NO_MATCH);
3743 		return(result);
3744 	case ICP_OUT_OF_RANGE:
3745 		MONITOR_INC(MONITOR_ICP_OUT_OF_RANGE);
3746 		return(result);
3747 	}
3748 
3749 	ut_error;
3750 	return(result);
3751 }
3752 
/** Check the pushed-down end-range condition, to avoid extra traversal
when records are no longer within the requested range and also to avoid
prefetching out-of-range rows into the cache buffer.
3756 @param[in]	mysql_rec	record in MySQL format
3757 @param[in,out]	handler		the MySQL handler performing the scan
3758 @retval true	if the row in mysql_rec is out of range
3759 @retval false	if the row in mysql_rec is in range */
3760 static
3761 bool
row_search_end_range_check(
3763 	const byte*	mysql_rec,
3764 	ha_innobase*	handler)
3765 {
3766 	if (handler->end_range &&
3767 	    handler->compare_key_in_buffer(mysql_rec) > 0) {
3768 		return(true);
3769 	}
3770 
3771 	return(false);
3772 }
3773 
3774 /********************************************************************//**
3775 Searches for rows in the database. This is used in the interface to
3776 MySQL. This function opens a cursor, and also implements fetch next
3777 and fetch prev. NOTE that if we do a search with a full key value
3778 from a unique index (ROW_SEL_EXACT), then we will not store the cursor
position, and fetch next or fetch prev must not be tried on the cursor!
3780 @return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK,
3781 DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */
3782 UNIV_INTERN
3783 dberr_t
row_search_for_mysql(
3785 /*=================*/
3786 	byte*		buf,		/*!< in/out: buffer for the fetched
3787 					row in the MySQL format */
3788 	ulint		mode,		/*!< in: search mode PAGE_CUR_L, ... */
3789 	row_prebuilt_t*	prebuilt,	/*!< in: prebuilt struct for the
3790 					table handle; this contains the info
3791 					of search_tuple, index; if search
3792 					tuple contains 0 fields then we
3793 					position the cursor at the start or
3794 					the end of the index, depending on
3795 					'mode' */
3796 	ulint		match_mode,	/*!< in: 0 or ROW_SEL_EXACT or
3797 					ROW_SEL_EXACT_PREFIX */
3798 	ulint		direction)	/*!< in: 0 or ROW_SEL_NEXT or
3799 					ROW_SEL_PREV; NOTE: if this is != 0,
3800 					then prebuilt must have a pcur
3801 					with stored position! In opening of a
3802 					cursor 'direction' should be 0. */
3803 {
3804 	dict_index_t*	index		= prebuilt->index;
3805 	ibool		comp		= dict_table_is_comp(index->table);
3806 	const dtuple_t*	search_tuple	= prebuilt->search_tuple;
3807 	btr_pcur_t*	pcur		= &prebuilt->pcur;
3808 	trx_t*		trx		= prebuilt->trx;
3809 	dict_index_t*	clust_index;
3810 	que_thr_t*	thr;
3811 	const rec_t*	prev_rec = NULL;
3812 	const rec_t*	rec = NULL;
3813 	byte*		end_range_cache = NULL;
3814 	const rec_t*	result_rec = NULL;
3815 	const rec_t*	clust_rec;
3816 	dberr_t		err				= DB_SUCCESS;
3817 	ibool		unique_search			= FALSE;
3818 	ibool		mtr_has_extra_clust_latch	= FALSE;
3819 	ibool		moves_up			= FALSE;
3820 	ibool		set_also_gap_locks		= TRUE;
3821 	/* if the query is a plain locking SELECT, and the isolation level
3822 	is <= TRX_ISO_READ_COMMITTED, then this is set to FALSE */
3823 	ibool		did_semi_consistent_read	= FALSE;
3824 	/* if the returned record was locked and we did a semi-consistent
3825 	read (fetch the newest committed version), then this is set to
3826 	TRUE */
3827 #ifdef UNIV_SEARCH_DEBUG
3828 	ulint		cnt				= 0;
3829 #endif /* UNIV_SEARCH_DEBUG */
3830 	ulint		next_offs;
3831 	ibool		same_user_rec;
3832 	mtr_t		mtr;
3833 	mem_heap_t*	heap				= NULL;
3834 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
3835 	ulint*		offsets				= offsets_;
3836 	ibool		table_lock_waited		= FALSE;
3837 	byte*		next_buf			= 0;
3838 	ulint		end_loop			= 0;
3839 
3840 	rec_offs_init(offsets_);
3841 
3842 	ut_ad(index && pcur && search_tuple);
3843 
	/* We don't support FTS queries from the HANDLER interfaces, because
	FTS is implemented as an inverted index with auxiliary tables, so
	nothing related to a traditional index query applies to it. */
3848 	if (index->type & DICT_FTS) {
3849 		return(DB_END_OF_INDEX);
3850 	}
3851 
3852 #ifdef UNIV_SYNC_DEBUG
3853 	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
3854 #endif /* UNIV_SYNC_DEBUG */
3855 
3856 	if (dict_table_is_discarded(prebuilt->table)) {
3857 
3858 		return(DB_TABLESPACE_DELETED);
3859 
3860 	} else if (prebuilt->table->ibd_file_missing) {
3861 
3862 		return(DB_TABLESPACE_NOT_FOUND);
3863 
3864 	} else if (!prebuilt->index_usable) {
3865 
3866 		return(DB_MISSING_HISTORY);
3867 
3868 	} else if (dict_index_is_corrupted(index)) {
3869 
3870 		return(DB_CORRUPTION);
3871 
3872 	} else if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) {
3873 		fprintf(stderr,
3874 			"InnoDB: Error: trying to free a corrupt\n"
3875 			"InnoDB: table handle. Magic n %lu, table name ",
3876 			(ulong) prebuilt->magic_n);
3877 		ut_print_name(stderr, trx, TRUE, prebuilt->table->name);
3878 		putc('\n', stderr);
3879 
3880 		mem_analyze_corruption(prebuilt);
3881 
3882 		ut_error;
3883 	}
3884 
3885 #if 0
3886 	/* August 19, 2005 by Heikki: temporarily disable this error
3887 	print until the cursor lock count is done correctly.
3888 	See bugs #12263 and #12456!*/
3889 
3890 	if (trx->n_mysql_tables_in_use == 0
3891 	    && UNIV_UNLIKELY(prebuilt->select_lock_type == LOCK_NONE)) {
3892 		/* Note that if MySQL uses an InnoDB temp table that it
3893 		created inside LOCK TABLES, then n_mysql_tables_in_use can
3894 		be zero; in that case select_lock_type is set to LOCK_X in
3895 		::start_stmt. */
3896 
3897 		fputs("InnoDB: Error: MySQL is trying to perform a SELECT\n"
3898 		      "InnoDB: but it has not locked"
3899 		      " any tables in ::external_lock()!\n",
3900 		      stderr);
3901 		trx_print(stderr, trx, 600);
3902 		fputc('\n', stderr);
3903 	}
3904 #endif
3905 
3906 #if 0
3907 	fprintf(stderr, "Match mode %lu\n search tuple ",
3908 		(ulong) match_mode);
3909 	dtuple_print(search_tuple);
3910 	fprintf(stderr, "N tables locked %lu\n",
3911 		(ulong) trx->mysql_n_tables_locked);
3912 #endif
3913 	/*-------------------------------------------------------------*/
3914 	/* PHASE 0: Release a possible s-latch we are holding on the
3915 	adaptive hash index latch if there is someone waiting behind */
3916 
3917 	if (UNIV_UNLIKELY(rw_lock_get_writer(&btr_search_latch) != RW_LOCK_NOT_LOCKED)
3918 	    && trx->has_search_latch) {
3919 
3920 		/* There is an x-latch request on the adaptive hash index:
3921 		release the s-latch to reduce starvation and wait for
3922 		BTR_SEA_TIMEOUT rounds before trying to keep it again over
3923 		calls from MySQL */
3924 
3925 		rw_lock_s_unlock(&btr_search_latch);
3926 		trx->has_search_latch = FALSE;
3927 
3928 		trx->search_latch_timeout = BTR_SEA_TIMEOUT;
3929 	}
3930 
	/* Reset the new record lock info if srv_locks_unsafe_for_binlog
	is set or the session is using a READ COMMITTED isolation level.
	Then we are able to remove the record locks set here on an
	individual row. */
3935 	prebuilt->new_rec_locks = 0;
3936 
3937 	/*-------------------------------------------------------------*/
3938 	/* PHASE 1: Try to pop the row from the prefetch cache */
3939 
3940 	if (UNIV_UNLIKELY(direction == 0)) {
3941 		trx->op_info = "starting index read";
3942 
3943 		prebuilt->n_rows_fetched = 0;
3944 		prebuilt->n_fetch_cached = 0;
3945 		prebuilt->fetch_cache_first = 0;
3946 		prebuilt->end_range = false;
3947 
3948 		if (prebuilt->sel_graph == NULL) {
3949 			/* Build a dummy select query graph */
3950 			row_prebuild_sel_graph(prebuilt);
3951 		}
3952 	} else {
3953 		trx->op_info = "fetching rows";
3954 
3955 		if (prebuilt->n_rows_fetched == 0) {
3956 			prebuilt->fetch_direction = direction;
3957 		}
3958 
3959 		if (UNIV_UNLIKELY(direction != prebuilt->fetch_direction)) {
3960 			if (UNIV_UNLIKELY(prebuilt->n_fetch_cached > 0)) {
3961 				ut_error;
3962 				/* TODO: scrollable cursor: restore cursor to
3963 				the place of the latest returned row,
3964 				or better: prevent caching for a scroll
3965 				cursor! */
3966 			}
3967 
3968 			prebuilt->n_rows_fetched = 0;
3969 			prebuilt->n_fetch_cached = 0;
3970 			prebuilt->fetch_cache_first = 0;
3971 
3972 		} else if (UNIV_LIKELY(prebuilt->n_fetch_cached > 0)) {
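			/* A cached row is available: return it without
			restoring the cursor position or starting a
			mini-transaction. */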
3973 			row_sel_dequeue_cached_row_for_mysql(buf, prebuilt);
3974 
3975 			prebuilt->n_rows_fetched++;
3976 
3977 			err = DB_SUCCESS;
3978 			goto func_exit;
3979 		} else if (prebuilt->end_range == true) {
3980 			prebuilt->end_range = false;
3981 			err = DB_RECORD_NOT_FOUND;
3982 			goto func_exit;
3983 		}
3984 
3985 		if (prebuilt->fetch_cache_first > 0
3986 		    && prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) {
3987 
3988 			/* The previous returned row was popped from the fetch
3989 			cache, but the cache was not full at the time of the
3990 			popping: no more rows can exist in the result set */
3991 
3992 			err = DB_RECORD_NOT_FOUND;
3993 			goto func_exit;
3994 		}
3995 
3996 		prebuilt->n_rows_fetched++;
3997 
3998 		if (prebuilt->n_rows_fetched > 1000000000) {
3999 			/* Prevent wrap-over */
4000 			prebuilt->n_rows_fetched = 500000000;
4001 		}
4002 
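		/* Continue the fetch with the search mode that the
		cursor was originally opened with. */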
4003 		mode = pcur->search_mode;
4004 	}
4005 
4006 	/* In a search where at most one record in the index may match, we
4007 	can use a LOCK_REC_NOT_GAP type record lock when locking a
4008 	non-delete-marked matching record.
4009 
4010 	Note that in a unique secondary index there may be different
4011 	delete-marked versions of a record where only the primary key
4012 	values differ: thus in a secondary index we must use next-key
4013 	locks when locking delete-marked records. */
4014 
4015 	if (match_mode == ROW_SEL_EXACT
4016 	    && dict_index_is_unique(index)
4017 	    && dtuple_get_n_fields(search_tuple)
4018 	    == dict_index_get_n_unique(index)
4019 	    && (dict_index_is_clust(index)
4020 		|| !dtuple_contains_null(search_tuple))) {
4021 
4022 		/* Note above that a UNIQUE secondary index can contain many
4023 		rows with the same key value if one of the columns is the SQL
4024 		null. A clustered index under MySQL can never contain null
4025 		columns because we demand that all the columns in primary key
4026 		are non-null. */
4027 
4028 		unique_search = TRUE;
4029 
		/* Even if the condition is unique, MySQL seems to try to
		retrieve a second row if the primary key contains more than
		one column. Return immediately if this is not a HANDLER
		command. */
4034 
4035 		if (UNIV_UNLIKELY(direction != 0
4036 				  && !prebuilt->used_in_HANDLER)) {
4037 
4038 			err = DB_RECORD_NOT_FOUND;
4039 			goto func_exit;
4040 		}
4041 	}
4042 
4043 	mtr_start(&mtr);
4044 
4045 	/*-------------------------------------------------------------*/
4046 	/* PHASE 2: Try fast adaptive hash index search if possible */
4047 
4048 	/* Next test if this is the special case where we can use the fast
4049 	adaptive hash index to try the search. Since we must release the
4050 	search system latch when we retrieve an externally stored field, we
4051 	cannot use the adaptive hash index in a search in the case the row
4052 	may be long and there may be externally stored fields */
4053 
4054 	if (UNIV_UNLIKELY(direction == 0)
4055 	    && unique_search
4056 	    && dict_index_is_clust(index)
4057 	    && !prebuilt->templ_contains_blob
4058 	    && !prebuilt->used_in_HANDLER
4059 	    && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)
4060 	    && !prebuilt->innodb_api) {
4061 
4062 		mode = PAGE_CUR_GE;
4063 
4064 		if (trx->mysql_n_tables_locked == 0
4065 		    && prebuilt->select_lock_type == LOCK_NONE
4066 		    && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
4067 		    && trx->read_view) {
4068 
4069 			/* This is a SELECT query done as a consistent read,
4070 			and the read view has already been allocated:
4071 			let us try a search shortcut through the hash
4072 			index.
4073 			NOTE that we must also test that
4074 			mysql_n_tables_locked == 0, because this might
4075 			also be INSERT INTO ... SELECT ... or
4076 			CREATE TABLE ... SELECT ... . Our algorithm is
			NOT prepared for inserts interleaved with the SELECT,
4078 			and if we try that, we can deadlock on the adaptive
4079 			hash index semaphore! */
4080 
4081 #ifndef UNIV_SEARCH_DEBUG
4082 			if (!trx->has_search_latch) {
4083 				rw_lock_s_lock(&btr_search_latch);
4084 				trx->has_search_latch = TRUE;
4085 			}
4086 #endif
4087 			switch (row_sel_try_search_shortcut_for_mysql(
4088 					&rec, prebuilt, &offsets, &heap,
4089 					&mtr)) {
4090 			case SEL_FOUND:
4091 #ifdef UNIV_SEARCH_DEBUG
4092 				ut_a(0 == cmp_dtuple_rec(search_tuple,
4093 							 rec, offsets));
4094 #endif
4095 				/* At this point, rec is protected by
4096 				a page latch that was acquired by
4097 				row_sel_try_search_shortcut_for_mysql().
4098 				The latch will not be released until
4099 				mtr_commit(&mtr). */
4100 				ut_ad(!rec_get_deleted_flag(rec, comp));
4101 
4102 				if (prebuilt->idx_cond) {
4103 					switch (row_search_idx_cond_check(
4104 							buf, prebuilt,
4105 							rec, offsets)) {
4106 					case ICP_NO_MATCH:
4107 					case ICP_OUT_OF_RANGE:
4108 						goto shortcut_mismatch;
4109 					case ICP_MATCH:
4110 						goto shortcut_match;
4111 					}
4112 				}
4113 
4114 				if (!row_sel_store_mysql_rec(
4115 					    buf, prebuilt,
4116 					    rec, FALSE, index,
4117 					    offsets, false)) {
4118 					/* Only fresh inserts may contain
4119 					incomplete externally stored
4120 					columns. Pretend that such
4121 					records do not exist. Such
4122 					records may only be accessed
4123 					at the READ UNCOMMITTED
4124 					isolation level or when
4125 					rolling back a recovered
4126 					transaction. Rollback happens
4127 					at a lower level, not here. */
4128 
4129 					/* Proceed as in case SEL_RETRY. */
4130 					break;
4131 				}
4132 
4133 			shortcut_match:
4134 				mtr_commit(&mtr);
4135 
4136 				/* ut_print_name(stderr, index->name);
4137 				fputs(" shortcut\n", stderr); */
4138 
4139 				err = DB_SUCCESS;
4140 				goto release_search_latch_if_needed;
4141 
4142 			case SEL_EXHAUSTED:
4143 			shortcut_mismatch:
4144 				mtr_commit(&mtr);
4145 
4146 				/* ut_print_name(stderr, index->name);
4147 				fputs(" record not found 2\n", stderr); */
4148 
4149 				err = DB_RECORD_NOT_FOUND;
4150 release_search_latch_if_needed:
4151 				if (trx->search_latch_timeout > 0
4152 				    && trx->has_search_latch) {
4153 
4154 					trx->search_latch_timeout--;
4155 
4156 					rw_lock_s_unlock(&btr_search_latch);
4157 					trx->has_search_latch = FALSE;
4158 				}
4159 
4160 				/* NOTE that we do NOT store the cursor
4161 				position */
4162 				goto func_exit;
4163 
4164 			case SEL_RETRY:
4165 				break;
4166 
4167 			default:
4168 				ut_ad(0);
4169 			}
4170 
4171 			mtr_commit(&mtr);
4172 			mtr_start(&mtr);
4173 		}
4174 	}
4175 
4176 	/*-------------------------------------------------------------*/
4177 	/* PHASE 3: Open or restore index cursor position */
4178 
4179 	if (trx->has_search_latch) {
4180 		rw_lock_s_unlock(&btr_search_latch);
4181 		trx->has_search_latch = FALSE;
4182 	}
4183 
4184 	/* The state of a running trx can only be changed by the
4185 	thread that is currently serving the transaction. Because we
4186 	are that thread, we can read trx->state without holding any
4187 	mutex. */
4188 	ut_ad(prebuilt->sql_stat_start || trx->state == TRX_STATE_ACTIVE);
4189 
4190 	ut_ad(trx->state == TRX_STATE_NOT_STARTED
4191 	      || trx->state == TRX_STATE_ACTIVE);
4192 
4193 	ut_ad(prebuilt->sql_stat_start
4194 	      || prebuilt->select_lock_type != LOCK_NONE
4195 	      || trx->read_view);
4196 
4197 	trx_start_if_not_started(trx);
4198 
4199 	if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
4200 	    && prebuilt->select_lock_type != LOCK_NONE
4201 	    && trx->mysql_thd != NULL
4202 	    && thd_is_select(trx->mysql_thd)) {
4203 		/* It is a plain locking SELECT and the isolation
4204 		level is low: do not lock gaps */
4205 
4206 		set_also_gap_locks = FALSE;
4207 	}
4208 
4209 	/* Note that if the search mode was GE or G, then the cursor
4210 	naturally moves upward (in fetch next) in alphabetical order,
4211 	otherwise downward */
4212 
4213 	if (UNIV_UNLIKELY(direction == 0)) {
4214 		if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G) {
4215 			moves_up = TRUE;
4216 		}
4217 	} else if (direction == ROW_SEL_NEXT) {
4218 		moves_up = TRUE;
4219 	}
4220 
4221 	thr = que_fork_get_first_thr(prebuilt->sel_graph);
4222 
4223 	que_thr_move_to_run_state_for_mysql(thr, trx);
4224 
4225 	clust_index = dict_table_get_first_index(index->table);
4226 
4227 	/* Do some start-of-statement preparations */
4228 
4229 	if (!prebuilt->sql_stat_start) {
4230 		/* No need to set an intention lock or assign a read view */
4231 
4232 		if (UNIV_UNLIKELY
4233 		    (trx->read_view == NULL
4234 		     && prebuilt->select_lock_type == LOCK_NONE)) {
4235 
4236 			fputs("InnoDB: Error: MySQL is trying to"
4237 			      " perform a consistent read\n"
4238 			      "InnoDB: but the read view is not assigned!\n",
4239 			      stderr);
4240 			trx_print(stderr, trx, 600);
4241 			fputc('\n', stderr);
4242 			ut_error;
4243 		}
4244 	} else if (prebuilt->select_lock_type == LOCK_NONE) {
4245 		/* This is a consistent read */
4246 		/* Assign a read view for the query */
4247 
4248 		trx_assign_read_view(trx);
4249 		prebuilt->sql_stat_start = FALSE;
4250 	} else {
4251 wait_table_again:
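		/* This is a locking read: acquire an intention lock on
		the table, IS for a shared-mode read and IX otherwise,
		before setting any record locks. */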
4252 		err = lock_table(0, index->table,
4253 				 prebuilt->select_lock_type == LOCK_S
4254 				 ? LOCK_IS : LOCK_IX, thr);
4255 
4256 		if (err != DB_SUCCESS) {
4257 
4258 			table_lock_waited = TRUE;
4259 			goto lock_table_wait;
4260 		}
4261 		prebuilt->sql_stat_start = FALSE;
4262 	}
4263 
4264 	/* Open or restore index cursor position */
4265 
4266 	if (UNIV_LIKELY(direction != 0)) {
4267 		ibool	need_to_process = sel_restore_position_for_mysql(
4268 			&same_user_rec, BTR_SEARCH_LEAF,
4269 			pcur, moves_up, &mtr);
4270 
4271 		if (UNIV_UNLIKELY(need_to_process)) {
4272 			if (UNIV_UNLIKELY(prebuilt->row_read_type
4273 					  == ROW_READ_DID_SEMI_CONSISTENT)) {
4274 				/* We did a semi-consistent read,
4275 				but the record was removed in
4276 				the meantime. */
4277 				prebuilt->row_read_type
4278 					= ROW_READ_TRY_SEMI_CONSISTENT;
4279 			}
4280 		} else if (UNIV_LIKELY(prebuilt->row_read_type
4281 				       != ROW_READ_DID_SEMI_CONSISTENT)) {
4282 
4283 			/* The cursor was positioned on the record
4284 			that we returned previously.  If we need
4285 			to repeat a semi-consistent read as a
4286 			pessimistic locking read, the record
4287 			cannot be skipped. */
4288 
4289 			goto next_rec;
4290 		}
4291 
4292 	} else if (dtuple_get_n_fields(search_tuple) > 0) {
4293 
4294 		btr_pcur_open_with_no_init(index, search_tuple, mode,
4295 					   BTR_SEARCH_LEAF,
4296 					   pcur, 0, &mtr);
4297 
4298 		pcur->trx_if_known = trx;
4299 
4300 		rec = btr_pcur_get_rec(pcur);
4301 
4302 		if (!moves_up
4303 		    && !page_rec_is_supremum(rec)
4304 		    && set_also_gap_locks
4305 		    && !(srv_locks_unsafe_for_binlog
4306 			 || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
4307 		    && prebuilt->select_lock_type != LOCK_NONE) {
4308 
4309 			/* Try to place a gap lock on the next index record
4310 			to prevent phantoms in ORDER BY ... DESC queries */
4311 			const rec_t*	next_rec = page_rec_get_next_const(rec);
4312 
4313 			offsets = rec_get_offsets(next_rec, index, offsets,
4314 						  ULINT_UNDEFINED, &heap);
4315 			err = sel_set_rec_lock(btr_pcur_get_block(pcur),
4316 					       next_rec, index, offsets,
4317 					       prebuilt->select_lock_type,
4318 					       LOCK_GAP, thr);
4319 
4320 			switch (err) {
4321 			case DB_SUCCESS_LOCKED_REC:
4322 				err = DB_SUCCESS;
4323 			case DB_SUCCESS:
4324 				break;
4325 			default:
4326 				goto lock_wait_or_error;
4327 			}
4328 		}
4329 	} else if (mode == PAGE_CUR_G || mode == PAGE_CUR_L) {
4330 		btr_pcur_open_at_index_side(
4331 			mode == PAGE_CUR_G, index, BTR_SEARCH_LEAF,
4332 			pcur, false, 0, &mtr);
4333 	}
4334 
4335 rec_loop:
4336 	DEBUG_SYNC_C("row_search_rec_loop");
4337 	if (trx_is_interrupted(trx)) {
4338 		btr_pcur_store_position(pcur, &mtr);
4339 		err = DB_INTERRUPTED;
4340 		goto normal_return;
4341 	}
4342 
4343 	/*-------------------------------------------------------------*/
4344 	/* PHASE 4: Look for matching records in a loop */
4345 
4346 	rec = btr_pcur_get_rec(pcur);
4347 	ut_ad(!!page_rec_is_comp(rec) == comp);
4348 #ifdef UNIV_SEARCH_DEBUG
4349 	/*
4350 	fputs("Using ", stderr);
4351 	dict_index_name_print(stderr, trx, index);
4352 	fprintf(stderr, " cnt %lu ; Page no %lu\n", cnt,
4353 	page_get_page_no(page_align(rec)));
4354 	rec_print(stderr, rec, index);
4355 	printf("delete-mark: %lu\n",
4356 	       rec_get_deleted_flag(rec, page_rec_is_comp(rec)));
4357 	*/
4358 #endif /* UNIV_SEARCH_DEBUG */
4359 
4360 	if (page_rec_is_infimum(rec)) {
4361 
4362 		/* The infimum record on a page cannot be in the result set,
4363 		and neither can a record lock be placed on it: we skip such
4364 		a record. */
4365 
4366 		prev_rec = NULL;
4367 		goto next_rec;
4368 	}
4369 
4370 	if (page_rec_is_supremum(rec)) {
4371 
4372 		DBUG_EXECUTE_IF("compare_end_range",
4373 				if (end_loop < 100) {
4374 					end_loop = 100;
4375 				});
		/** Compare the last user record of the page with the end
		range passed to InnoDB, when there is no ICP and the number
		of loops in row_search_for_mysql over rows that were found
		but not reported (due to read views etc.) has grown large. */
4380 		if (prev_rec != NULL && !prebuilt->innodb_api
4381 		    && prebuilt->mysql_handler->end_range != NULL
4382 		    && prebuilt->idx_cond == NULL
4383 		    && end_loop >= 100) {
4384 
4385 			dict_index_t*	key_index = prebuilt->index;
4386 			bool		clust_templ_for_sec = false;
4387 
4388 			if (end_range_cache == NULL) {
4389 				end_range_cache = static_cast<byte*>(
4390 					ut_malloc(prebuilt->mysql_row_len));
4391 			}
4392 
4393 			if (index != clust_index
4394 			    && prebuilt->need_to_access_clustered) {
4395 				/** Secondary index record but the template
4396 				based on PK. */
4397 				key_index = clust_index;
4398 				clust_templ_for_sec = true;
4399 			}
4400 
4401 			/** Create offsets based on prebuilt index. */
4402 			offsets = rec_get_offsets(prev_rec, prebuilt->index,
4403 					offsets, ULINT_UNDEFINED, &heap);
4404 
4405 			if (row_sel_store_mysql_rec(
4406 				end_range_cache, prebuilt, prev_rec,
4407 				clust_templ_for_sec, key_index, offsets,
4408 				clust_templ_for_sec)) {
4409 
4410 				if (row_search_end_range_check(
4411 					end_range_cache,
4412 					prebuilt->mysql_handler)) {
4413 
					/** If rows are already queued in
					the prefetch cache (next_buf != NULL),
					remember the end-of-range condition in
					prebuilt->end_range so that it is
					reported on a later fetch. */
4416 					if (next_buf != NULL) {
4417 						prebuilt->end_range = true;
4418 					}
4419 
4420 					err = DB_RECORD_NOT_FOUND;
4421 					goto normal_return;
4422 				}
4423 			}
4424 		}
4425 
4426 		if (set_also_gap_locks
4427 		    && !(srv_locks_unsafe_for_binlog
4428 			 || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
4429 		    && prebuilt->select_lock_type != LOCK_NONE) {
4430 
4431 			/* Try to place a lock on the index record */
4432 
4433 			/* If innodb_locks_unsafe_for_binlog option is used
4434 			or this session is using a READ COMMITTED isolation
4435 			level we do not lock gaps. Supremum record is really
4436 			a gap and therefore we do not set locks there. */
4437 
4438 			offsets = rec_get_offsets(rec, index, offsets,
4439 						  ULINT_UNDEFINED, &heap);
4440 			err = sel_set_rec_lock(btr_pcur_get_block(pcur),
4441 					       rec, index, offsets,
4442 					       prebuilt->select_lock_type,
4443 					       LOCK_ORDINARY, thr);
4444 
4445 			switch (err) {
4446 			case DB_SUCCESS_LOCKED_REC:
4447 				err = DB_SUCCESS;
4448 			case DB_SUCCESS:
4449 				break;
4450 			default:
4451 				goto lock_wait_or_error;
4452 			}
4453 		}
4454 		/* A page supremum record cannot be in the result set: skip
4455 		it now that we have placed a possible lock on it */
4456 
4457 		prev_rec = NULL;
4458 		goto next_rec;
4459 	}
4460 
4461 	/*-------------------------------------------------------------*/
4462 	/* Do sanity checks in case our cursor has bumped into page
4463 	corruption */
4464 
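	/* The next-record offset must not point below the page supremum
	nor into the page directory area; anything else indicates a
	corrupt record chain. */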
4465 	if (comp) {
4466 		next_offs = rec_get_next_offs(rec, TRUE);
4467 		if (UNIV_UNLIKELY(next_offs < PAGE_NEW_SUPREMUM)) {
4468 
4469 			goto wrong_offs;
4470 		}
4471 	} else {
4472 		next_offs = rec_get_next_offs(rec, FALSE);
4473 		if (UNIV_UNLIKELY(next_offs < PAGE_OLD_SUPREMUM)) {
4474 
4475 			goto wrong_offs;
4476 		}
4477 	}
4478 
4479 	if (UNIV_UNLIKELY(next_offs >= UNIV_PAGE_SIZE - PAGE_DIR)) {
4480 
4481 wrong_offs:
4482 		if (srv_force_recovery == 0 || moves_up == FALSE) {
4483 			ut_print_timestamp(stderr);
4484 			buf_page_print(page_align(rec), 0,
4485 				       BUF_PAGE_PRINT_NO_CRASH);
4486 			fprintf(stderr,
4487 				"\nInnoDB: rec address %p,"
4488 				" buf block fix count %lu\n",
4489 				(void*) rec, (ulong)
4490 				btr_cur_get_block(btr_pcur_get_btr_cur(pcur))
4491 				->page.buf_fix_count);
4492 			fprintf(stderr,
4493 				"InnoDB: Index corruption: rec offs %lu"
4494 				" next offs %lu, page no %lu,\n"
4495 				"InnoDB: ",
4496 				(ulong) page_offset(rec),
4497 				(ulong) next_offs,
4498 				(ulong) page_get_page_no(page_align(rec)));
4499 			dict_index_name_print(stderr, trx, index);
4500 			fputs(". Run CHECK TABLE. You may need to\n"
4501 			      "InnoDB: restore from a backup, or"
4502 			      " dump + drop + reimport the table.\n",
4503 			      stderr);
4504 			ut_ad(0);
4505 			err = DB_CORRUPTION;
4506 
4507 			goto lock_wait_or_error;
4508 		} else {
4509 			/* The user may be dumping a corrupt table. Jump
4510 			over the corruption to recover as much as possible. */
4511 
4512 			fprintf(stderr,
4513 				"InnoDB: Index corruption: rec offs %lu"
4514 				" next offs %lu, page no %lu,\n"
4515 				"InnoDB: ",
4516 				(ulong) page_offset(rec),
4517 				(ulong) next_offs,
4518 				(ulong) page_get_page_no(page_align(rec)));
4519 			dict_index_name_print(stderr, trx, index);
4520 			fputs(". We try to skip the rest of the page.\n",
4521 			      stderr);
4522 
4523 			btr_pcur_move_to_last_on_page(pcur, &mtr);
4524 
4525 			prev_rec = NULL;
4526 			goto next_rec;
4527 		}
4528 	}
4529 	/*-------------------------------------------------------------*/
4530 
4531 	/* Calculate the 'offsets' associated with 'rec' */
4532 
4533 	ut_ad(fil_page_get_type(btr_pcur_get_page(pcur)) == FIL_PAGE_INDEX);
4534 	ut_ad(btr_page_get_index_id(btr_pcur_get_page(pcur)) == index->id);
4535 
4536 	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
4537 
4538 	if (UNIV_UNLIKELY(srv_force_recovery > 0)) {
4539 		if (!rec_validate(rec, offsets)
4540 		    || !btr_index_rec_validate(rec, index, FALSE)) {
4541 			fprintf(stderr,
4542 				"InnoDB: Index corruption: rec offs %lu"
4543 				" next offs %lu, page no %lu,\n"
4544 				"InnoDB: ",
4545 				(ulong) page_offset(rec),
4546 				(ulong) next_offs,
4547 				(ulong) page_get_page_no(page_align(rec)));
4548 			dict_index_name_print(stderr, trx, index);
4549 			fputs(". We try to skip the record.\n",
4550 			      stderr);
4551 
4552 			prev_rec = NULL;
4553 			goto next_rec;
4554 		}
4555 	}
4556 
4557 	prev_rec = rec;
4558 
4559 	/* Note that we cannot trust the up_match value in the cursor at this
4560 	place because we can arrive here after moving the cursor! Thus
4561 	we have to recompare rec and search_tuple to determine if they
4562 	match enough. */
4563 
4564 	if (match_mode == ROW_SEL_EXACT) {
4565 		/* Test if the index record matches completely to search_tuple
4566 		in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */
4567 
4568 		/* fputs("Comparing rec and search tuple\n", stderr); */
4569 
4570 		if (0 != cmp_dtuple_rec(search_tuple, rec, offsets)) {
4571 
4572 			if (set_also_gap_locks
4573 			    && !(srv_locks_unsafe_for_binlog
4574 				 || trx->isolation_level
4575 				 <= TRX_ISO_READ_COMMITTED)
4576 			    && prebuilt->select_lock_type != LOCK_NONE) {
4577 
4578 				/* Try to place a gap lock on the index
4579 				record only if innodb_locks_unsafe_for_binlog
4580 				option is not set or this session is not
4581 				using a READ COMMITTED isolation level. */
4582 
4583 				err = sel_set_rec_lock(
4584 					btr_pcur_get_block(pcur),
4585 					rec, index, offsets,
4586 					prebuilt->select_lock_type, LOCK_GAP,
4587 					thr);
4588 
4589 				switch (err) {
4590 				case DB_SUCCESS_LOCKED_REC:
4591 				case DB_SUCCESS:
4592 					break;
4593 				default:
4594 					goto lock_wait_or_error;
4595 				}
4596 			}
4597 
4598 			btr_pcur_store_position(pcur, &mtr);
4599 
4600 			/* The found record was not a match, but may be used
4601 			as NEXT record (index_next). Set the relative position
4602 			to BTR_PCUR_BEFORE, to reflect that the position of
4603 			the persistent cursor is before the found/stored row
4604 			(pcur->old_rec). */
4605 			ut_ad(pcur->rel_pos == BTR_PCUR_ON);
4606 			pcur->rel_pos = BTR_PCUR_BEFORE;
4607 
4608 			err = DB_RECORD_NOT_FOUND;
4609 #if 0
4610 			ut_print_name(stderr, trx, FALSE, index->name);
4611 			fputs(" record not found 3\n", stderr);
4612 #endif
4613 
4614 			goto normal_return;
4615 		}
4616 
4617 	} else if (match_mode == ROW_SEL_EXACT_PREFIX) {
4618 
4619 		if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, offsets)) {
4620 
4621 			if (set_also_gap_locks
4622 			    && !(srv_locks_unsafe_for_binlog
4623 				 || trx->isolation_level
4624 				 <= TRX_ISO_READ_COMMITTED)
4625 			    && prebuilt->select_lock_type != LOCK_NONE) {
4626 
4627 				/* Try to place a gap lock on the index
4628 				record only if innodb_locks_unsafe_for_binlog
4629 				option is not set or this session is not
4630 				using a READ COMMITTED isolation level. */
4631 
4632 				err = sel_set_rec_lock(
4633 					btr_pcur_get_block(pcur),
4634 					rec, index, offsets,
4635 					prebuilt->select_lock_type, LOCK_GAP,
4636 					thr);
4637 
4638 				switch (err) {
4639 				case DB_SUCCESS_LOCKED_REC:
4640 				case DB_SUCCESS:
4641 					break;
4642 				default:
4643 					goto lock_wait_or_error;
4644 				}
4645 			}
4646 
4647 			btr_pcur_store_position(pcur, &mtr);
4648 
4649 			/* The found record was not a match, but may be used
4650 			as NEXT record (index_next). Set the relative position
4651 			to BTR_PCUR_BEFORE, to reflect that the position of
4652 			the persistent cursor is before the found/stored row
4653 			(pcur->old_rec). */
4654 			ut_ad(pcur->rel_pos == BTR_PCUR_ON);
4655 			pcur->rel_pos = BTR_PCUR_BEFORE;
4656 
4657 			err = DB_RECORD_NOT_FOUND;
4658 #if 0
4659 			ut_print_name(stderr, trx, FALSE, index->name);
4660 			fputs(" record not found 4\n", stderr);
4661 #endif
4662 
4663 			goto normal_return;
4664 		}
4665 	}
4666 
4667 	/* We are ready to look at a possible new index entry in the result
4668 	set: the cursor is now placed on a user record */
4669 
4670 	if (prebuilt->select_lock_type != LOCK_NONE) {
4671 		/* Try to place a lock on the index record; note that delete
4672 		marked records are a special case in a unique search. If there
4673 		is a non-delete marked record, then it is enough to lock its
4674 		existence with LOCK_REC_NOT_GAP. */
4675 
		/* If the innodb_locks_unsafe_for_binlog option is used
		or this session is using a READ COMMITTED isolation
		level, we lock only the record, i.e., next-key locking is
		not used. */
4680 
4681 		ulint	lock_type;
4682 
4683 		if (!set_also_gap_locks
4684 		    || srv_locks_unsafe_for_binlog
4685 		    || trx->isolation_level <= TRX_ISO_READ_COMMITTED
4686 		    || (unique_search && !rec_get_deleted_flag(rec, comp))) {
4687 
4688 			goto no_gap_lock;
4689 		} else {
4690 			lock_type = LOCK_ORDINARY;
4691 		}
4692 
4693 		/* If we are doing a 'greater or equal than a primary key
4694 		value' search from a clustered index, and we find a record
4695 		that has that exact primary key value, then there is no need
4696 		to lock the gap before the record, because no insert in the
4697 		gap can be in our search range. That is, no phantom row can
4698 		appear that way.
4699 
4700 		An example: if col1 is the primary key, the search is WHERE
4701 		col1 >= 100, and we find a record where col1 = 100, then no
4702 		need to lock the gap before that record. */
4703 
4704 		if (index == clust_index
4705 		    && mode == PAGE_CUR_GE
4706 		    && direction == 0
4707 		    && dtuple_get_n_fields_cmp(search_tuple)
4708 		    == dict_index_get_n_unique(index)
4709 		    && 0 == cmp_dtuple_rec(search_tuple, rec, offsets)) {
4710 no_gap_lock:
4711 			lock_type = LOCK_REC_NOT_GAP;
4712 		}
4713 
4714 		err = sel_set_rec_lock(btr_pcur_get_block(pcur),
4715 				       rec, index, offsets,
4716 				       prebuilt->select_lock_type,
4717 				       lock_type, thr);
4718 
4719 		switch (err) {
4720 			const rec_t*	old_vers;
4721 		case DB_SUCCESS_LOCKED_REC:
4722 			if (srv_locks_unsafe_for_binlog
4723 			    || trx->isolation_level
4724 			    <= TRX_ISO_READ_COMMITTED) {
4725 				/* Note that a record of
4726 				prebuilt->index was locked. */
4727 				prebuilt->new_rec_locks = 1;
4728 			}
4729 			err = DB_SUCCESS;
4730 		case DB_SUCCESS:
4731 			break;
4732 		case DB_LOCK_WAIT:
4733 			/* Never unlock rows that were part of a conflict. */
4734 			prebuilt->new_rec_locks = 0;
4735 
4736 			if (UNIV_LIKELY(prebuilt->row_read_type
4737 					!= ROW_READ_TRY_SEMI_CONSISTENT)
4738 			    || unique_search
4739 			    || index != clust_index) {
4740 
4741 				goto lock_wait_or_error;
4742 			}
4743 
4744 			/* The following call returns 'offsets'
4745 			associated with 'old_vers' */
4746 			row_sel_build_committed_vers_for_mysql(
4747 				clust_index, prebuilt, rec,
4748 				&offsets, &heap, &old_vers, &mtr);
4749 
4750 			/* Check whether it was a deadlock or not, if not
4751 			a deadlock and the transaction had to wait then
4752 			release the lock it is waiting on. */
4753 
4754 			err = lock_trx_handle_wait(trx);
4755 
4756 			switch (err) {
4757 			case DB_SUCCESS:
4758 				/* The lock was granted while we were
4759 				searching for the last committed version.
4760 				Do a normal locking read. */
4761 
4762 				offsets = rec_get_offsets(
4763 					rec, index, offsets, ULINT_UNDEFINED,
4764 					&heap);
4765 				goto locks_ok;
4766 			case DB_DEADLOCK:
4767 				goto lock_wait_or_error;
4768 			case DB_LOCK_WAIT:
4769 				err = DB_SUCCESS;
4770 				break;
4771 			default:
4772 				ut_error;
4773 			}
4774 
4775 			if (old_vers == NULL) {
4776 				/* The row was not yet committed */
4777 
4778 				goto next_rec;
4779 			}
4780 
4781 			did_semi_consistent_read = TRUE;
4782 			rec = old_vers;
4783 			prev_rec = rec;
4784 			break;
4785 		default:
4786 
4787 			goto lock_wait_or_error;
4788 		}
4789 	} else {
4790 		/* This is a non-locking consistent read: if necessary, fetch
4791 		a previous version of the record */
4792 
4793 		if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
4794 
4795 			/* Do nothing: we let a non-locking SELECT read the
4796 			latest version of the record */
4797 
4798 		} else if (index == clust_index) {
4799 
4800 			/* Fetch a previous version of the row if the current
4801 			one is not visible in the snapshot; if we have a very
4802 			high force recovery level set, we try to avoid crashes
4803 			by skipping this lookup */
4804 
4805 			if (UNIV_LIKELY(srv_force_recovery < 5)
4806 			    && !lock_clust_rec_cons_read_sees(
4807 				    rec, index, offsets, trx->read_view)) {
4808 
4809 				rec_t*	old_vers;
4810 				/* The following call returns 'offsets'
4811 				associated with 'old_vers' */
4812 				err = row_sel_build_prev_vers_for_mysql(
4813 					trx->read_view, clust_index,
4814 					prebuilt, rec, &offsets, &heap,
4815 					&old_vers, &mtr);
4816 
4817 				if (err != DB_SUCCESS) {
4818 
4819 					goto lock_wait_or_error;
4820 				}
4821 
4822 				if (old_vers == NULL) {
4823 					/* The row did not exist yet in
4824 					the read view */
4825 
4826 					goto next_rec;
4827 				}
4828 
4829 				rec = old_vers;
4830 				prev_rec = rec;
4831 			}
4832 		} else {
4833 			/* We are looking into a non-clustered index,
4834 			and to get the right version of the record we
4835 			have to look also into the clustered index: this
4836 			is necessary, because we can only get the undo
4837 			information via the clustered index record. */
4838 
4839 			ut_ad(!dict_index_is_clust(index));
4840 
4841 			if (!lock_sec_rec_cons_read_sees(
4842 				    rec, trx->read_view)) {
4843 				/* We should look at the clustered index.
4844 				However, as this is a non-locking read,
4845 				we can skip the clustered index lookup if
4846 				the condition does not match the secondary
4847 				index entry. */
4848 				switch (row_search_idx_cond_check(
4849 						buf, prebuilt, rec, offsets)) {
4850 				case ICP_NO_MATCH:
4851 					goto next_rec;
4852 				case ICP_OUT_OF_RANGE:
4853 					err = DB_RECORD_NOT_FOUND;
4854 					goto idx_cond_failed;
4855 				case ICP_MATCH:
4856 					goto requires_clust_rec;
4857 				}
4858 
4859 				ut_error;
4860 			}
4861 		}
4862 	}
4863 
4864 locks_ok:
4865 	/* NOTE that at this point rec can be an old version of a clustered
4866 	index record built for a consistent read. We cannot assume after this
4867 	point that rec is on a buffer pool page. Functions like
4868 	page_rec_is_comp() cannot be used! */
4869 
4870 	if (rec_get_deleted_flag(rec, comp)) {
4871 
4872 		/* The record is delete-marked: we can skip it */
4873 
4874 		if ((srv_locks_unsafe_for_binlog
4875 		     || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
4876 		    && prebuilt->select_lock_type != LOCK_NONE
4877 		    && !did_semi_consistent_read) {
4878 
4879 			/* No need to keep a lock on a delete-marked record
4880 			if we do not want to use next-key locking. */
4881 
4882 			row_unlock_for_mysql(prebuilt, TRUE);
4883 		}
4884 
4885 		/* This is an optimization to skip setting the next key lock
4886 		on the record that follows this delete-marked record. This
4887 		optimization works because of the unique search criteria
4888 		which precludes the presence of a range lock between this
4889 		delete marked record and the record following it.
4890 
4891 		For now this is applicable only to clustered indexes while
4892 		doing a unique search except for HANDLER queries because
4893 		HANDLER allows NEXT and PREV even in unique search on
4894 		clustered index. There is scope for further optimization
4895 		applicable to unique secondary indexes. Current behaviour is
4896 		to widen the scope of a lock on an already delete marked record
4897 		if the same record is deleted twice by the same transaction */
4898 		if (index == clust_index && unique_search
4899 		    && !prebuilt->used_in_HANDLER) {
4900 
4901 			err = DB_RECORD_NOT_FOUND;
4902 
4903 			goto normal_return;
4904 		}
4905 
4906 		goto next_rec;
4907 	}
4908 
4909 	/* Check if the record matches the index condition. */
4910 	switch (row_search_idx_cond_check(buf, prebuilt, rec, offsets)) {
4911 	case ICP_NO_MATCH:
4912 		if (did_semi_consistent_read) {
4913 			row_unlock_for_mysql(prebuilt, TRUE);
4914 		}
4915 		goto next_rec;
4916 	case ICP_OUT_OF_RANGE:
4917 		err = DB_RECORD_NOT_FOUND;
4918 		goto idx_cond_failed;
4919 	case ICP_MATCH:
4920 		break;
4921 	}
4922 
4923 	/* Get the clustered index record if needed, if we did not do the
4924 	search using the clustered index. */
4925 
4926 	if (index != clust_index && prebuilt->need_to_access_clustered) {
4927 
4928 requires_clust_rec:
4929 		ut_ad(index != clust_index);
4930 		/* We use a 'goto' to the preceding label if a consistent
4931 		read of a secondary index record requires us to look up old
4932 		versions of the associated clustered index record. */
4933 
4934 		ut_ad(rec_offs_validate(rec, index, offsets));
4935 
4936 		/* It was a non-clustered index and we must fetch also the
4937 		clustered index record */
4938 
4939 		mtr_has_extra_clust_latch = TRUE;
4940 
4941 		/* The following call returns 'offsets' associated with
4942 		'clust_rec'. Note that 'clust_rec' can be an old version
4943 		built for a consistent read. */
4944 
4945 		err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec,
4946 						      thr, &clust_rec,
4947 						      &offsets, &heap, &mtr);
4948 		switch (err) {
4949 		case DB_SUCCESS:
4950 			if (clust_rec == NULL) {
4951 				/* The record did not exist in the read view */
4952 				ut_ad(prebuilt->select_lock_type == LOCK_NONE);
4953 
4954 				goto next_rec;
4955 			}
4956 			break;
4957 		case DB_SUCCESS_LOCKED_REC:
4958 			ut_a(clust_rec != NULL);
4959 			if (srv_locks_unsafe_for_binlog
4960 			     || trx->isolation_level
4961 			    <= TRX_ISO_READ_COMMITTED) {
4962 				/* Note that the clustered index record
4963 				was locked. */
4964 				prebuilt->new_rec_locks = 2;
4965 			}
4966 			err = DB_SUCCESS;
4967 			break;
4968 		default:
4969 			goto lock_wait_or_error;
4970 		}
4971 
4972 		if (rec_get_deleted_flag(clust_rec, comp)) {
4973 
4974 			/* The record is delete marked: we can skip it */
4975 
4976 			if ((srv_locks_unsafe_for_binlog
4977 			     || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
4978 			    && prebuilt->select_lock_type != LOCK_NONE) {
4979 
4980 				/* No need to keep a lock on a delete-marked
4981 				record if we do not want to use next-key
4982 				locking. */
4983 
4984 				row_unlock_for_mysql(prebuilt, TRUE);
4985 			}
4986 
4987 			goto next_rec;
4988 		}
4989 
4990 		result_rec = clust_rec;
4991 		ut_ad(rec_offs_validate(result_rec, clust_index, offsets));
4992 
4993 		if (prebuilt->idx_cond) {
4994 			/* Convert the record to MySQL format. We were
4995 			unable to do this in row_search_idx_cond_check(),
4996 			because the condition is on the secondary index
4997 			and the requested column is in the clustered index.
4998 			We convert all fields, including those that
4999 			may have been used in ICP, because the
5000 			secondary index may contain a column prefix
5001 			rather than the full column. Also, as noted
5002 			in Bug #56680, the column in the secondary
5003 			index may be in the wrong case, and the
5004 			authoritative case is in result_rec, the
5005 			appropriate version of the clustered index record. */
5006 			if (!row_sel_store_mysql_rec(
5007 				    buf, prebuilt, result_rec,
5008 				    TRUE, clust_index, offsets, false)) {
5009 				goto next_rec;
5010 			}
5011 		}
5012 	} else {
5013 		result_rec = rec;
5014 	}
5015 
5016 	/* We found a qualifying record 'result_rec'. At this point,
5017 	'offsets' are associated with 'result_rec'. */
5018 
5019 	ut_ad(rec_offs_validate(result_rec,
5020 				result_rec != rec ? clust_index : index,
5021 				offsets));
5022 	ut_ad(!rec_get_deleted_flag(result_rec, comp));
5023 
5024 	/* At this point, the clustered index record is protected
5025 	by a page latch that was acquired when pcur was positioned.
5026 	The latch will not be released until mtr_commit(&mtr). */
5027 
5028 	if ((match_mode == ROW_SEL_EXACT
5029 	     || prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD)
5030 	    && prebuilt->select_lock_type == LOCK_NONE
5031 	    && !prebuilt->templ_contains_blob
5032 	    && !prebuilt->clust_index_was_generated
5033 	    && !prebuilt->used_in_HANDLER
5034 	    && !prebuilt->innodb_api
5035 	    && prebuilt->template_type
5036 	    != ROW_MYSQL_DUMMY_TEMPLATE
5037 	    && !prebuilt->in_fts_query) {
5038 
5039 		/* Inside an update, for example, we do not cache rows,
5040 		since we may use the cursor position to do the actual
5041 		update, that is why we require ...lock_type == LOCK_NONE.
5042 		Since we keep space in prebuilt only for the BLOBs of
5043 		a single row, we cannot cache rows in the case there
5044 		are BLOBs in the fields to be fetched. In HANDLER we do
5045 		not cache rows because there the cursor is a scrollable
5046 		cursor. */
5047 
5048 		ut_a(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
5049 
5050 		/* We only convert from InnoDB row format to MySQL row
5051 		format when ICP is disabled. */
5052 
5053 		if (!prebuilt->idx_cond) {
5054 
5055 			/* We use next_buf to track the allocation of buffers
5056 			where we store and enqueue the buffers for our
5057 			pre-fetch optimisation.
5058 
5059 			If next_buf == 0 then we store the converted record
5060 			directly into the MySQL record buffer (buf). If it is
5061 			!= 0 then we allocate a pre-fetch buffer and store the
5062 			converted record there.
5063 
5064 			If the conversion fails and the MySQL record buffer
5065 			was not written to then we reset next_buf so that
5066 			we can re-use the MySQL record buffer in the next
5067 			iteration. */
5068 
5069 			next_buf = next_buf
5070 				 ? row_sel_fetch_last_buf(prebuilt) : buf;
5071 
5072 			if (!row_sel_store_mysql_rec(
5073 				next_buf, prebuilt, result_rec,
5074 				result_rec != rec,
5075 				result_rec != rec ? clust_index : index,
5076 				offsets, false)) {
5077 
5078 				if (next_buf == buf) {
5079 					ut_a(prebuilt->n_fetch_cached == 0);
5080 					next_buf = 0;
5081 				}
5082 
5083 				/* Only fresh inserts may contain incomplete
5084 				externally stored columns. Pretend that such
5085 				records do not exist. Such records may only be
5086 				accessed at the READ UNCOMMITTED isolation
5087 				level or when rolling back a recovered
5088 				transaction. Rollback happens at a lower
5089 				level, not here. */
5090 				goto next_rec;
5091 			}
5092 
5093 			if (next_buf != buf) {
5094 				row_sel_enqueue_cache_row_for_mysql(
5095 					next_buf, prebuilt);
5096 			}
5097 		} else {
5098 			row_sel_enqueue_cache_row_for_mysql(buf, prebuilt);
5099 		}
5100 
5101 		if (prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE) {
5102 			goto next_rec;
5103 		}
5104 
5105 	} else {
5106 		if (UNIV_UNLIKELY
5107 		    (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE)) {
5108 			/* CHECK TABLE: fetch the row */
5109 
5110 			if (result_rec != rec
5111 			    && !prebuilt->need_to_access_clustered) {
5112 				/* We used 'offsets' for the clust
5113 				rec, recalculate them for 'rec' */
5114 				offsets = rec_get_offsets(rec, index, offsets,
5115 							  ULINT_UNDEFINED,
5116 							  &heap);
5117 				result_rec = rec;
5118 			}
5119 
5120 			memcpy(buf + 4, result_rec
5121 			       - rec_offs_extra_size(offsets),
5122 			       rec_offs_size(offsets));
5123 			mach_write_to_4(buf,
5124 					rec_offs_extra_size(offsets) + 4);
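			/* Illustrative note on the buffer layout produced
			above (a sketch for readers, not server logic):

				buf[0..3] = rec_offs_extra_size(offsets) + 4
				buf[4..]  = raw record bytes, extra (header)
					    bytes first, then the data bytes

			so a consumer could find the record origin with e.g.

				const rec_t*	origin
					= buf + mach_read_from_4(buf);
			*/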
5125 		} else if (!prebuilt->idx_cond && !prebuilt->innodb_api) {
5126 			/* The record was not yet converted to MySQL format. */
5127 			if (!row_sel_store_mysql_rec(
5128 				    buf, prebuilt, result_rec,
5129 				    result_rec != rec,
5130 				    result_rec != rec ? clust_index : index,
5131 				    offsets, false)) {
5132 				/* Only fresh inserts may contain
5133 				incomplete externally stored
5134 				columns. Pretend that such records do
5135 				not exist. Such records may only be
5136 				accessed at the READ UNCOMMITTED
5137 				isolation level or when rolling back a
5138 				recovered transaction. Rollback
5139 				happens at a lower level, not here. */
5140 				goto next_rec;
5141 			}
5142 		}
5143 
5144 		if (prebuilt->clust_index_was_generated) {
5145 			row_sel_store_row_id_to_prebuilt(
5146 				prebuilt, result_rec,
5147 				result_rec == rec ? index : clust_index,
5148 				offsets);
5149 		}
5150 	}
5151 
5152 	/* From this point on, 'offsets' are invalid. */
5153 
5154 	/* We have an optimization to save CPU time: if this is a consistent
5155 	read on a unique condition on the clustered index, then we do not
5156 	store the pcur position, because any fetch next or prev will anyway
5157 	return 'end of file'. Exceptions are locking reads and the MySQL
5158 	HANDLER command where the user can move the cursor with PREV or NEXT
5159 	even after a unique search. */
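	/* Example (illustrative only): a consistent read such as

		SELECT c1 FROM t WHERE pk_col = 5;

	is a unique search on the clustered index with LOCK_NONE, so the
	branch below is skipped and the cursor position is not stored.
	The same statement with FOR UPDATE, or issued through HANDLER,
	takes the branch below and stores the position. */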
5160 
5161 	err = DB_SUCCESS;
5162 
5163 idx_cond_failed:
5164 	if (!unique_search
5165 	    || !dict_index_is_clust(index)
5166 	    || direction != 0
5167 	    || prebuilt->select_lock_type != LOCK_NONE
5168 	    || prebuilt->used_in_HANDLER
5169 	    || prebuilt->innodb_api) {
5170 
5171 		/* Inside an update always store the cursor position */
5172 
5173 		btr_pcur_store_position(pcur, &mtr);
5174 
5175 		if (prebuilt->innodb_api
5176 		    && (btr_pcur_get_rec(pcur) != result_rec)) {
5177 			ulint	rec_size = rec_offs_size(offsets);
5178 			if (!prebuilt->innodb_api_rec_size
5179 			    || prebuilt->innodb_api_rec_size < rec_size) {
5180 				prebuilt->innodb_api_buf = static_cast<byte*>(
5181 					mem_heap_alloc(
5182 						prebuilt->cursor_heap, rec_size));
5183 				prebuilt->innodb_api_rec_size = rec_size;
5184 			}
5185 			prebuilt->innodb_api_rec =
5186 			      rec_copy(
5187 			       prebuilt->innodb_api_buf, result_rec, offsets);
5188 		}
5189 	}
5190 
5191 	goto normal_return;
5192 
5193 next_rec:
5194 	end_loop++;
5195 
5196 	/* Reset the old and new "did semi-consistent read" flags. */
5197 	if (UNIV_UNLIKELY(prebuilt->row_read_type
5198 			  == ROW_READ_DID_SEMI_CONSISTENT)) {
5199 		prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
5200 	}
5201 	did_semi_consistent_read = FALSE;
5202 	prebuilt->new_rec_locks = 0;
5203 
5204 	/*-------------------------------------------------------------*/
5205 	/* PHASE 5: Move the cursor to the next index record */
5206 
5207 	/* NOTE: For moves_up==FALSE, the mini-transaction will be
5208 	committed and restarted every time we switch b-tree
5209 	pages. For moves_up==TRUE in index condition pushdown, we can
5210 	scan an entire secondary index tree within a single
5211 	mini-transaction. As long as the prebuilt->idx_cond does not
5212 	match, we do not need to consult the clustered index or
5213 	return records to MySQL, and thus we can avoid repositioning
5214 	the cursor. What prevents us from buffer-fixing all leaf pages
5215 	within the mini-transaction is the btr_leaf_page_release()
5216 	call in btr_pcur_move_to_next_page(). Only the leaf page where
5217 	the cursor is positioned will remain buffer-fixed. */
5218 
5219 	if (UNIV_UNLIKELY(mtr_has_extra_clust_latch)) {
5220 		/* We must commit mtr if we are moving to the next
5221 		non-clustered index record, because we could break the
5222 		latching order if we would access a different clustered
5223 		index page right away without releasing the previous. */
5224 
5225 		btr_pcur_store_position(pcur, &mtr);
5226 
5227 		mtr_commit(&mtr);
5228 		mtr_has_extra_clust_latch = FALSE;
5229 
5230 		mtr_start(&mtr);
5231 		if (sel_restore_position_for_mysql(&same_user_rec,
5232 						   BTR_SEARCH_LEAF,
5233 						   pcur, moves_up, &mtr)) {
5234 #ifdef UNIV_SEARCH_DEBUG
5235 			cnt++;
5236 #endif /* UNIV_SEARCH_DEBUG */
5237 
5238 			goto rec_loop;
5239 		}
5240 	}
5241 
5242 	if (moves_up) {
5243 		if (UNIV_UNLIKELY(!btr_pcur_move_to_next(pcur, &mtr))) {
5244 not_moved:
5245 			btr_pcur_store_position(pcur, &mtr);
5246 
5247 			if (match_mode != 0) {
5248 				err = DB_RECORD_NOT_FOUND;
5249 			} else {
5250 				err = DB_END_OF_INDEX;
5251 			}
5252 
5253 			goto normal_return;
5254 		}
5255 	} else {
5256 		if (UNIV_UNLIKELY(!btr_pcur_move_to_prev(pcur, &mtr))) {
5257 			goto not_moved;
5258 		}
5259 	}
5260 
5261 #ifdef UNIV_SEARCH_DEBUG
5262 	cnt++;
5263 #endif /* UNIV_SEARCH_DEBUG */
5264 
5265 	goto rec_loop;
5266 
5267 lock_wait_or_error:
5268 	/* Reset the old and new "did semi-consistent read" flags. */
5269 	if (UNIV_UNLIKELY(prebuilt->row_read_type
5270 			  == ROW_READ_DID_SEMI_CONSISTENT)) {
5271 		prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
5272 	}
5273 	did_semi_consistent_read = FALSE;
5274 
5275 	/*-------------------------------------------------------------*/
5276 
5277 	btr_pcur_store_position(pcur, &mtr);
5278 
5279 lock_table_wait:
5280 	mtr_commit(&mtr);
5281 	mtr_has_extra_clust_latch = FALSE;
5282 
5283 	trx->error_state = err;
5284 
5285 	/* The following is a patch for MySQL */
5286 
5287 	que_thr_stop_for_mysql(thr);
5288 
5289 	thr->lock_state = QUE_THR_LOCK_ROW;
5290 
5291 	if (row_mysql_handle_errors(&err, trx, thr, NULL)) {
5292 		/* It was a lock wait, and it ended */
5293 
5294 		thr->lock_state = QUE_THR_LOCK_NOLOCK;
5295 		mtr_start(&mtr);
5296 
5297 		/* Table lock waited, go try to obtain table lock
5298 		again */
5299 		if (table_lock_waited) {
5300 			table_lock_waited = FALSE;
5301 
5302 			goto wait_table_again;
5303 		}
5304 
5305 		sel_restore_position_for_mysql(&same_user_rec,
5306 					       BTR_SEARCH_LEAF, pcur,
5307 					       moves_up, &mtr);
5308 
5309 		if ((srv_locks_unsafe_for_binlog
5310 		     || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
5311 		    && !same_user_rec) {
5312 
5313 			/* Since we were not able to restore the cursor
5314 			on the same user record, we cannot use
5315 			row_unlock_for_mysql() to unlock any records, and
5316 			we must thus reset the new rec lock info. Since
5317 			in lock0lock.cc we have blocked the inheriting of gap
5318 			X-locks, we actually do not have any new record locks
5319 			set in this case.
5320 
5321 			Note that if we were able to restore on the 'same'
5322 			user record, it is still possible that we were actually
5323 			waiting on a delete-marked record, and meanwhile
5324 			it was removed by purge and inserted again by some
5325 			other user. But that is no problem, because in
5326 			rec_loop we will again try to set a lock, and
5327 			new_rec_lock_info in trx will be right at the end. */
5328 
5329 			prebuilt->new_rec_locks = 0;
5330 		}
5331 
5332 		mode = pcur->search_mode;
5333 
5334 		goto rec_loop;
5335 	}
5336 
5337 	thr->lock_state = QUE_THR_LOCK_NOLOCK;
5338 
5339 #ifdef UNIV_SEARCH_DEBUG
5340 	/*	fputs("Using ", stderr);
5341 	dict_index_name_print(stderr, index);
5342 	fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
5343 #endif /* UNIV_SEARCH_DEBUG */
5344 	goto func_exit;
5345 
5346 normal_return:
5347 	/*-------------------------------------------------------------*/
5348 	que_thr_stop_for_mysql_no_error(thr, trx);
5349 
5350 	mtr_commit(&mtr);
5351 
5352 	if (prebuilt->idx_cond != 0) {
5353 
5354 		/* When ICP is active we don't write to the MySQL buffer
5355 		directly, only to buffers that are enqueued in the pre-fetch
5356 		queue. We need to dequeue the first buffer and copy the contents
5357 		to the record buffer that was passed in by MySQL. */
5358 
5359 		if (prebuilt->n_fetch_cached > 0) {
5360 			row_sel_dequeue_cached_row_for_mysql(buf, prebuilt);
5361 			err = DB_SUCCESS;
5362 		}
5363 
5364 	} else if (next_buf != 0) {
5365 
5366 		/* We may or may not have enqueued some buffers to the
5367 		pre-fetch queue, but we definitely wrote to the record
5368 		buffer passed to us by MySQL. */
5369 
5370 		DEBUG_SYNC_C("row_search_cached_row");
5371 		err = DB_SUCCESS;
5372 	}
5373 
5374 #ifdef UNIV_SEARCH_DEBUG
5375 	/*	fputs("Using ", stderr);
5376 	dict_index_name_print(stderr, index);
5377 	fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
5378 #endif /* UNIV_SEARCH_DEBUG */
5379 
5380 func_exit:
5381 	trx->op_info = "";
5382 
5383 	if (end_range_cache != NULL) {
5384 		ut_free(end_range_cache);
5385 	}
5386 
5387 	if (UNIV_LIKELY_NULL(heap)) {
5388 		mem_heap_free(heap);
5389 	}
5390 
5391 	/* Set or reset the "did semi-consistent read" flag on return.
5392 	The flag did_semi_consistent_read is set if and only if
5393 	the record being returned was fetched with a semi-consistent read. */
5394 	ut_ad(prebuilt->row_read_type != ROW_READ_WITH_LOCKS
5395 	      || !did_semi_consistent_read);
5396 
5397 	if (UNIV_UNLIKELY(prebuilt->row_read_type != ROW_READ_WITH_LOCKS)) {
5398 		if (UNIV_UNLIKELY(did_semi_consistent_read)) {
5399 			prebuilt->row_read_type = ROW_READ_DID_SEMI_CONSISTENT;
5400 		} else {
5401 			prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
5402 		}
5403 	}
5404 
5405 #ifdef UNIV_SYNC_DEBUG
5406 	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
5407 #endif /* UNIV_SYNC_DEBUG */
5408 
5409 	DEBUG_SYNC_C("innodb_row_search_for_mysql_exit");
5410 
5411 	return(err);
5412 }
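/* Illustrative sketch only (not compiled into the server): a handler-level
fetch loop drives the function above roughly as follows, assuming the
signature row_search_for_mysql(buf, mode, prebuilt, match_mode, direction).
Error handling and the mapping of dberr_t values to handler error codes are
omitted, and the search mode depends on the caller.

	dberr_t	ret = row_search_for_mysql(buf, PAGE_CUR_GE, prebuilt, 0, 0);

	while (ret == DB_SUCCESS) {
		... use the row that was converted into buf ...

		ret = row_search_for_mysql(buf, PAGE_CUR_GE, prebuilt,
					   0, ROW_SEL_NEXT);
	}
*/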
5413 
5414 /*******************************************************************//**
5415 Checks if MySQL is at the moment allowed, for this table, to retrieve a
5416 consistent read result from the query cache, or to store one into it.
5417 @return	TRUE if storing or retrieving from the query cache is permitted */
5418 UNIV_INTERN
5419 ibool
5420 row_search_check_if_query_cache_permitted(
5421 /*======================================*/
5422 	trx_t*		trx,		/*!< in: transaction object */
5423 	const char*	norm_name)	/*!< in: concatenation of database name,
5424 					'/' char, table name */
5425 {
5426 	dict_table_t*	table;
5427 	ibool		ret	= FALSE;
5428 
5429 	/* Disable the query cache altogether for all tables if recovered XA
5430 	transactions in prepared state exist. This is because we do not
5431 	restore the table locks for those transactions, and we might wrongly
5432 	set ret=TRUE below if "lock_table_get_n_locks(table) == 0". See
5433 	"Bug#14658648 XA ROLLBACK (DISTRIBUTED DATABASE) NOT WORKING WITH
5434 	QUERY CACHE ENABLED".
5435 	We read trx_sys->n_prepared_recovered_trx without mutex protection;
5436 	a torn read is not possible, because n_prepared_recovered_trx is
5437 	word-sized. */
5438 	if (trx_sys->n_prepared_recovered_trx > 0) {
5439 
5440 		return(FALSE);
5441 	}
5442 
5443 	table = dict_table_open_on_name(norm_name, FALSE, FALSE,
5444 					DICT_ERR_IGNORE_NONE);
5445 
5446 	if (table == NULL) {
5447 
5448 		return(FALSE);
5449 	}
5450 
5451 	/* Start the transaction if it is not started yet */
5452 
5453 	trx_start_if_not_started(trx);
5454 
5455 	/* If there are locks on the table or some trx has invalidated the
5456 	cache up to our trx id, then ret = FALSE.
5457 	We do not check what type locks there are on the table, though only
5458 	IX type locks actually would require ret = FALSE. */
5459 
5460 	if (lock_table_get_n_locks(table) == 0
5461 	    && trx->id >= table->query_cache_inv_trx_id) {
5462 
5463 		ret = TRUE;
5464 
5465 		/* If the isolation level is high, assign a read view for the
5466 		transaction if it does not yet have one */
5467 
5468 		if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ
5469 		    && !trx->read_view) {
5470 
5471 			trx->read_view = read_view_open_now(
5472 				trx->id, trx->global_read_view_heap);
5473 
5474 			trx->global_read_view = trx->read_view;
5475 		}
5476 	}
5477 
5478 	dict_table_close(table, FALSE, FALSE);
5479 
5480 	return(ret);
5481 }
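/* Illustrative sketch only: the query cache hook in ha_innodb.cc consults
the function above with a normalized "dbname/tablename" string, roughly
like this (norm_name and its size are assumptions made for the example):

	char	norm_name[1000];

	... build norm_name as "dbname/tablename" ...

	if (row_search_check_if_query_cache_permitted(trx, norm_name)) {
		... MySQL may use or populate the query cache ...
	}
*/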
5482 
5483 /*******************************************************************//**
5484 Read the AUTOINC column from the current row. If the value is less than
5485 0 and the type is signed, then we reset the value to 0.
5486 @return	value read from the column */
5487 static
5488 ib_uint64_t
5489 row_search_autoinc_read_column(
5490 /*===========================*/
5491 	dict_index_t*	index,		/*!< in: index to read from */
5492 	const rec_t*	rec,		/*!< in: current rec */
5493 	ulint		col_no,		/*!< in: column number */
5494 	ulint		mtype,		/*!< in: column main type */
5495 	ibool		unsigned_type)	/*!< in: signed or unsigned flag */
5496 {
5497 	ulint		len;
5498 	const byte*	data;
5499 	ib_uint64_t	value;
5500 	mem_heap_t*	heap = NULL;
5501 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
5502 	ulint*		offsets	= offsets_;
5503 
5504 	rec_offs_init(offsets_);
5505 
5506 	offsets = rec_get_offsets(rec, index, offsets, col_no + 1, &heap);
5507 
5508 	if (rec_offs_nth_sql_null(offsets, col_no)) {
5509 		/* There is no non-NULL value in the auto-increment column. */
5510 		value = 0;
5511 		goto func_exit;
5512 	}
5513 
5514 	data = rec_get_nth_field(rec, offsets, col_no, &len);
5515 
5516 	switch (mtype) {
5517 	case DATA_INT:
5518 		ut_a(len <= sizeof value);
5519 		value = mach_read_int_type(data, len, unsigned_type);
5520 		break;
5521 
5522 	case DATA_FLOAT:
5523 		ut_a(len == sizeof(float));
5524 		value = (ib_uint64_t) mach_float_read(data);
5525 		break;
5526 
5527 	case DATA_DOUBLE:
5528 		ut_a(len == sizeof(double));
5529 		value = (ib_uint64_t) mach_double_read(data);
5530 		break;
5531 
5532 	default:
5533 		ut_error;
5534 	}
5535 
5536 	if (!unsigned_type && (ib_int64_t) value < 0) {
5537 		value = 0;
5538 	}
5539 
5540 func_exit:
5541 	if (UNIV_LIKELY_NULL(heap)) {
5542 		mem_heap_free(heap);
5543 	}
5544 
5545 	return(value);
5546 }
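/* Illustrative note: the clamp above means that a signed AUTOINC column
holding a negative value is reported as 0, e.g.

	stored INT value -42, unsigned_type == FALSE  ->  returns 0
	stored INT value  42, unsigned_type == FALSE  ->  returns 42
*/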
5547 
5548 /** Get the last (i.e. maximum) non-delete-marked record in an index.
5549 @param[in]	index	index tree
5550 @param[in,out]	mtr	mini-transaction (may be committed and restarted)
5551 @return maximum record, page s-latched in mtr
5552 @retval NULL if there are no records, or if all of them are delete-marked */
5553 static
5554 const rec_t*
5555 row_search_get_max_rec(
5556 	dict_index_t*	index,
5557 	mtr_t*		mtr)
5558 {
5559 	btr_pcur_t	pcur;
5560 	const rec_t*	rec;
5561 	/* Open at the high/right end (false), and init cursor */
5562 	btr_pcur_open_at_index_side(
5563 		false, index, BTR_SEARCH_LEAF, &pcur, true, 0, mtr);
5564 
5565 	do {
5566 		const page_t*	page;
5567 
5568 		page = btr_pcur_get_page(&pcur);
5569 		rec = page_find_rec_max_not_deleted(page);
5570 
5571 		if (page_rec_is_user_rec(rec)) {
5572 			break;
5573 		} else {
5574 			rec = NULL;
5575 		}
5576 		btr_pcur_move_before_first_on_page(&pcur);
5577 	} while (btr_pcur_move_to_prev(&pcur, mtr));
5578 
5579 	btr_pcur_close(&pcur);
5580 
5581 	return(rec);
5582 }
5583 
5584 /*******************************************************************//**
5585 Read the max AUTOINC value from an index.
5586 @return DB_SUCCESS if all OK, else an error code; DB_RECORD_NOT_FOUND
5587 if the column name cannot be found in the index */
5588 UNIV_INTERN
5589 dberr_t
5590 row_search_max_autoinc(
5591 /*===================*/
5592 	dict_index_t*	index,		/*!< in: index to search */
5593 	const char*	col_name,	/*!< in: name of autoinc column */
5594 	ib_uint64_t*	value)		/*!< out: AUTOINC value read */
5595 {
5596 	dict_field_t*	dfield = dict_index_get_nth_field(index, 0);
5597 	dberr_t		error = DB_SUCCESS;
5598 	*value = 0;
5599 
5600 	if (strcmp(col_name, dfield->name) != 0) {
5601 		error = DB_RECORD_NOT_FOUND;
5602 	} else {
5603 		mtr_t		mtr;
5604 		const rec_t*	rec;
5605 
5606 		mtr_start(&mtr);
5607 
5608 		rec = row_search_get_max_rec(index, &mtr);
5609 
5610 		if (rec != NULL) {
5611 			ibool unsigned_type = (
5612 				dfield->col->prtype & DATA_UNSIGNED);
5613 
5614 			*value = row_search_autoinc_read_column(
5615 				index, rec, 0,
5616 				dfield->col->mtype, unsigned_type);
5617 		}
5618 
5619 		mtr_commit(&mtr);
5620 	}
5621 
5622 	return(error);
5623 }
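/* Illustrative sketch only: a caller initializing a table's AUTOINC
counter could use the function above roughly like this ("id" and the
surrounding variables are assumptions made for the example):

	ib_uint64_t	max_value = 0;
	dberr_t		err = row_search_max_autoinc(index, "id", &max_value);

	if (err == DB_SUCCESS) {
		... the next value to hand out is max_value + 1 ...
	} else if (err == DB_RECORD_NOT_FOUND) {
		... "id" is not the first column of this index ...
	}
*/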
5624