1 /* Copyright (C) 2000-2006 MySQL AB
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; version 2 of the License.
6 
7    This program is distributed in the hope that it will be useful,
8    but WITHOUT ANY WARRANTY; without even the implied warranty of
9    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10    GNU General Public License for more details.
11 
12    You should have received a copy of the GNU General Public License
13    along with this program; if not, write to the Free Software
14    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
15 
16 /**
17   @file
18 
19   @brief
20   join cache optimizations
21 
22   @defgroup Query_Optimizer  Query Optimizer
23   @{
24 */
25 
26 #ifdef USE_PRAGMA_IMPLEMENTATION
27 #pragma implementation				// gcc: Class implementation
28 #endif
29 
30 #include "mariadb.h"
31 #include "key.h"
32 #include "sql_base.h"
33 #include "sql_select.h"
34 #include "opt_subselect.h"
35 
36 #define NO_MORE_RECORDS_IN_BUFFER  (uint)(-1)
37 
38 static void save_or_restore_used_tabs(JOIN_TAB *join_tab, bool save);
39 
40 /*****************************************************************************
41  *  Join cache module
42 ******************************************************************************/
43 
44 /*
45   Fill in the descriptor of a flag field associated with a join cache
46 
47   SYNOPSIS
    add_flag_field_to_join_cache()
49       str           position in a record buffer to copy the field from/to
50       length        length of the field
51       field  IN/OUT pointer to the field descriptor to fill in
52 
53   DESCRIPTION
    The function fills in the descriptor of the cache flag field to which
    the parameter 'field' points. The function uses the first two
    parameters to set the position in the record buffer from/to which
    the field value is to be copied and the length of the copied fragment.
    Before returning the result the function increments the value of
    *field by 1.
    The function ignores the fields 'blob_length' and 'offset' of the
    descriptor.
62 
63   RETURN VALUE
64     the length of the field
65 */
66 
67 static
uint add_flag_field_to_join_cache(uchar *str, uint length, CACHE_FIELD **field)
69 {
70   CACHE_FIELD *copy= *field;
71   copy->str= str;
72   copy->length= length;
73   copy->type= 0;
74   copy->field= 0;
75   copy->referenced_field_no= 0;
76   (*field)++;
77   return length;
78 }
79 
80 
81 /*
82   Fill in the descriptors of table data fields associated with a join cache
83 
84   SYNOPSIS
85     add_table_data_fields_to_join_cache()
86       tab              descriptors of fields from this table are to be filled
87       field_set        descriptors for only these fields are to be created
88       field_cnt IN/OUT     counter of data fields
89       descr  IN/OUT        pointer to the first descriptor to be filled
90       field_ptr_cnt IN/OUT counter of pointers to the data fields
91       descr_ptr IN/OUT     pointer to the first pointer to blob descriptors
92 
93   DESCRIPTION
94     The function fills in the descriptors of cache data fields from the table
95     'tab'. The descriptors are filled only for the fields marked in the
96     bitmap 'field_set'.
    The function fills the descriptors starting from the position pointed
    to by 'descr'. If an added field is of a BLOB type then a pointer to
    its descriptor is added to the array descr_ptr.
    On return 'descr' points to the position after the last added
    descriptor while 'descr_ptr' points to the position right after the
    last added pointer.
103 
104   RETURN VALUE
105     the total length of the added fields
106 */
107 
108 static
uint add_table_data_fields_to_join_cache(JOIN_TAB *tab,
110                                          MY_BITMAP *field_set,
111                                          uint *field_cnt,
112                                          CACHE_FIELD **descr,
113                                          uint *field_ptr_cnt,
114                                          CACHE_FIELD ***descr_ptr)
115 {
116   Field **fld_ptr;
117   uint len= 0;
118   CACHE_FIELD *copy= *descr;
119   CACHE_FIELD **copy_ptr= *descr_ptr;
120   uint used_fields= bitmap_bits_set(field_set);
121   for (fld_ptr= tab->table->field; used_fields; fld_ptr++)
122   {
123     if (bitmap_is_set(field_set, (*fld_ptr)->field_index))
124     {
125       len+= (*fld_ptr)->fill_cache_field(copy);
126       if (copy->type == CACHE_BLOB)
127       {
128         *copy_ptr= copy;
129         copy_ptr++;
130         (*field_ptr_cnt)++;
131       }
132       copy->field= *fld_ptr;
133       copy->referenced_field_no= 0;
134       copy++;
135       (*field_cnt)++;
136       used_fields--;
137     }
138   }
139   *descr= copy;
140   *descr_ptr= copy_ptr;
141   return len;
142 }
143 
144 /*
145   Determine different counters of fields associated with a record in the cache
146 
147   SYNOPSIS
148     calc_record_fields()
149 
150   DESCRIPTION
151     The function counts the number of total fields stored in a record
152     of the cache and saves this number in the 'fields' member. It also
153     determines the number of flag fields and the number of blobs.
154     The function sets 'with_match_flag' on if 'join_tab' needs a match flag
155     i.e. if it is the first inner table of an outer join or a semi-join.
156 
157   RETURN VALUE
158     none
159 */
160 
void JOIN_CACHE::calc_record_fields()
162 {
163   JOIN_TAB *tab;
164 
165   if (prev_cache)
166     tab= prev_cache->join_tab;
167   else
168   {
169     if (join_tab->bush_root_tab)
170     {
171       /*
172         --ot1--SJM1--------------ot2--...
173                 |
174                 |
175                 +-it1--...--itN
176                         ^____________ this->join_tab is somewhere here,
177                                       inside an sjm nest.
178 
179         The join buffer should store the values of it1.*, it2.*, ..
180         It should not store values of ot1.*.
181       */
182       tab= join_tab->bush_root_tab->bush_children->start;
183     }
184     else
185     {
186       /*
187         -ot1--ot2--SJM1--SJM2--------------ot3--...--otN
188                     |     |                      ^
189                     |     +-it21--...--it2N      |
190                     |                            \-- we're somewhere here,
191                     +-it11--...--it1N                at the top level
192 
193         The join buffer should store the values of
194 
195           ot1.*, ot2.*, it1{i}, it2{j}.*, ot3.*, ...
196 
197         that is, we should start from the first non-const top-level table.
198 
199         We will need to store columns of SJ-inner tables (it_X_Y.*), but we're
200         not interested in storing the columns of materialization tables
        themselves. Because of that, if the first non-const top-level table is
        a materialized table, we move to its bush_children:
203       */
204       tab= join->join_tab + join->const_tables;
205       if (tab->bush_children)
206         tab= tab->bush_children->start;
207     }
208   }
209   DBUG_ASSERT(!tab->bush_children);
210 
211   start_tab= tab;
212   fields= 0;
213   blobs= 0;
214   flag_fields= 0;
215   data_field_count= 0;
216   data_field_ptr_count= 0;
217   referenced_fields= 0;
218 
219   /*
220     The following loop will get inside SJM nests, because data may be unpacked
221     to sjm-inner tables.
222   */
223   for (; tab != join_tab ; tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
224   {
225     tab->calc_used_field_length(FALSE);
226     flag_fields+= MY_TEST(tab->used_null_fields || tab->used_uneven_bit_fields);
227     flag_fields+= MY_TEST(tab->table->maybe_null);
228     fields+= tab->used_fields;
229     blobs+= tab->used_blobs;
230   }
231   if ((with_match_flag= join_tab->use_match_flag()))
232     flag_fields++;
233   fields+= flag_fields;
234 }
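
/*
  A rough numeric illustration of the counting above, for a hypothetical
  two-table plan t1, t2 where this cache is attached to t2, t1 has three
  used fields one of which is nullable, t1 is not an inner table of an
  outer join, and t2 needs no match flag:

    flag_fields = 1        (the null bitmap field of t1)
    blobs       = 0
    fields      = 3 + flag_fields = 4
*/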
235 
236 
237 /*
238   Collect information on join key arguments
239 
240   SYNOPSIS
241     collect_info_on_key_args()
242 
243   DESCRIPTION
244     The function traverses the ref expressions that are used to access the
245     joined table join_tab. For each table 'tab' whose fields are to be stored
246     in the join buffer of the cache the function finds the fields from 'tab'
247     that occur in the ref expressions and marks these fields in the bitmap
248     tab->table->tmp_set. The function counts the number of them stored
249     in this cache and the total number of them stored in the previous caches
250     and saves the results of the counting in 'local_key_arg_fields' and
251     'external_key_arg_fields' respectively.
252 
253   NOTES
254     The function does not do anything if no key is used to join the records
255     from join_tab.
256 
257   RETURN VALUE
258     none
259 */
260 
void JOIN_CACHE::collect_info_on_key_args()
262 {
263   JOIN_TAB *tab;
264   JOIN_CACHE *cache;
265   local_key_arg_fields= 0;
266   external_key_arg_fields= 0;
267 
268   if (!is_key_access())
269     return;
270 
271   TABLE_REF *ref= &join_tab->ref;
272   cache= this;
273   do
274   {
275     for (tab= cache->start_tab; tab != cache->join_tab;
276          tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
277     {
278       uint key_args;
279       bitmap_clear_all(&tab->table->tmp_set);
280       for (uint i= 0; i < ref->key_parts; i++)
281       {
282         Item *ref_item= ref->items[i];
283         if (!(tab->table->map & ref_item->used_tables()))
284 	  continue;
285 	 ref_item->walk(&Item::add_field_to_set_processor, 1, tab->table);
286       }
287       if ((key_args= bitmap_bits_set(&tab->table->tmp_set)))
288       {
289         if (cache == this)
290           local_key_arg_fields+= key_args;
291         else
292           external_key_arg_fields+= key_args;
293       }
294     }
295     cache= cache->prev_cache;
296   }
297   while (cache);
298 
299   return;
300 }
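
/*
  A hypothetical example of the counting above: if join_tab is accessed
  through ref(t.key = t1.a + t3.b), the fields of t1 are stored in this
  cache and the fields of t3 in one of the previous caches, then after the
  call t1.a and t3.b are marked in the tmp_set bitmaps of their tables,
  local_key_arg_fields == 1 (for t1.a) and
  external_key_arg_fields == 1 (for t3.b).
*/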
301 
302 
303 /*
304   Allocate memory for descriptors and pointers to them associated with the cache
305 
306   SYNOPSIS
307     alloc_fields()
308 
309   DESCRIPTION
    The function allocates memory for the array of field descriptors
    and the array of pointers to the field descriptors used to copy
    join record data from record buffers into the join buffer and
    back. Some pointers refer to field descriptors associated
    with previous caches. They are placed at the beginning of the array
    of pointers and their total number is stored in external_key_arg_fields.
    A pointer to the first array is assigned to field_descr and the number
    of elements in it is precalculated by the function calc_record_fields.
    The allocated arrays are adjacent.
319 
320   NOTES
321     The memory is allocated in join->thd->mem_root
322 
  RETURN VALUE
    0   if the memory has been successfully allocated
    1   otherwise
325 */
326 
int JOIN_CACHE::alloc_fields()
328 {
329   uint ptr_cnt= external_key_arg_fields+blobs+1;
330   uint fields_size= sizeof(CACHE_FIELD)*fields;
331   field_descr= (CACHE_FIELD*) join->thd->alloc(fields_size +
332                                                sizeof(CACHE_FIELD*)*ptr_cnt);
333   blob_ptr= (CACHE_FIELD **) ((uchar *) field_descr + fields_size);
334   return (field_descr == NULL);
335 }
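
/*
  A sketch of the single allocation made above (the two arrays are adjacent
  in join->thd->mem_root):

    field_descr                      blob_ptr
    |                                |
    v                                v
    [ CACHE_FIELD x fields ][ CACHE_FIELD* x (external_key_arg_fields +
                                              blobs + 1) ]
*/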
336 
337 
338 /*
339   Create descriptors of the record flag fields stored in the join buffer
340 
341   SYNOPSIS
342     create_flag_fields()
343 
344   DESCRIPTION
345     The function creates descriptors of the record flag fields stored
346     in the join buffer. These are descriptors for:
347     - an optional match flag field,
348     - table null bitmap fields,
349     - table null row fields.
    The match flag field is created when 'join_tab' is the first inner
    table of an outer join or a semi-join. A null bitmap field is
352     created for any table whose fields are to be stored in the join
353     buffer if at least one of these fields is nullable or is a BIT field
354     whose bits are partially stored with null bits. A null row flag
355     is created for any table assigned to the cache if it is an inner
356     table of an outer join.
    The descriptors for flag fields are placed one after another at the
358     beginning of the array of field descriptors 'field_descr' that
359     contains 'fields' elements. If there is a match flag field the
360     descriptor for it is always first in the sequence of flag fields.
361     The descriptors for other flag fields can follow in an arbitrary
362     order.
363     The flag field values follow in a record stored in the join buffer
364     in the same order as field descriptors, with the match flag always
365     following first.
366     The function sets the value of 'flag_fields' to the total number
367     of the descriptors created for the flag fields.
368     The function sets the value of 'length' to the total length of the
369     flag fields.
370 
371   RETURN VALUE
372     none
373 */
374 
void JOIN_CACHE::create_flag_fields()
376 {
377   CACHE_FIELD *copy;
378   JOIN_TAB *tab;
379 
380   copy= field_descr;
381 
382   length=0;
383 
384   /* If there is a match flag the first field is always used for this flag */
385   if (with_match_flag)
386     length+= add_flag_field_to_join_cache((uchar*) &join_tab->found,
387                                           sizeof(join_tab->found),
388 	                                  &copy);
389 
390   /* Create fields for all null bitmaps and null row flags that are needed */
391   for (tab= start_tab; tab != join_tab;
392        tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
393   {
394     TABLE *table= tab->table;
395 
396     /* Create a field for the null bitmap from table if needed */
397     if (tab->used_null_fields || tab->used_uneven_bit_fields)
398       length+= add_flag_field_to_join_cache(table->null_flags,
399                                             table->s->null_bytes,
400                                             &copy);
401 
    /* Create a field for the null row flag if needed */
403     if (table->maybe_null)
404       length+= add_flag_field_to_join_cache((uchar*) &table->null_row,
405                                             sizeof(table->null_row),
406                                             &copy);
407   }
408 
409   /* Theoretically the new value of flag_fields can be less than the old one */
410   flag_fields= (uint)(copy-field_descr);
411 }
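
/*
  A schematic example of the flag field area built above, assuming the
  cached tables are t1 and t2, both have nullable fields, t2 is an inner
  table of an outer join, and join_tab itself needs a match flag:

    [ match flag of join_tab ][ null bitmap of t1 ][ null bitmap of t2 ]
    [ null row flag of t2 ]

  The match flag, when present, always comes first; the remaining flag
  fields follow in table order.
*/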
412 
413 
414 /*
415   Create descriptors of the fields used to build access keys to the joined table
416 
417   SYNOPSIS
418     create_key_arg_fields()
419 
420   DESCRIPTION
421     The function creates descriptors of the record fields stored in the join
422     buffer that are used to build access keys to the joined table. These
    fields are put into the buffer ahead of other record fields stored in
    the buffer. Such placement helps to optimize construction of access keys.
    For each field that is used to build access keys to the joined table but
    is stored in some other join cache buffer the function saves a pointer
    to the field descriptor. The array of such pointers is placed in the
    join cache structure just before the array of pointers to the
    blob fields blob_ptr.
430     Any field stored in a join cache buffer that is used to construct keys
431     to access tables associated with other join caches is called a referenced
432     field. It receives a unique number that is saved by the function in the
433     member 'referenced_field_no' of the CACHE_FIELD descriptor for the field.
434     This number is used as index to the array of offsets to the referenced
435     fields that are saved and put in the join cache buffer after all record
436     fields.
    The function also finds out whether the keys to access join_tab
438     can be considered as embedded and, if so, sets the flag 'use_emb_key' in
439     this join cache appropriately.
440 
441   NOTES.
442     When a key to access the joined table 'join_tab' is constructed the array
443     of pointers to the field descriptors for the external fields is looked
    through. For each of these pointers we find out in which previous join
    cache the referenced field is stored. The value of 'referenced_field_no'
    provides us with the index into the array of offsets for referenced
    fields stored in the join cache. The offset read by this index allows
    us to read the field without reading all other fields of the record
    stored in the join cache buffer. This optimizes the construction of keys
450     to access 'join_tab' when some key arguments are stored in the previous
451     join caches.
452 
453   NOTES
454     The function does not do anything if no key is used to join the records
455     from join_tab.
456 
457   RETURN VALUE
458     none
459 */
void JOIN_CACHE::create_key_arg_fields()
461 {
462   JOIN_TAB *tab;
463   JOIN_CACHE *cache;
464 
465   if (!is_key_access())
466     return;
467 
468   /*
469     Save pointers to the cache fields in previous caches
470     that  are used to build keys for this key access.
471   */
472   cache= this;
473   uint ext_key_arg_cnt= external_key_arg_fields;
474   CACHE_FIELD *copy;
475   CACHE_FIELD **copy_ptr= blob_ptr;
476   while (ext_key_arg_cnt)
477   {
478     cache= cache->prev_cache;
479     for (tab= cache->start_tab; tab != cache->join_tab;
480          tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
481     {
482       CACHE_FIELD *copy_end;
483       MY_BITMAP *key_read_set= &tab->table->tmp_set;
484       /* key_read_set contains the bitmap of tab's fields referenced by ref */
485       if (bitmap_is_clear_all(key_read_set))
486         continue;
487       copy_end= cache->field_descr+cache->fields;
488       for (copy= cache->field_descr+cache->flag_fields; copy < copy_end; copy++)
489       {
490         /*
491           (1) - when we store rowids for DuplicateWeedout, they have
492                 copy->field==NULL
493         */
494         if (copy->field &&  // (1)
495             copy->field->table == tab->table &&
496             bitmap_is_set(key_read_set, copy->field->field_index))
497         {
498           *copy_ptr++= copy;
499           ext_key_arg_cnt--;
500           if (!copy->referenced_field_no)
501           {
502             /*
503               Register the referenced field 'copy':
504               - set the offset number in copy->referenced_field_no,
505               - adjust the value of the flag 'with_length',
506               - adjust the values of 'pack_length' and
507                 of 'pack_length_with_blob_ptrs'.
508 	    */
509             copy->referenced_field_no= ++cache->referenced_fields;
510             if (!cache->with_length)
511             {
512               cache->with_length= TRUE;
513               uint sz= cache->get_size_of_rec_length();
514               cache->base_prefix_length+= sz;
515               cache->pack_length+= sz;
516               cache->pack_length_with_blob_ptrs+= sz;
517             }
518 	    cache->pack_length+= cache->get_size_of_fld_offset();
519             cache->pack_length_with_blob_ptrs+= cache->get_size_of_fld_offset();
520           }
521         }
522       }
523     }
524   }
  /* After this 'blob_ptr' shall not be changed */
526   blob_ptr= copy_ptr;
527 
528   /* Now create local fields that are used to build ref for this key access */
529   copy= field_descr+flag_fields;
530   for (tab= start_tab; tab != join_tab;
531        tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
532   {
533     length+= add_table_data_fields_to_join_cache(tab, &tab->table->tmp_set,
534                                                  &data_field_count, &copy,
535                                                  &data_field_ptr_count,
536                                                  &copy_ptr);
537   }
538 
539   use_emb_key= check_emb_key_usage();
540 
541   return;
542 }
543 
544 
545 /*
546   Create descriptors of all remaining data fields stored in the join buffer
547 
548   SYNOPSIS
549     create_remaining_fields()
550 
551   DESCRIPTION
552     The function creates descriptors for all remaining data fields of a
553     record from the join buffer. If the value returned by is_key_access() is
554     false the function creates fields for all read record fields that
555     comprise the partial join record joined with join_tab. Otherwise,
556     for each table tab, the set of the read fields for which the descriptors
    have to be added is determined as the difference between all read fields
    and those for which the descriptors have already been created.
    The latter are supposed to be marked in the bitmap tab->table->tmp_set.
    The function increases the value of 'length' by the total length of
    the added fields.
562 
563   NOTES
564     If is_key_access() returns true the function modifies the value of
    tab->table->tmp_set for each table whose fields are stored in the cache.
566     The function calls the method Field::fill_cache_field to figure out
567     the type of the cache field and the maximal length of its representation
568     in the join buffer. If this is a blob field then additionally a pointer
569     to this field is added as an element of the array blob_ptr. For a blob
570     field only the size of the length of the blob data is taken into account.
571     It is assumed that 'data_field_count' contains the number of descriptors
572     for data fields that have been already created and 'data_field_ptr_count'
573     contains the number of the pointers to such descriptors having been
574     stored up to the moment.
575 
576   RETURN VALUE
577     none
578 */
579 
void JOIN_CACHE::create_remaining_fields()
581 {
582   JOIN_TAB *tab;
583   bool all_read_fields= !is_key_access();
584   CACHE_FIELD *copy= field_descr+flag_fields+data_field_count;
585   CACHE_FIELD **copy_ptr= blob_ptr+data_field_ptr_count;
586 
587   for (tab= start_tab; tab != join_tab;
588        tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
589   {
590     MY_BITMAP *rem_field_set;
591     TABLE *table= tab->table;
592 
593     if (all_read_fields)
594       rem_field_set= table->read_set;
595     else
596     {
597       bitmap_invert(&table->tmp_set);
598       bitmap_intersect(&table->tmp_set, table->read_set);
599       rem_field_set= &table->tmp_set;
600     }
601 
602     length+= add_table_data_fields_to_join_cache(tab, rem_field_set,
603                                                  &data_field_count, &copy,
604                                                  &data_field_ptr_count,
605                                                  &copy_ptr);
606 
607     /* SemiJoinDuplicateElimination: allocate space for rowid if needed */
608     if (tab->keep_current_rowid)
609     {
610       copy->str= table->file->ref;
611       if (copy->str)
612         copy->length= table->file->ref_length;
613       else
614       {
615         /* This may happen only for materialized derived tables and views */
616         copy->length= 0;
617         copy->str= (uchar *) table;
618       }
619       copy->type= CACHE_ROWID;
620       copy->field= 0;
621       copy->referenced_field_no= 0;
622       /*
623         Note: this may seem odd, but at this point we have
624         table->file->ref==NULL while table->file->ref_length is already set
625         to correct value.
626       */
627       length += table->file->ref_length;
628       data_field_count++;
629       copy++;
630     }
631   }
632 }
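
/*
  In the key-access case the per-table set of fields handled above is

    rem_field_set = table->read_set & ~table->tmp_set

  i.e. all read fields except those already added by create_key_arg_fields().
  Note that tmp_set is inverted in place, so after this function it no longer
  contains the key-argument bitmap.
*/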
633 
634 
635 
636 /*
637   Calculate and set all cache constants
638 
639   SYNOPSIS
640     set_constants()
641 
642   DESCRIPTION
    The function calculates and sets all precomputed constants that are used
644     when writing records into the join buffer and reading them from it.
645     It calculates the size of offsets of a record within the join buffer
646     and of a field within a record. It also calculates the number of bytes
647     used to store record lengths.
    The function also calculates the maximal length of the representation
    of a record in the cache excluding blob data. This value is used when
    making a decision whether more records should be added into the join
    buffer or not.
652 
653   RETURN VALUE
654     none
655 */
656 
void JOIN_CACHE::set_constants()
658 {
659   /*
660     Any record from a BKA cache is prepended with the record length.
661     We use the record length when reading the buffer and building key values
662     for each record. The length allows us not to read the fields that are
663     not needed for keys.
664     If a record has match flag it also may be skipped when the match flag
665     is on. It happens if the cache is used for a semi-join operation or
666     for outer join when the 'not exist' optimization can be applied.
667     If some of the fields are referenced from other caches then
668     the record length allows us to easily reach the saved offsets for
669     these fields since the offsets are stored at the very end of the record.
670     However at this moment we don't know whether we have referenced fields for
671     the cache or not. Later when a referenced field is registered for the cache
672     we adjust the value of the flag 'with_length'.
673   */
674   with_length= is_key_access() ||
675                join_tab->is_inner_table_of_semi_join_with_first_match() ||
676                join_tab->is_inner_table_of_outer_join();
677   /*
678      At this moment we don't know yet the value of 'referenced_fields',
679      but in any case it can't be greater than the value of 'fields'.
680   */
681   uint len= length + fields*sizeof(uint)+blobs*sizeof(uchar *) +
682             (prev_cache ? prev_cache->get_size_of_rec_offset() : 0) +
683             sizeof(ulong);
684   /*
685     The values of  size_of_rec_ofs, size_of_rec_len, size_of_fld_ofs,
686      base_prefix_length, pack_length,  pack_length_with_blob_ptrs
687      will be recalculated later in this function when we get the estimate
688      for the actual value of the join buffer size.
689   */
690   size_of_rec_ofs=  size_of_rec_len= size_of_fld_ofs= 4;
691   base_prefix_length= (with_length ? size_of_rec_len : 0) +
692                       (prev_cache ? prev_cache->get_size_of_rec_offset() : 0);
693   pack_length= (with_length ? size_of_rec_len : 0) +
694                (prev_cache ? prev_cache->get_size_of_rec_offset() : 0) +
695                length + fields*sizeof(uint);
696   pack_length_with_blob_ptrs= pack_length + blobs*sizeof(uchar *);
697   min_buff_size= 0;
698   min_records= 1;
699   buff_size= (size_t)MY_MAX(join->thd->variables.join_buff_size,
700                  get_min_join_buffer_size());
701   size_of_rec_ofs= offset_size(buff_size);
702   size_of_rec_len= blobs ? size_of_rec_ofs : offset_size(len);
703   size_of_fld_ofs= size_of_rec_len;
704   base_prefix_length= (with_length ? size_of_rec_len : 0) +
705                       (prev_cache ? prev_cache->get_size_of_rec_offset() : 0);
706   /*
707     The size of the offsets for referenced fields will be added later.
708     The values of 'pack_length' and 'pack_length_with_blob_ptrs' are adjusted
709     every time when the first reference to the referenced field is registered.
710   */
711   pack_length= (with_length ? size_of_rec_len : 0) +
712                (prev_cache ? prev_cache->get_size_of_rec_offset() : 0) +
713                length;
714   pack_length_with_blob_ptrs= pack_length + blobs*sizeof(uchar *);
715 }
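
/*
  A worked example with made-up numbers, assuming offset_size() picks the
  smallest width that can represent the value (4 bytes for 256K, 1 byte for
  values under 256): take join_buff_size = 256K, no previous cache, no
  blobs, with_length == TRUE, length = 60 and fields = 5, so
  len = 60 + 5*sizeof(uint) + sizeof(ulong), well under 256. Then:

    size_of_rec_ofs = offset_size(buff_size) = 4
    size_of_rec_len = offset_size(len)       = 1    (no blobs)
    size_of_fld_ofs = size_of_rec_len        = 1
    base_prefix_length = size_of_rec_len     = 1    (no prev_cache)
    pack_length        = 1 + length          = 61
    pack_length_with_blob_ptrs = pack_length = 61   (no blobs)
*/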
716 
717 
718 /*
719   Get maximum total length of all affixes of a record in the join cache buffer
720 
721   SYNOPSIS
722     get_record_max_affix_length()
723 
724   DESCRIPTION
725     The function calculates the maximum possible total length of all affixes
726     of a record in the join cache buffer, that is made of:
727       - the length of all prefixes used in this cache,
728       - the length of the match flag if it's needed
729       - the total length of the maximum possible offsets to the fields of
730         a record in the buffer.
731 
732   RETURN VALUE
733     The maximum total length of all affixes of a record in the join buffer
734 */
735 
uint JOIN_CACHE::get_record_max_affix_length()
737 {
738   uint len= get_prefix_length() +
739             MY_TEST(with_match_flag) +
740             size_of_fld_ofs * data_field_count;
741   return len;
742 }
743 
744 
745 /*
746   Get the minimum possible size of the cache join buffer
747 
748   SYNOPSIS
749     get_min_join_buffer_size()
750 
751   DESCRIPTION
    At its first invocation for the cache the function calculates the
    minimum possible size of the join buffer of the cache. This value depends
    on the minimal number of records 'min_records' to be stored in the join
    buffer. The number is supposed to be determined by the procedure that
    chooses the best access path to the joined table join_tab in the execution
    plan. After calculating this size the function saves it in the field
    'min_buff_size' in order to use it directly at the next invocations
    of the function.
760 
761   NOTES
762     Currently the number of minimal records is just set to 1.
763 
764   RETURN VALUE
765     The minimal possible size of the join buffer of this cache
766 */
767 
size_t JOIN_CACHE::get_min_join_buffer_size()
769 {
770   if (!min_buff_size)
771   {
772     size_t len= 0;
773     size_t len_last= 0;
774     for (JOIN_TAB *tab= start_tab; tab != join_tab;
775          tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
776     {
777       len+= tab->get_max_used_fieldlength();
778       len_last+= tab->get_used_fieldlength();
779     }
780     size_t len_addon= get_record_max_affix_length() +
781                       get_max_key_addon_space_per_record();
782     len+= len_addon;
783     len_last+= len_addon;
784     size_t min_sz= len*(min_records-1) + len_last;
785     min_sz+= pack_length_with_blob_ptrs;
786     size_t add_sz= 0;
787     for (uint i=0; i < min_records; i++)
788       add_sz+= join_tab_scan->aux_buffer_incr(i+1);
789     avg_aux_buffer_incr= add_sz/min_records;
790     min_sz+= add_sz;
791     set_if_bigger(min_sz, 1);
792     min_buff_size= min_sz;
793   }
794   return min_buff_size;
795 }
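
/*
  A quick numeric sketch with illustrative values: with min_records = 1,
  used field lengths of the cached tables summing to 200 bytes for the last
  record, affix and key addon space of 20 bytes, and
  pack_length_with_blob_ptrs = 80, the computation above gives

    len_last = 200 + 20 = 220
    min_sz   = len*(min_records-1) + len_last  = 0 + 220
             + pack_length_with_blob_ptrs      = 300
             + aux_buffer_incr(1)                (auxiliary space, if any)

  and the result is cached in min_buff_size.
*/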
796 
797 
798 /*
799   Get the maximum possible size of the cache join buffer
800 
801   SYNOPSIS
802     get_max_join_buffer_size()
803 
804     optimize_buff_size  FALSE <-> do not take more memory than needed for
805                         the estimated number of records in the partial join
806 
807   DESCRIPTION
    At its first invocation for the cache the function calculates the
    maximum possible size of the join buffer for the cache. If the parameter
    optimize_buff_size is true then this value does not exceed the size of
    the space needed for the estimated number of records 'max_records' in the
    partial join that joins tables from the first one through join_tab. This
    value is also capped by the value of join_tab->join_buffer_size_limit,
    if it has been set to a non-zero value, and by the value of the system
    parameter join_buffer_size otherwise. After calculating this size the
    function saves the value in the field 'max_buff_size' in order to use it
    directly at the next invocations of the function.
818 
819   NOTES
820     Currently the value of join_tab->join_buffer_size_limit is initialized
821     to 0 and is never reset.
822 
823   RETURN VALUE
824     The maximum possible size of the join buffer of this cache
825 */
826 
size_t JOIN_CACHE::get_max_join_buffer_size(bool optimize_buff_size)
828 {
829   if (!max_buff_size)
830   {
831     size_t max_sz;
832     size_t min_sz= get_min_join_buffer_size();
833     size_t len= 0;
834     for (JOIN_TAB *tab= start_tab; tab != join_tab;
835          tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
836     {
837       len+= tab->get_used_fieldlength();
838     }
839     len+= get_record_max_affix_length();
840     avg_record_length= len;
841     len+= get_max_key_addon_space_per_record() + avg_aux_buffer_incr;
842     space_per_record= len;
843 
844     size_t limit_sz= (size_t)join->thd->variables.join_buff_size;
845     if (join_tab->join_buffer_size_limit)
846       set_if_smaller(limit_sz, join_tab->join_buffer_size_limit);
847     if (!optimize_buff_size)
848       max_sz= limit_sz;
849     else
850     {
851       if (limit_sz / max_records > space_per_record)
852         max_sz= space_per_record * max_records;
853       else
854         max_sz= limit_sz;
855       max_sz+= pack_length_with_blob_ptrs;
856       set_if_smaller(max_sz, limit_sz);
857     }
858     set_if_bigger(max_sz, min_sz);
859     max_buff_size= max_sz;
860   }
861   return max_buff_size;
862 }
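
/*
  For instance, with illustrative numbers: join_buff_size = 256K, no
  join_buffer_size_limit, optimize_buff_size enabled, space_per_record = 300
  and max_records = 500 give

    max_sz = 300 * 500 + pack_length_with_blob_ptrs   (~150K, under the cap)

  while with max_records = 10000 the estimate would exceed the cap and
  max_sz falls back to the 256K limit.
*/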
863 
864 
865 /*
866   Allocate memory for a join buffer
867 
868   SYNOPSIS
869     alloc_buffer()
870 
871   DESCRIPTION
872     The function allocates a lump of memory for the cache join buffer.
873     Initially the function sets the size of the buffer buff_size equal to
874     the value returned by get_max_join_buffer_size(). If the total size of
875     the space intended to be used for the join buffers employed by the
876     tables from the first one through join_tab exceeds the value of the
877     system parameter join_buff_space_limit, then the function first tries
878     to shrink the used buffers to make the occupied space fit the maximum
879     memory allowed to be used for all join buffers in total. After
880     this the function tries to allocate a join buffer for join_tab.
881     If it fails to do so, it decrements the requested size of the join
882     buffer, shrinks proportionally the join buffers used for the previous
883     tables and tries to allocate a buffer for join_tab. In the case of a
884     failure the function repeats its attempts with smaller and smaller
885     requested sizes of the buffer, but not more than 4 times.
886 
887   RETURN VALUE
888     0   if the memory has been successfully allocated
889     1   otherwise
890 */
891 
int JOIN_CACHE::alloc_buffer()
893 {
894   JOIN_TAB *tab;
895   JOIN_CACHE *cache;
896   ulonglong curr_buff_space_sz= 0;
897   ulonglong curr_min_buff_space_sz= 0;
898   ulonglong join_buff_space_limit=
899     join->thd->variables.join_buff_space_limit;
900   bool optimize_buff_size=
901          optimizer_flag(join->thd, OPTIMIZER_SWITCH_OPTIMIZE_JOIN_BUFFER_SIZE);
902   double partial_join_cardinality=  (join_tab-1)->get_partial_join_cardinality();
903   buff= NULL;
904   min_buff_size= 0;
905   max_buff_size= 0;
906   min_records= 1;
907   max_records= (size_t) (partial_join_cardinality <= join_buff_space_limit ?
908                  (ulonglong) partial_join_cardinality : join_buff_space_limit);
909   set_if_bigger(max_records, 10);
910   min_buff_size= get_min_join_buffer_size();
911   buff_size= get_max_join_buffer_size(optimize_buff_size);
912 
913   for (tab= start_tab; tab!= join_tab;
914        tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
915   {
916     cache= tab->cache;
917     if (cache)
918     {
919       curr_min_buff_space_sz+= cache->get_min_join_buffer_size();
920       curr_buff_space_sz+= cache->get_join_buffer_size();
921     }
922   }
923   curr_min_buff_space_sz+= min_buff_size;
924   curr_buff_space_sz+= buff_size;
925 
926   if (curr_min_buff_space_sz > join_buff_space_limit ||
927       (curr_buff_space_sz > join_buff_space_limit &&
928        (!optimize_buff_size ||
929         join->shrink_join_buffers(join_tab, curr_buff_space_sz,
930                                   join_buff_space_limit))))
931     goto fail;
932 
933   if (for_explain_only)
934     return 0;
935 
936   for (size_t buff_size_decr= (buff_size-min_buff_size)/4 + 1; ; )
937   {
938     size_t next_buff_size;
939 
940     if ((buff= (uchar*) my_malloc(key_memory_JOIN_CACHE, buff_size,
941                                   MYF(MY_THREAD_SPECIFIC))))
942       break;
943 
944     next_buff_size= buff_size > buff_size_decr ? buff_size-buff_size_decr : 0;
945     if (next_buff_size < min_buff_size ||
946         join->shrink_join_buffers(join_tab, curr_buff_space_sz,
947                                   curr_buff_space_sz-buff_size_decr))
948       goto fail;
949     buff_size= next_buff_size;
950 
951     curr_buff_space_sz= 0;
952     for (tab= join->join_tab+join->const_tables; tab <= join_tab; tab++)
953     {
954       cache= tab->cache;
955       if (cache)
956         curr_buff_space_sz+= cache->get_join_buffer_size();
957     }
958   }
959   return 0;
960 
961 fail:
962   buff_size= 0;
963   return 1;
964 }
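
/*
  A rough illustration of the allocation retries above: with
  buff_size = 1MB and min_buff_size = 200KB, buff_size_decr is about 200KB,
  so on repeated my_malloc() failures the requested size shrinks roughly as
  1MB -> 800KB -> 600KB -> 400KB -> 200KB before the function gives up,
  each step also shrinking the buffers of the previous join caches
  proportionally through join->shrink_join_buffers().
*/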
965 
966 
967 /*
  Shrink the size of the cache join buffer in a given ratio
969 
970   SYNOPSIS
971     shrink_join_buffer_in_ratio()
      n           numerator of the ratio to shrink the buffer in
      d           denominator of the ratio
974 
975   DESCRIPTION
976     The function first deallocates the join buffer of the cache. Then
977     it allocates a buffer that is (n/d) times smaller.
978 
979   RETURN VALUE
980     FALSE   on success with allocation of the smaller join buffer
981     TRUE    otherwise
982 */
983 
bool JOIN_CACHE::shrink_join_buffer_in_ratio(ulonglong n, ulonglong d)
985 {
986   size_t next_buff_size;
987   if (n < d)
988     return FALSE;
989   next_buff_size= (size_t) ((double) buff_size / n * d);
990   set_if_bigger(next_buff_size, min_buff_size);
991   buff_size= next_buff_size;
992   return realloc_buffer();
993 }
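
/*
  For example, shrink_join_buffer_in_ratio(3, 2) on a cache whose current
  buff_size is 600KB requests a new buffer of 600KB/3*2 = 400KB (but never
  less than min_buff_size) and reallocates the join buffer to that size;
  a call with n < d is a no-op that returns FALSE.
*/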
994 
995 
996 /*
997   Reallocate the join buffer of a join cache
998 
999   SYNOPSIS
1000     realloc_buffer()
1001 
  DESCRIPTION
1003     The function reallocates the join buffer of the join cache. After this
1004     it resets the buffer for writing.
1005 
1006   NOTES
1007     The function assumes that buff_size contains the new value for the join
1008     buffer size.
1009 
1010   RETURN VALUE
1011     0   if the buffer has been successfully reallocated
1012     1   otherwise
1013 */
1014 
int JOIN_CACHE::realloc_buffer()
1016 {
1017   free();
1018   buff= (uchar*) my_malloc(key_memory_JOIN_CACHE, buff_size,
1019                                          MYF(MY_THREAD_SPECIFIC));
1020   reset(TRUE);
1021   return buff == NULL;
1022 }
1023 
1024 
1025 /*
1026   Initialize a join cache
1027 
1028   SYNOPSIS
1029     init()
1030       for_explain       join buffer is initialized for explain only
1031 
1032   DESCRIPTION
    The function initializes the join cache structure. It is supposed to be
    called by init methods for classes derived from JOIN_CACHE.
1035     The function allocates memory for the join buffer and for descriptors of
1036     the record fields stored in the buffer.
1037 
1038   NOTES
    The code of this function should have been included into the constructor
    code itself. However the new operator for the class JOIN_CACHE would
    never fail, while the memory allocation for the join buffer may well
    fail. That's why this memory allocation has to be placed in a separate
    function that is called in tandem with the cache constructor.
    It is quite natural to put almost all other constructor actions into
    this function.
1046 
1047   RETURN VALUE
    0   initialization with buffer allocation has succeeded
1049     1   otherwise
1050 */
1051 
int JOIN_CACHE::init(bool for_explain)
1053 {
1054   DBUG_ENTER("JOIN_CACHE::init");
1055 
1056   for_explain_only= for_explain;
1057 
1058   calc_record_fields();
1059 
1060   collect_info_on_key_args();
1061 
1062   if (alloc_fields())
1063     DBUG_RETURN(1);
1064 
1065   create_flag_fields();
1066 
1067   create_key_arg_fields();
1068 
1069   create_remaining_fields();
1070 
1071   set_constants();
1072 
1073   if (alloc_buffer())
1074     DBUG_RETURN(1);
1075 
1076   reset(TRUE);
1077 
1078   DBUG_RETURN(0);
1079 }
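
/*
  The typical caller's side, sketched with a hypothetical fallback path
  (derived caches such as JOIN_CACHE_BNL invoke this method from their
  own init()):

    cache= new (thd->mem_root) JOIN_CACHE_BNL(join, tab, prev_cache);
    if (!cache || cache->init(for_explain))
      ...   // no join buffering for this table
*/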
1080 
1081 
1082 /*
1083   Check the possibility to read the access keys directly from the join buffer
1084   SYNOPSIS
1085     check_emb_key_usage()
1086 
1087   DESCRIPTION
1088     The function checks some conditions at which the key values can be read
1089     directly from the join buffer. This is possible when the key values can be
1090     composed by concatenation of the record fields stored in the join buffer.
1091     Sometimes when the access key is multi-component the function has to re-order
1092     the fields written into the join buffer to make keys embedded. If key
1093     values for the key access are detected as embedded then 'use_emb_key'
1094     is set to TRUE.
1095 
1096   EXAMPLE
    Let table t2 have an index defined on the columns a,b. Let's assume also
1098     that the columns t2.a, t2.b as well as the columns t1.a, t1.b are all
1099     of the integer type. Then if the query
1100       SELECT COUNT(*) FROM t1, t2 WHERE t1.a=t2.a and t1.b=t2.b
1101     is executed with a join cache in such a way that t1 is the driving
1102     table then the key values to access table t2 can be read directly
1103     from the join buffer.
1104 
1105   NOTES
1106     In some cases key values could be read directly from the join buffer but
1107     we still do not consider them embedded. In the future we'll expand the
    class of keys which we identify as embedded.
1109 
1110   NOTES
1111     The function returns FALSE if no key is used to join the records
1112     from join_tab.
1113 
1114   RETURN VALUE
1115     TRUE    key values will be considered as embedded,
1116     FALSE   otherwise.
1117 */
1118 
bool JOIN_CACHE::check_emb_key_usage()
1120 {
1121 
1122   if (!is_key_access())
1123     return FALSE;
1124 
1125   uint i;
1126   Item *item;
1127   KEY_PART_INFO *key_part;
1128   CACHE_FIELD *copy;
1129   CACHE_FIELD *copy_end;
1130   uint len= 0;
1131   TABLE_REF *ref= &join_tab->ref;
1132   KEY *keyinfo= join_tab->get_keyinfo_by_key_no(ref->key);
1133 
1134   /*
1135     If some of the key arguments are not from the local cache the key
1136     is not considered as embedded.
1137     TODO:
1138     Expand it to the case when ref->key_parts=1 and local_key_arg_fields=0.
1139   */
1140   if (external_key_arg_fields != 0)
1141     return FALSE;
1142   /*
1143     If the number of the local key arguments is not equal to the number
1144     of key parts the key value cannot be read directly from the join buffer.
1145   */
1146   if (local_key_arg_fields != ref->key_parts)
1147     return FALSE;
1148 
1149   /*
1150     A key is not considered embedded if one of the following is true:
1151     - one of its key parts is not equal to a field
1152     - it is a partial key
1153     - definition of the argument field does not coincide with the
1154       definition of the corresponding key component
1155     - some of the key components are nullable
1156   */
1157   for (i=0; i < ref->key_parts; i++)
1158   {
1159     item= ref->items[i]->real_item();
1160     if (item->type() != Item::FIELD_ITEM)
1161       return FALSE;
1162     key_part= keyinfo->key_part+i;
1163     if (key_part->key_part_flag & HA_PART_KEY_SEG)
1164       return FALSE;
1165     if (!key_part->field->eq_def(((Item_field *) item)->field))
1166       return FALSE;
1167     if (key_part->field->maybe_null())
1168       return FALSE;
1169   }
1170 
1171   copy= field_descr+flag_fields;
1172   copy_end= copy+local_key_arg_fields;
1173   for ( ; copy < copy_end; copy++)
1174   {
1175     /*
1176       If some of the key arguments are of variable length the key
1177       is not considered as embedded.
1178     */
1179     if (copy->type != 0)
1180       return FALSE;
1181     /*
1182       If some of the key arguments are bit fields whose bits are partially
1183       stored with null bits the key is not considered as embedded.
1184     */
1185     if (copy->field->type() == MYSQL_TYPE_BIT &&
1186 	 ((Field_bit*) (copy->field))->bit_len)
1187       return FALSE;
1188     len+= copy->length;
1189   }
1190 
1191   emb_key_length= len;
1192 
1193   /*
1194     Make sure that key fields follow the order of the corresponding
1195     key components these fields are equal to. For this the descriptors
1196     of the fields that comprise the key might be re-ordered.
1197   */
1198   for (i= 0; i < ref->key_parts; i++)
1199   {
1200     uint j;
1201     Item *item= ref->items[i]->real_item();
1202     Field *fld= ((Item_field *) item)->field;
1203     CACHE_FIELD *init_copy= field_descr+flag_fields+i;
1204     for (j= i, copy= init_copy; j < local_key_arg_fields;  j++, copy++)
1205     {
1206       if (fld->eq(copy->field))
1207       {
1208         if (j != i)
1209         {
1210           CACHE_FIELD key_part_copy= *copy;
1211           *copy= *init_copy;
1212           *init_copy= key_part_copy;
1213         }
1214         break;
1215       }
1216     }
1217   }
1218 
1219   return TRUE;
1220 }
1221 
1222 
1223 /*
1224   Write record fields and their required offsets into the join cache buffer
1225 
1226   SYNOPSIS
1227     write_record_data()
1228       link        a reference to the associated info in the previous cache
1229       is_full OUT true if it has been decided that no more records will be
1230                   added to the join buffer
1231 
1232   DESCRIPTION
    This function puts into the cache buffer the following info that it reads
1234     from the join record buffers or computes somehow:
1235     (1) the length of all fields written for the record (optional)
1236     (2) an offset to the associated info in the previous cache (if there is any)
1237         determined by the link parameter
1238     (3) all flag fields of the tables whose data field are put into the cache:
1239         - match flag (optional),
1240         - null bitmaps for all tables,
1241         - null row flags for all tables
1242     (4) values of all data fields including
        - full images of those fixed length data fields that cannot have
          trailing spaces
        - significant parts of fixed length fields that can have trailing
          spaces, with the prepended length
        - data of non-blob variable length fields with the prepended data
          length
        - blob data from blob fields with the prepended data length
1249     (5) record offset values for the data fields that are referred to from
1250         other caches
1251 
1252     The record is written at the current position stored in the field 'pos'.
1253     At the end of the function 'pos' points at the position right after the
1254     written record data.
1255     The function increments the number of records in the cache that is stored
1256     in the 'records' field by 1. The function also modifies the values of
1257     'curr_rec_pos' and 'last_rec_pos' to point to the written record.
1258     The 'end_pos' cursor is modified accordingly.
    The 'last_rec_blob_data_is_in_rec_buff' flag is set if the blob data
    remains in the record buffers and is not copied to the join buffer. This
    may happen only for the blob data of the last record added into the cache.
1262     If on_precond is attached to join_tab and it is not evaluated to TRUE
1263     then MATCH_IMPOSSIBLE is placed in the match flag field of the record
1264     written into the join buffer.
1265 
1266   RETURN VALUE
1267     length of the written record data
1268 */
1269 
uint JOIN_CACHE::write_record_data(uchar * link, bool *is_full)
1271 {
1272   uint len;
1273   bool last_record;
1274   CACHE_FIELD *copy;
1275   CACHE_FIELD *copy_end;
1276   uchar *flags_pos;
1277   uchar *cp= pos;
1278   uchar *init_pos= cp;
1279   uchar *rec_len_ptr= 0;
1280   uint key_extra= extra_key_length();
1281 
1282   records++;  /* Increment the counter of records in the cache */
1283 
1284   len= pack_length + key_extra;
1285 
1286   /* Make an adjustment for the size of the auxiliary buffer if there is any */
1287   uint incr= aux_buffer_incr(records);
1288   size_t rem= rem_space();
1289   aux_buff_size+= len+incr < rem ? incr : rem;
1290 
1291   /*
1292     For each blob to be put into cache save its length and a pointer
1293     to the value in the corresponding element of the blob_ptr array.
1294     Blobs with null values are skipped.
1295     Increment 'len' by the total length of all these blobs.
1296   */
1297   if (blobs)
1298   {
1299     CACHE_FIELD **copy_ptr= blob_ptr;
1300     CACHE_FIELD **copy_ptr_end= copy_ptr+blobs;
1301     for ( ; copy_ptr < copy_ptr_end; copy_ptr++)
1302     {
1303       Field_blob *blob_field= (Field_blob *) (*copy_ptr)->field;
1304       if (!blob_field->is_null())
1305       {
1306         uint blob_len= blob_field->get_length();
1307         (*copy_ptr)->blob_length= blob_len;
1308         len+= blob_len;
1309         (*copy_ptr)->str= blob_field->get_ptr();
1310       }
1311     }
1312   }
1313 
1314   /*
1315     Check whether we won't be able to add any new record into the cache after
1316     this one because the cache will be full. Set last_record to TRUE if it's so.
    We assume that the cache will be full after the record has been written
1318     into it if either the remaining space of the cache is not big enough for the
1319     record's blob values or if there is a chance that not all non-blob fields
1320     of the next record can be placed there.
1321     This function is called only in the case when there is enough space left in
1322     the cache to store at least non-blob parts of the current record.
1323   */
1324   last_record= (len+pack_length_with_blob_ptrs+key_extra) > rem_space();
1325 
1326   /*
1327     Save the position for the length of the record in the cache if it's needed.
1328     The length of the record will be inserted here when all fields of the record
1329     are put into the cache.
1330   */
1331   if (with_length)
1332   {
1333     rec_len_ptr= cp;
1334     DBUG_ASSERT(cp + size_of_rec_len <= buff + buff_size);
1335     cp+= size_of_rec_len;
1336   }
1337 
1338   /*
1339     Put a reference to the fields of the record that are stored in the previous
1340     cache if there is any. This reference is passed by the 'link' parameter.
1341   */
1342   if (prev_cache)
1343   {
1344     DBUG_ASSERT(cp + prev_cache->get_size_of_rec_offset() <= buff + buff_size);
1345     cp+= prev_cache->get_size_of_rec_offset();
1346     prev_cache->store_rec_ref(cp, link);
1347   }
1348 
1349   curr_rec_pos= cp;
1350 
  /* If there is a match flag set its value to 0 */
1352   copy= field_descr;
1353   if (with_match_flag)
1354     *copy[0].str= 0;
1355 
1356   /* First put into the cache the values of all flag fields */
1357   copy_end= field_descr+flag_fields;
1358   flags_pos= cp;
1359   for ( ; copy < copy_end; copy++)
1360   {
1361     DBUG_ASSERT(cp + copy->length <= buff + buff_size);
1362     memcpy(cp, copy->str, copy->length);
1363     cp+= copy->length;
1364   }
1365 
  /* Now put the values of the remaining fields provided they are not nulls */
1367   copy_end= field_descr+fields;
1368   for ( ; copy < copy_end; copy++)
1369   {
1370     Field *field= copy->field;
1371     if (field && field->maybe_null() && field->is_null())
1372     {
1373       if (copy->referenced_field_no)
1374         copy->offset= 0;
1375       continue;
1376     }
1377     /* Save the offset of the field to put it later at the end of the record */
1378     if (copy->referenced_field_no)
1379       copy->offset= (uint)(cp-curr_rec_pos);
1380 
1381     switch (copy->type) {
1382     case CACHE_BLOB:
1383     {
1384       Field_blob *blob_field= (Field_blob *) copy->field;
1385       if (last_record)
1386       {
1387         last_rec_blob_data_is_in_rec_buff= 1;
1388         /* Put down the length of the blob and the pointer to the data */
1389         DBUG_ASSERT(cp + copy->length + sizeof(char*) <= buff + buff_size);
1390 	blob_field->get_image(cp, copy->length+sizeof(char*),
1391                               blob_field->charset());
1392 	cp+= copy->length+sizeof(char*);
1393       }
1394       else
1395       {
1396         /* First put down the length of the blob and then copy the data */
1397 	blob_field->get_image(cp, copy->length,
1398 			      blob_field->charset());
1399         DBUG_ASSERT(cp + copy->length + copy->blob_length <= buff + buff_size);
1400         if (copy->blob_length)
1401           memcpy(cp+copy->length, copy->str, copy->blob_length);
1402 	cp+= copy->length+copy->blob_length;
1403       }
1404       break;
1405     }
1406     case CACHE_VARSTR1:
1407       /* Copy the significant part of the short varstring field */
1408       len= (uint) copy->str[0] + 1;
1409       DBUG_ASSERT(cp + len <= buff + buff_size);
1410       memcpy(cp, copy->str, len);
1411       cp+= len;
1412       break;
1413     case CACHE_VARSTR2:
1414       /* Copy the significant part of the long varstring field */
1415       len= uint2korr(copy->str) + 2;
1416       DBUG_ASSERT(cp + len <= buff + buff_size);
1417       memcpy(cp, copy->str, len);
1418       cp+= len;
1419       break;
1420     case CACHE_STRIPPED:
1421     {
1422       /*
1423         Put down the field value stripping all trailing spaces off.
1424         After this insert the length of the written sequence of bytes.
1425       */
1426       uchar *str, *end;
1427       for (str= copy->str, end= str+copy->length;
1428            end > str && end[-1] == ' ';
1429            end--) ;
1430       len=(uint) (end-str);
1431       DBUG_ASSERT(cp + len + 2 <= buff + buff_size);
1432       int2store(cp, len);
1433       memcpy(cp+2, str, len);
1434       cp+= len+2;
1435       break;
1436     }
1437     case CACHE_ROWID:
1438       if (!copy->length)
1439       {
1440         /*
1441           This may happen only for ROWID fields of materialized
1442           derived tables and views.
1443         */
1444         TABLE *table= (TABLE *) copy->str;
1445         copy->str= table->file->ref;
1446         copy->length= table->file->ref_length;
1447         if (!copy->str)
1448         {
1449           /*
1450             If table is an empty inner table of an outer join and it is
1451             a materialized derived table then table->file->ref == NULL.
1452           */
1453           cp+= copy->length;
1454           break;
1455         }
1456       }
1457       /* fall through */
1458     default:
1459       /* Copy the entire image of the field from the record buffer */
1460       DBUG_ASSERT(cp + copy->length <= buff + buff_size);
1461       if (copy->str)
1462         memcpy(cp, copy->str, copy->length);
1463       cp+= copy->length;
1464     }
1465   }
1466 
1467   /* Add the offsets of the fields that are referenced from other caches */
1468   if (referenced_fields)
1469   {
1470     uint cnt= 0;
1471     for (copy= field_descr+flag_fields; copy < copy_end ; copy++)
1472     {
1473       if (copy->referenced_field_no)
1474       {
1475         store_fld_offset(cp+size_of_fld_ofs*(copy->referenced_field_no-1),
1476                          copy->offset);
1477         cnt++;
1478       }
1479     }
1480     DBUG_ASSERT(cp + size_of_fld_ofs*cnt <= buff + buff_size);
1481     cp+= size_of_fld_ofs*cnt;
1482   }
1483 
1484   if (rec_len_ptr)
1485     store_rec_length(rec_len_ptr, (ulong) (cp-rec_len_ptr-size_of_rec_len));
1486   last_rec_pos= curr_rec_pos;
1487   end_pos= pos= cp;
1488   *is_full= last_record;
1489 
1490   last_written_is_null_compl= 0;
1491   if (!join_tab->first_unmatched && join_tab->on_precond)
1492   {
1493     join_tab->found= 0;
1494     join_tab->not_null_compl= 1;
1495     if (!join_tab->on_precond->val_int())
1496     {
1497       flags_pos[0]= MATCH_IMPOSSIBLE;
1498       last_written_is_null_compl= 1;
1499     }
1500   }
1501 
1502   return (uint) (cp-init_pos);
1503 }
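
/*
  An informal sketch of how a record written by the code above is laid out
  in the join buffer, as it is later consumed by JOIN_CACHE::get_record()
  (optional parts in brackets, sizes are symbolic; a reading aid only, not a
  normative description of the format):

    [record length]    size_of_rec_len bytes, present if 'with_length' is set
    [prev cache ref]   reference to the linked record in prev_cache, if any
    flag fields        match flag (if used), null bitmaps, null_row flags
    data fields        values packed according to field_descr[] (CACHE_* types)
    [field offsets]    referenced_fields * size_of_fld_ofs trailing offsets
*/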
1504 
1505 
1506 /*
1507   Reset the join buffer for reading/writing: default implementation
1508 
1509   SYNOPSIS
1510     reset()
1511       for_writing  if it is TRUE the function resets the buffer for writing
1512 
1513   DESCRIPTION
1514     This default implementation of the virtual function reset() resets
1515     the join buffer for reading or writing.
1516     If the buffer is reset for reading, only the 'pos' value is reset
1517     to point to the very beginning of the join buffer. If the buffer is
1518     reset for writing, additionally:
1519     - the counter of the records in the buffer is set to 0,
1520     - the value of 'last_rec_pos' is set to point to the beginning of
1521       the join buffer,
1522     - 'end_pos' is set to point to the beginning of the join buffer,
1523     - the size of the auxiliary buffer is reset to 0,
1524     - the flag 'last_rec_blob_data_is_in_rec_buff' is set to 0.
1525 
1526   RETURN VALUE
1527     none
1528 */
1529 void JOIN_CACHE::reset(bool for_writing)
1530 {
1531   pos= buff;
1532   curr_rec_link= 0;
1533   if (for_writing)
1534   {
1535     records= 0;
1536     last_rec_pos= buff;
1537     aux_buff_size= 0;
1538     end_pos= pos;
1539     last_rec_blob_data_is_in_rec_buff= 0;
1540   }
1541 }
1542 
1543 
1544 /*
1545   Add a record into the join buffer: the default implementation
1546 
1547   SYNOPSIS
1548     put_record()
1549 
1550   DESCRIPTION
1551     This default implementation of the virtual function put_record writes
1552     the next matching record into the join buffer.
1553     It also links the record that has been written into the join buffer
1554     with the matched record in the previous cache if there is any.
1555     The implementation assumes that the function get_curr_rec_link()
1556     will return exactly the pointer to this matched record.
1557 
1558   RETURN VALUE
1559     TRUE    if it has been decided that it should be the last record
1560             in the join buffer,
1561     FALSE   otherwise
1562 */
1563 
1564 bool JOIN_CACHE::put_record()
1565 {
1566   bool is_full;
1567   uchar *link= 0;
1568   if (prev_cache)
1569     link= prev_cache->get_curr_rec_link();
1570   write_record_data(link, &is_full);
1571   return is_full;
1572 }
1573 
1574 
1575 /*
1576   Read the next record from the join buffer: the default implementation
1577 
1578   SYNOPSIS
1579     get_record()
1580 
1581   DESCRIPTION
1582     This default implementation of the virtual function get_record
1583     reads fields of the next record from the join buffer of this cache.
1584     The function also reads all other fields associated with this record
1585     from the join buffers of the previous caches. The fields are read
1586     into the corresponding record buffers.
1587     It is supposed that 'pos' points to the position in the buffer
1588     right after the previous record when the function is called.
1589     When the function returns the 'pos' value is updated to point
1590     to the position after the read record.
1591     The value of 'curr_rec_pos' is also updated by the function to
1592     point to the beginning of the first field of the record in the
1593     join buffer.
1594 
1595   RETURN VALUE
1596     TRUE    there are no more records to read from the join buffer
1597     FALSE   otherwise
1598 */
1599 
1600 bool JOIN_CACHE::get_record()
1601 {
1602   bool res;
1603   uchar *prev_rec_ptr= 0;
1604   if (with_length)
1605     pos+= size_of_rec_len;
1606   if (prev_cache)
1607   {
1608     pos+= prev_cache->get_size_of_rec_offset();
1609     prev_rec_ptr= prev_cache->get_rec_ref(pos);
1610   }
1611   curr_rec_pos= pos;
1612   if (!(res= read_all_record_fields() == NO_MORE_RECORDS_IN_BUFFER))
1613   {
1614     pos+= referenced_fields*size_of_fld_ofs;
1615     if (prev_cache)
1616       prev_cache->get_record_by_pos(prev_rec_ptr);
1617   }
1618   return res;
1619 }
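
/*
  A minimal usage sketch of the read interface (not a real code path; the
  actual loops in join_matching_records()/join_null_complements() also check
  match flags and apply conditions):

    cache->reset(FALSE);                 // rewind the buffer for reading
    while (!cache->get_record())
    {
      // the fields of the current cached record have been copied into the
      // table record buffers and can now be evaluated
    }
*/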
1620 
1621 
1622 /*
1623   Read a positioned record from the join buffer: the default implementation
1624 
1625   SYNOPSIS
1626     get_record_by_pos()
1627       rec_ptr  position of the first field of the record in the join buffer
1628 
1629   DESCRIPTION
1630     This default implementation of the virtual function get_record_by_pos
1631     reads the fields of the record positioned at 'rec_ptr' from the join buffer.
1632     The function also reads all other fields associated with this record
1633     from the join buffers of the previous caches. The fields are read
1634     into the corresponding record buffers.
1635 
1636   RETURN VALUE
1637     none
1638 */
1639 
1640 void JOIN_CACHE::get_record_by_pos(uchar *rec_ptr)
1641 {
1642   uchar *save_pos= pos;
1643   pos= rec_ptr;
1644   read_all_record_fields();
1645   pos= save_pos;
1646   if (prev_cache)
1647   {
1648     uchar *prev_rec_ptr= prev_cache->get_rec_ref(rec_ptr);
1649     prev_cache->get_record_by_pos(prev_rec_ptr);
1650   }
1651 }
1652 
1653 
1654 /*
1655   Get the match flag from the referenced record: the default implementation
1656 
1657   SYNOPSIS
1658     get_match_flag_by_pos()
1659       rec_ptr  position of the first field of the record in the join buffer
1660 
1661   DESCRIPTION
1662     This default implementation of the virtual function get_match_flag_by_pos
1663     gets the match flag for the record pointed to by the reference at the
1664     position rec_ptr. If the match flag is placed in one of the previous
1665     buffers the function first reaches the linked record fields in this buffer.
1666     The function returns the value of the first encountered match flag.
1667 
1668   RETURN VALUE
1669     match flag for the record at the position rec_ptr
1670 */
1671 
1672 enum JOIN_CACHE::Match_flag JOIN_CACHE::get_match_flag_by_pos(uchar *rec_ptr)
1673 {
1674   Match_flag match_fl= MATCH_NOT_FOUND;
1675   if (with_match_flag)
1676   {
1677     match_fl= (enum Match_flag) rec_ptr[0];
1678     return match_fl;
1679   }
1680   if (prev_cache)
1681   {
1682     uchar *prev_rec_ptr= prev_cache->get_rec_ref(rec_ptr);
1683     return prev_cache->get_match_flag_by_pos(prev_rec_ptr);
1684   }
1685   DBUG_ASSERT(0);
1686   return match_fl;
1687 }
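
/*
  Illustration (assumed setup): two linked caches c1 -> c2 where only c1
  stores match flags (with_match_flag is set only for c1). For a record of
  c2 positioned at rec_ptr2, the call

    c2->get_match_flag_by_pos(rec_ptr2)

  finds no flag locally, follows the stored reference
  prev_rec= c1->get_rec_ref(rec_ptr2) and returns the first byte of the
  linked record in c1's buffer as the match flag.
*/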
1688 
1689 
1690 /*
1691   Get the match flag for the referenced record from specified join buffer
1692 
1693   SYNOPSIS
1694     get_match_flag_by_pos_from_join_buffer()
1695       rec_ptr  position of the first field of the record in the join buffer
1696       tab      join table with join buffer where to look for the match flag
1697 
1698   DESCRIPTION
1699     This default implementation of the get_match_flag_by_pos_from_join_buffer
1700     method gets the match flag for the record pointed to by the reference at the
1701     position rec_ptr from the join buffer attached to the join table tab.
1702 
1703   RETURN VALUE
1704     match flag for the record at the position rec_ptr from the join
1705     buffer attached to the table tab.
1706 */
1707 
1708 enum JOIN_CACHE::Match_flag
1709 JOIN_CACHE::get_match_flag_by_pos_from_join_buffer(uchar *rec_ptr,
1710                                                    JOIN_TAB *tab)
1711 {
1712   DBUG_ASSERT(tab->cache && tab->cache->with_match_flag);
1713   for (JOIN_CACHE *cache= this; ; )
1714   {
1715     if (cache->join_tab == tab)
1716       return (enum Match_flag) rec_ptr[0];
1717     cache= cache->prev_cache;
1718     rec_ptr= cache->get_rec_ref(rec_ptr);
1719   }
1720 }
1721 
1722 
1723 /*
1724   Calculate the increment of the auxiliary buffer for a record write
1725 
1726   SYNOPSIS
1727     aux_buffer_incr()
1728       recno   the number of the record the increment to be calculated for
1729 
1730   DESCRIPTION
1731     This function calls the aux_buffer_incr method of the
1732     companion member join_tab_scan to calculate the growth of the
1733     auxiliary buffer when the recno-th record is added to the
1734     join_buffer of this cache.
1735 
1736   RETURN VALUE
1737     the number of bytes in the increment
1738 */
1739 
1740 uint JOIN_CACHE::aux_buffer_incr(size_t recno)
1741 {
1742   return join_tab_scan->aux_buffer_incr(recno);
1743 }
1744 
1745 /*
1746   Read all flag and data fields of a record from the join buffer
1747 
1748   SYNOPSIS
1749     read_all_record_fields()
1750 
1751   DESCRIPTION
1752     The function reads all flag and data fields of a record from the join
1753     buffer into the corresponding record buffers.
1754     The fields are read starting from the position 'pos' which is
1755     supposed to point to the beginning of the first record field.
1756     The function increments the value of 'pos' by the length of the
1757     read data.
1758 
1759   RETURN VALUE
1760     (-1)   if there are no more records in the join buffer
1761     length of the data read from the join buffer - otherwise
1762 */
1763 
1764 uint JOIN_CACHE::read_all_record_fields()
1765 {
1766   uchar *init_pos= pos;
1767 
1768   if (pos > last_rec_pos || !records)
1769     return NO_MORE_RECORDS_IN_BUFFER;
1770 
1771   /* First match flag, read null bitmaps and null_row flag for each table */
1772   read_flag_fields();
1773 
1774   /* Now read the remaining table fields if needed */
1775   CACHE_FIELD *copy= field_descr+flag_fields;
1776   CACHE_FIELD *copy_end= field_descr+fields;
1777   bool blob_in_rec_buff= blob_data_is_in_rec_buff(init_pos);
1778   for ( ; copy < copy_end; copy++)
1779     read_record_field(copy, blob_in_rec_buff);
1780 
1781   return (uint) (pos-init_pos);
1782 }
1783 
1784 
1785 /*
1786   Read all flag fields of a record from the join buffer
1787 
1788   SYNOPSIS
1789     read_flag_fields()
1790 
1791   DESCRIPTION
1792     The function reads all flag fields of a record from the join
1793     buffer into the corresponding record buffers.
1794     The fields are read starting from the position 'pos'.
1795     The function increments the value of 'pos' by the length of the
1796     read data.
1797 
1798   RETURN VALUE
1799     length of the data read from the join buffer
1800 */
1801 
1802 uint JOIN_CACHE::read_flag_fields()
1803 {
1804   uchar *init_pos= pos;
1805   CACHE_FIELD *copy= field_descr;
1806   CACHE_FIELD *copy_end= copy+flag_fields;
1807   if (with_match_flag)
1808   {
1809     copy->str[0]= MY_TEST((Match_flag) pos[0] == MATCH_FOUND);
1810     pos+= copy->length;
1811     copy++;
1812   }
1813   for ( ; copy < copy_end; copy++)
1814   {
1815     memcpy(copy->str, pos, copy->length);
1816     pos+= copy->length;
1817   }
1818   return (uint)(pos-init_pos);
1819 }
1820 
1821 
1822 /*
1823   Read a data record field from the join buffer
1824 
1825   SYNOPSIS
1826     read_record_field()
1827       copy             the descriptor of the data field to be read
1828       blob_in_rec_buff indicates whether this is the field from the record
1829                        whose blob data are in record buffers
1830 
1831   DESCRIPTION
1832     The function reads the data field specified by the parameter copy
1833     from the join buffer into the corresponding record buffer.
1834     The field is read starting from the position 'pos'.
1835     The data of blob values is not copied from the join buffer.
1836     The function increments the value of 'pos' by the length of the
1837     read data.
1838 
1839   RETURN VALUE
1840     length of the data read from the join buffer
1841 */
1842 
1843 uint JOIN_CACHE::read_record_field(CACHE_FIELD *copy, bool blob_in_rec_buff)
1844 {
1845   uint len;
1846   /* Do not copy the field if its value is null */
1847   if (copy->field && copy->field->maybe_null() && copy->field->is_null())
1848     return 0;
1849   switch (copy->type) {
1850   case CACHE_BLOB:
1851     {
1852       Field_blob *blob_field= (Field_blob *) copy->field;
1853       /*
1854         Copy the length and the pointer to data but not the blob data
1855         itself to the record buffer
1856       */
1857       if (blob_in_rec_buff)
1858       {
1859         blob_field->set_image(pos, copy->length + sizeof(char*),
1860                               blob_field->charset());
1861         len= copy->length + sizeof(char*);
1862       }
1863       else
1864       {
1865         blob_field->set_ptr(pos, pos+copy->length);
1866         len= copy->length + blob_field->get_length();
1867       }
1868     }
1869     break;
1870   case CACHE_VARSTR1:
1871     /* Copy the significant part of the short varstring field */
1872     len= (uint) pos[0] + 1;
1873     memcpy(copy->str, pos, len);
1874     break;
1875   case CACHE_VARSTR2:
1876     /* Copy the significant part of the long varstring field */
1877     len= uint2korr(pos) + 2;
1878     memcpy(copy->str, pos, len);
1879     break;
1880   case CACHE_STRIPPED:
1881     /* Pad the value with the spaces that have been stripped off */
1882     len= uint2korr(pos);
1883     memcpy(copy->str, pos+2, len);
1884     memset(copy->str+len, ' ', copy->length-len);
1885     len+= 2;
1886     break;
1887   case CACHE_ROWID:
1888     if (!copy->str)
1889     {
1890       len= copy->length;
1891       break;
1892     }
1893     /* fall through */
1894   default:
1895     /* Copy the entire image of the field from the record buffer */
1896     len= copy->length;
1897     memcpy(copy->str, pos, len);
1898   }
1899   pos+= len;
1900   return len;
1901 }
1902 
1903 
1904 /*
1905   Read a referenced field from the join buffer
1906 
1907   SYNOPSIS
1908     read_referenced_field()
1909       copy         pointer to the descriptor of the referenced field
1910       rec_ptr      pointer to the record that may contain this field
1911       len  IN/OUT  total length of the record fields
1912 
1913   DESCRIPTION
1914     The function checks whether copy points to a data field descriptor
1915     for this cache object. If it does not then the function returns
1916     FALSE. Otherwise the function reads the field of the record in
1917     the join buffer pointed to by 'rec_ptr' into the corresponding record
1918     buffer and returns TRUE.
1919     If the value of *len is 0 then the function sets it to the total
1920     length of the record fields including possible trailing offset
1921     values. Otherwise *len is supposed to provide this value that
1922     has been obtained earlier.
1923 
1924   NOTE
1925     If the value of the referenced field is null then the offset
1926     for the value is set to 0. If the value of a field can be null
1927     then the value of flag_fields is always positive. So the offset
1928     for any non-null value cannot be 0 in this case.
1929 
1930   RETURN VALUE
1931     TRUE   'copy' points to a data descriptor of this join cache
1932     FALSE  otherwise
1933 */
1934 
1935 bool JOIN_CACHE::read_referenced_field(CACHE_FIELD *copy,
1936                                        uchar *rec_ptr,
1937                                        uint *len)
1938 {
1939   uchar *ptr;
1940   uint offset;
1941   if (copy < field_descr || copy >= field_descr+fields)
1942     return FALSE;
1943   if (!*len)
1944   {
1945     /* Get the total length of the record fields */
1946     uchar *len_ptr= rec_ptr;
1947     if (prev_cache)
1948       len_ptr-= prev_cache->get_size_of_rec_offset();
1949     *len= get_rec_length(len_ptr-size_of_rec_len);
1950   }
1951 
1952   ptr= rec_ptr-(prev_cache ? prev_cache->get_size_of_rec_offset() : 0);
1953   offset= get_fld_offset(ptr+ *len -
1954                          size_of_fld_ofs*
1955                          (referenced_fields+1-copy->referenced_field_no));
1956   bool is_null= FALSE;
1957   Field *field= copy->field;
1958   if (offset == 0 && flag_fields)
1959     is_null= TRUE;
1960   if (is_null)
1961   {
1962     field->set_null();
1963     if (!field->real_maybe_null())
1964       field->table->null_row= 1;
1965   }
1966   else
1967   {
1968     uchar *save_pos= pos;
1969     field->set_notnull();
1970     if (!field->real_maybe_null())
1971       field->table->null_row= 0;
1972     pos= rec_ptr+offset;
1973     read_record_field(copy, blob_data_is_in_rec_buff(rec_ptr));
1974     pos= save_pos;
1975   }
1976   return TRUE;
1977 }
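
/*
  A worked example of the offset lookup above (numbers are illustrative):
  assume referenced_fields == 3, size_of_fld_ofs == 2 and *len is the total
  length of the record fields including the three trailing offsets. For a
  field with referenced_field_no == 2 the offset is read at

    ptr + *len - 2*(3 + 1 - 2)  =  ptr + *len - 4,

  i.e. the second of the three 2-byte offsets stored at the end of the
  record, which matches the position where write_record_data() stored it
  (start of the offset area + size_of_fld_ofs*(referenced_field_no-1)).
*/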
1978 
1979 
1980 /*
1981   Skip record from join buffer if it's already matched: default implementation
1982 
1983   SYNOPSIS
1984     skip_if_matched()
1985 
1986   DESCRIPTION
1987     This default implementation of the virtual function skip_if_matched
1988     skips the next record from the join buffer if its  match flag is set to
1989     MATCH_FOUND.
1990     If the record is skipped the value of 'pos' is set to point to the position
1991     right after the record.
1992 
1993   NOTE
1994     Currently this function is called only when generating null complemented
1995     records for outer joins (=> only when join_tab->first_unmatched != NULL).
1996 
1997   RETURN VALUE
1998     TRUE   the match flag is set to MATCH_FOUND and the record has been skipped
1999     FALSE  otherwise
2000 */
2001 
2002 bool JOIN_CACHE::skip_if_matched()
2003 {
2004   DBUG_ASSERT(with_length);
2005   uint offset= size_of_rec_len;
2006   if (prev_cache)
2007     offset+= prev_cache->get_size_of_rec_offset();
2008   /* Check whether the match flag is MATCH_FOUND */
2009   if (get_match_flag_by_pos_from_join_buffer(pos+offset,
2010                                              join_tab->first_unmatched) ==
2011       MATCH_FOUND)
2012   {
2013     pos+= size_of_rec_len + get_rec_length(pos);
2014     return TRUE;
2015   }
2016   return FALSE;
2017 }
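
/*
  Example of the skip arithmetic above (values are illustrative): with
  size_of_rec_len == 2 and get_rec_length(pos) == 58, a skipped record
  advances 'pos' by 2 + 58 = 60 bytes, i.e. right past the record to the
  length prefix of the next one.
*/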
2018 
2019 
2020 /*
2021   Skip record from join buffer if the match isn't needed: default implementation
2022 
2023   SYNOPSIS
2024     skip_if_not_needed_match()
2025 
2026   DESCRIPTION
2027     This default implementation of the virtual function skip_if_not_needed_match
2028     skips the next record from the join buffer when generating join extensions
2029     for the records in the join buffer depending on the value of the match flag.
2030       - In the case of a semi-join nest the match flag may be in two states
2031         {MATCH_NOT_FOUND, MATCH_FOUND}. The record is skipped if the flag is set
2032         to MATCH_FOUND.
2033       - In the case of an outer join nest when the not_exists optimization is
2034         applied the match flag may be in three states {MATCH_NOT_FOUND,
2035         MATCH_IMPOSSIBLE, MATCH_FOUND}. The record is skipped if the flag is
2036         set to MATCH_FOUND or to MATCH_IMPOSSIBLE.
2037 
2038     If the record is skipped the value of 'pos' is set to point to the position
2039     right after the record.
2040 
2041   NOTE
2042     Currently the function is called only when generating non-null complemented
2043     extensions for records in the join buffer.
2044 
2045   RETURN VALUE
2046     TRUE    the record has to be skipped
2047     FALSE   otherwise
2048 */
2049 
2050 bool JOIN_CACHE::skip_if_not_needed_match()
2051 {
2052   DBUG_ASSERT(with_length);
2053   enum Match_flag match_fl;
2054   uint offset= size_of_rec_len;
2055   bool skip= FALSE;
2056   if (prev_cache)
2057     offset+= prev_cache->get_size_of_rec_offset();
2058 
2059   if (!join_tab->check_only_first_match())
2060     return FALSE;
2061 
2062   match_fl= get_match_flag_by_pos(pos+offset);
2063   skip= join_tab->first_sj_inner_tab ?
2064         match_fl == MATCH_FOUND :           // the case of semi-join
2065         match_fl != MATCH_NOT_FOUND;        // the case of outer-join
2066 
2067   if (skip)
2068   {
2069     pos+= size_of_rec_len + get_rec_length(pos);
2070     return TRUE;
2071   }
2072   return FALSE;
2073 }
2074 
2075 
2076 /*
2077   Restore the fields of the last record from the join buffer
2078 
2079   SYNOPSIS
2080     restore_last_record()
2081 
2082   DESCRIPTION
2083     This function restores in the record buffers the values of the fields
2084     of the last record put into the join buffer. The values most probably
2085     have been overwritten by the field values from other records when they
2086     were read from the join buffer into the record buffers in order to check
2087     pushdown predicates.
2088 
2089   RETURN
2090     none
2091 */
2092 
2093 void JOIN_CACHE::restore_last_record()
2094 {
2095   if (records)
2096     get_record_by_pos(last_rec_pos);
2097 }
2098 
2099 
2100 /*
2101   Join records from the join buffer with records from the next join table
2102 
2103   SYNOPSIS
2104     join_records()
2105       skip_last    do not find matches for the last record from the buffer
2106 
2107   DESCRIPTION
2108     The function extends all records from the join buffer by the matched
2109     records from join_tab. In the case of an outer join operation it also
2110     adds null complementing extensions for the records from the join buffer
2111     that have no match.
2112     No extensions are generated for the last record from the buffer if
2113     skip_last is true.
2114 
2115   NOTES
2116     The function must make sure that if linked join buffers are used then
2117     a join buffer cannot be refilled again until all extensions in the
2118     buffers chained to this one are generated.
2119     Currently an outer join operation with several inner tables always uses
2120     at least two linked buffers with the match join flags placed in the
2121     first buffer. Any record composed of rows of the inner tables that
2122     matches a record in this buffer must refer to the position of the
2123     corresponding match flag.
2124 
2125   IMPLEMENTATION
2126     When generating extensions for outer tables of an outer join operation
2127     first we generate all extensions for those records from the join buffer
2128     that have matches, after which null complementing extensions for all
2129     unmatched records from the join buffer are generated.
2130 
2131   RETURN VALUE
2132     return one of enum_nested_loop_state, except NESTED_LOOP_NO_MORE_ROWS.
2133 */
2134 
2135 enum_nested_loop_state JOIN_CACHE::join_records(bool skip_last)
2136 {
2137   JOIN_TAB *tab;
2138   enum_nested_loop_state rc= NESTED_LOOP_OK;
2139   bool outer_join_first_inner= join_tab->is_first_inner_for_outer_join();
2140   DBUG_ENTER("JOIN_CACHE::join_records");
2141 
2142   if (outer_join_first_inner && !join_tab->first_unmatched)
2143     join_tab->not_null_compl= TRUE;
2144 
2145   if (!join_tab->first_unmatched)
2146   {
2147     bool pfs_batch_update= join_tab->pfs_batch_update(join);
2148     if (pfs_batch_update)
2149       join_tab->table->file->start_psi_batch_mode();
2150     /* Find all records from join_tab that match records from join buffer */
2151     rc= join_matching_records(skip_last);
2152     if (pfs_batch_update)
2153       join_tab->table->file->end_psi_batch_mode();
2154     if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
2155       goto finish;
2156     if (outer_join_first_inner)
2157     {
2158       if (next_cache && join_tab != join_tab->last_inner)
2159       {
2160         /*
2161           Ensure that all matches for outer records from join buffer are to be
2162           found. Now we ensure that all full records are found for records from
2163           join buffer. Generally this is overkill.
2164           TODO: Ensure that only matches of the inner table records have to be
2165           found for the records from join buffer.
2166 	*/
2167         rc= next_cache->join_records(skip_last);
2168         if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
2169           goto finish;
2170       }
2171       join_tab->not_null_compl= FALSE;
2172       /*
2173         Prepare for generation of null complementing extensions.
2174         For all inner tables of the outer join operation for which
2175         regular matches have been just found the field 'first_unmatched'
2176         is set to point to the first inner table. After all null
2177         complement rows are generated for this outer join this field
2178         is set back to NULL.
2179       */
2180       for (tab= join_tab->first_inner; tab <= join_tab->last_inner; tab++)
2181         tab->first_unmatched= join_tab->first_inner;
2182     }
2183   }
2184   if (join_tab->first_unmatched)
2185   {
2186     if (is_key_access())
2187       restore_last_record();
2188 
2189     /*
2190       Generate all null complementing extensions for the records from
2191       join buffer that don't have any matching rows from the inner tables.
2192     */
2193     reset(FALSE);
2194     rc= join_null_complements(skip_last);
2195     if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
2196       goto finish;
2197   }
2198   if(next_cache)
2199   {
2200     /*
2201       When using linked caches we must ensure the records in the next caches
2202       that refer to the records in the join buffer are fully extended.
2203       Otherwise we could have references to the records that have been
2204       already erased from the join buffer and replaced for new records.
2205     */
2206     rc= next_cache->join_records(skip_last);
2207     if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
2208       goto finish;
2209   }
2210 
2211   if (skip_last)
2212   {
2213     DBUG_ASSERT(!is_key_access());
2214     /*
2215        Restore the last record from the join buffer to generate
2216        all extensions for it.
2217     */
2218     get_record();
2219   }
2220 
2221 finish:
2222   if (outer_join_first_inner &&
2223       join_tab->first_inner == join_tab->first_unmatched)
2224   {
2225     /*
2226       All null complemented rows have been already generated for all
2227       outer records from join buffer. Restore the state of the
2228       first_unmatched values to 0 to avoid another null complementing.
2229     */
2230     for (tab= join_tab->first_inner; tab <= join_tab->last_inner; tab++)
2231       tab->first_unmatched= 0;
2232   }
2233   restore_last_record();
2234   reset(TRUE);
2235   DBUG_PRINT("exit", ("rc: %d", rc));
2236   DBUG_RETURN(rc);
2237 }
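
/*
  A condensed sketch of what join_records() above does for one refill of the
  buffer when join_tab is the first inner table of an outer join (informal;
  error handling and some conditions omitted):

    join_matching_records(skip_last);             // regular matches
    if (next_cache && join_tab != last_inner)
      next_cache->join_records(skip_last);        // extend linked caches
    set first_unmatched for all inner tables;
    reset(FALSE);
    join_null_complements(skip_last);             // null-complemented rows
    if (next_cache)
      next_cache->join_records(skip_last);
    clear first_unmatched; restore_last_record();
    reset(TRUE);                                  // ready to be refilled
*/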
2238 
2239 
2240 /*
2241   Find matches from the next table for records from the join buffer
2242 
2243   SYNOPSIS
2244     join_matching_records()
2245       skip_last    do not look for matches for the last partial join record
2246 
2247   DESCRIPTION
2248     The function retrieves rows of the join_tab table and checks whether they
2249     match partial join records from the join buffer. If a match is found
2250     the function will call the sub_select function trying to look for matches
2251     for the remaining join operations.
2252     This function currently is called only from the function join_records.
2253     If the value of skip_last is true the function writes the partial join
2254     record from the record buffer into the join buffer to save its value for
2255     the future processing in the caller function.
2256 
2257   NOTES
2258     If employed by BNL or BNLH join algorithms the function performs a full
2259     scan of join_tab for each refill of the join buffer. If BKA or BKAH
2260     algorithms are used then the function iterates only over those records
2261     from join_tab that can be accessed by keys built over records in the join
2262     buffer. To apply a proper method of iteration the function just calls
2263     virtual iterator methods (open, next, close) of the member join_tab_scan.
2264     The member can be either of the JOIN_TAB_SCAN or JOIN_TAB_SCAN_MRR type.
2265     The class JOIN_TAB_SCAN provides the iterator methods for BNL/BNLH join
2266     algorithms. The class JOIN_TAB_SCAN_MRR provides the iterator methods
2267     for BKA/BKAH join algorithms.
2268     When the function looks for records from the join buffer that would
2269     match a record from join_tab it iterates either over all records in
2270     the buffer or only over selected records. If BNL join operation is
2271     performed all records are checked for the match. If BNLH or BKAH
2272     algorithm is employed to join join_tab then the function looks only
2273     through the records with the same join key as the record from join_tab.
2274     With the BKA join algorithm only one record from the join buffer is checked
2275     for a match for any record from join_tab. To iterate over the candidates
2276     for a match the virtual function get_next_candidate_for_match is used,
2277     while the virtual function prepare_look_for_matches is called to prepare
2278     for such iteration process.
2279 
2280   NOTES
2281     The function produces all matching extensions for the records in the
2282     join buffer following the path of the employed blocked algorithm.
2283     When an outer join operation is performed all unmatched records from
2284     the join buffer must be extended by null values. The function
2285     'join_null_complements' serves this purpose.
2286 
2287   RETURN VALUE
2288     return one of enum_nested_loop_state
2289 */
2290 
2291 enum_nested_loop_state JOIN_CACHE::join_matching_records(bool skip_last)
2292 {
2293   int error;
2294   enum_nested_loop_state rc= NESTED_LOOP_OK;
2295   join_tab->table->null_row= 0;
2296   bool check_only_first_match=
2297     join_tab->check_only_first_match() &&
2298     (!join_tab->first_inner ||                            // semi-join case
2299      join_tab->first_inner == join_tab->first_unmatched); // outer join case
2300   bool outer_join_first_inner= join_tab->is_first_inner_for_outer_join();
2301   DBUG_ENTER("JOIN_CACHE::join_matching_records");
2302 
2303   /* Return at once if there are no records in the join buffer */
2304   if (!records)
2305     DBUG_RETURN(NESTED_LOOP_OK);
2306 
2307   /*
2308     When joining we read records from the join buffer back into record buffers.
2309     If matches for the last partial join record are found through a call to
2310     the sub_select function then this partial join record must be saved in the
2311     join buffer in order to be restored just before the sub_select call.
2312   */
2313   if (skip_last)
2314     put_record();
2315 
2316   if (join_tab->use_quick == 2 && join_tab->select->quick)
2317   {
2318     /* A dynamic range access was used last. Clean up after it */
2319     delete join_tab->select->quick;
2320     join_tab->select->quick= 0;
2321   }
2322 
2323   if ((rc= join_tab_execution_startup(join_tab)) < 0)
2324     goto finish2;
2325 
2326   join_tab->build_range_rowid_filter_if_needed();
2327 
2328   /* Prepare to retrieve all records of the joined table */
2329   if (unlikely((error= join_tab_scan->open())))
2330   {
2331     /*
2332       TODO: if we get here, we will assert in net_send_statement(). Add test
2333       coverage and fix.
2334     */
2335     goto finish;
2336   }
2337 
2338   while (!(error= join_tab_scan->next()))
2339   {
2340     if (unlikely(join->thd->check_killed()))
2341     {
2342       /* The user has aborted the execution of the query */
2343       rc= NESTED_LOOP_KILLED;
2344       goto finish;
2345     }
2346 
2347     if (join_tab->keep_current_rowid)
2348       join_tab->table->file->position(join_tab->table->record[0]);
2349 
2350     /* Prepare to read matching candidates from the join buffer */
2351     if (prepare_look_for_matches(skip_last))
2352       continue;
2353     join_tab->jbuf_tracker->r_scans++;
2354 
2355     uchar *rec_ptr;
2356     /* Read each possible candidate from the buffer and look for matches */
2357     while ((rec_ptr= get_next_candidate_for_match()))
2358     {
2359       join_tab->jbuf_tracker->r_rows++;
2360       /*
2361         If only the first match is needed and it has already been found for
2362         the next record read from the join buffer, then the record is skipped.
2363         Also those records that must be null complemented are not considered
2364         as candidates for matches.
2365       */
2366       if ((!check_only_first_match && !outer_join_first_inner) ||
2367           !skip_next_candidate_for_match(rec_ptr))
2368       {
2369 	read_next_candidate_for_match(rec_ptr);
2370         rc= generate_full_extensions(rec_ptr);
2371         if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
2372 	  goto finish;
2373       }
2374     }
2375   }
2376 
2377 finish:
2378   if (error)
2379     rc= error < 0 ? NESTED_LOOP_NO_MORE_ROWS: NESTED_LOOP_ERROR;
2380 finish2:
2381   join_tab_scan->close();
2382   DBUG_RETURN(rc);
2383 }
2384 
2385 
2386 /*
2387   Set match flag for a record in join buffer if it has not been set yet
2388 
2389   SYNOPSIS
2390     set_match_flag_if_none()
2391       first_inner     the join table to which this flag is attached
2392       rec_ptr         pointer to the record in the join buffer
2393 
2394   DESCRIPTION
2395     If the records of the table are accumulated in a join buffer the function
2396     sets the match flag for the record in the buffer that is referred to by
2397     the record from this cache positioned at 'rec_ptr'.
2398     The function also sets the match flag 'found' of the table first_inner
2399     if it has not been set before.
2400 
2401   NOTES
2402     The function assumes that the match flag for any record in any cache
2403     is placed in the first byte occupied by the record fields.
2404 
2405   RETURN VALUE
2406     TRUE   the match flag is set by this call for the first time
2407     FALSE  the match flag has been set before this call
2408 */
2409 
2410 bool JOIN_CACHE::set_match_flag_if_none(JOIN_TAB *first_inner,
2411                                         uchar *rec_ptr)
2412 {
2413   if (!first_inner->cache)
2414   {
2415     /*
2416       Records of the first inner table to which the flag is attached
2417       are not accumulated in a join buffer.
2418     */
2419     if (first_inner->found)
2420       return FALSE;
2421     else
2422     {
2423       first_inner->found= 1;
2424       return TRUE;
2425     }
2426   }
2427   JOIN_CACHE *cache= this;
2428   while (cache->join_tab != first_inner)
2429   {
2430     cache= cache->prev_cache;
2431     DBUG_ASSERT(cache);
2432     rec_ptr= cache->get_rec_ref(rec_ptr);
2433   }
2434   if ((Match_flag) rec_ptr[0] != MATCH_FOUND)
2435   {
2436     rec_ptr[0]= MATCH_FOUND;
2437     first_inner->found= 1;
2438     return TRUE;
2439   }
2440   return FALSE;
2441 }
2442 
2443 
2444 /*
2445   Generate all full extensions for a partial join record in the buffer
2446 
2447   SYNOPSIS
2448     generate_full_extensions()
2449       rec_ptr     pointer to the record from join buffer to generate extensions
2450 
2451   DESCRIPTION
2452     The function first checks whether the current record of 'join_tab' matches
2453     the partial join record from join buffer located at 'rec_ptr'. If it is the
2454     case the function calls the join_tab->next_select method to generate
2455     all full extensions for this partial join match.
2456 
2457   RETURN VALUE
2458     return one of enum_nested_loop_state.
2459 */
2460 
2461 enum_nested_loop_state JOIN_CACHE::generate_full_extensions(uchar *rec_ptr)
2462 {
2463   enum_nested_loop_state rc= NESTED_LOOP_OK;
2464   DBUG_ENTER("JOIN_CACHE::generate_full_extensions");
2465 
2466   /*
2467     Check whether the extended partial join record meets
2468     the pushdown conditions.
2469   */
2470   if (check_match(rec_ptr))
2471   {
2472     int res= 0;
2473 
2474     if (!join_tab->check_weed_out_table ||
2475         !(res= join_tab->check_weed_out_table->sj_weedout_check_row(join->thd)))
2476     {
2477       set_curr_rec_link(rec_ptr);
2478       rc= (join_tab->next_select)(join, join_tab+1, 0);
2479       if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
2480       {
2481         reset(TRUE);
2482         DBUG_RETURN(rc);
2483       }
2484     }
2485     if (res == -1)
2486     {
2487       rc= NESTED_LOOP_ERROR;
2488       DBUG_RETURN(rc);
2489     }
2490   }
2491   else if (unlikely(join->thd->is_error()))
2492     rc= NESTED_LOOP_ERROR;
2493   DBUG_RETURN(rc);
2494 }
2495 
2496 
2497 /*
2498   Check matching to a partial join record from the join buffer
2499 
2500   SYNOPSIS
2501     check_match()
2502       rec_ptr     pointer to the record from join buffer to check matching to
2503 
2504   DESCRIPTION
2505     The function checks whether the current record of 'join_tab' matches
2506     the partial join record from join buffer located at 'rec_ptr'. If this is
2507     the case and 'join_tab' is the last inner table of a semi-join or an outer
2508     join the function turns on the match flag for the 'rec_ptr' record unless
2509     it has been already set.
2510 
2511   NOTES
2512     Setting the match flag on can trigger re-evaluation of pushdown conditions
2513     for the record when join_tab is the last inner table of an outer join.
2514 
2515   RETURN VALUE
2516     TRUE   there is a match
2517     FALSE  there is no match
2518            In this case the caller must also check thd->is_error() to see
2519            if there was a fatal error for the query.
2520 */
2521 
2522 inline bool JOIN_CACHE::check_match(uchar *rec_ptr)
2523 {
2524   /* Check whether pushdown conditions are satisfied */
2525   DBUG_ENTER("JOIN_CACHE:check_match");
2526 
2527   if (join_tab->select && join_tab->select->skip_record(join->thd) <= 0)
2528     DBUG_RETURN(FALSE);
2529 
2530   join_tab->jbuf_tracker->r_rows_after_where++;
2531 
2532   if (!join_tab->is_last_inner_table())
2533     DBUG_RETURN(TRUE);
2534 
2535   /*
2536      This is the last inner table of an outer join,
2537      and maybe of other embedding outer joins, or
2538      this is the last inner table of a semi-join.
2539   */
2540   JOIN_TAB *first_inner= join_tab->get_first_inner_table();
2541   do
2542   {
2543     set_match_flag_if_none(first_inner, rec_ptr);
2544     if (first_inner->check_only_first_match() &&
2545         !join_tab->first_inner)
2546       DBUG_RETURN(TRUE);
2547     /*
2548       This is the first match for the outer table row.
2549       The function set_match_flag_if_none has turned the flag
2550       first_inner->found on. The pushdown predicates for
2551       inner tables must be re-evaluated with this flag on.
2552       Note that, if first_inner is the first inner table
2553       of a semi-join, but is not an inner table of an outer join
2554       such that 'not exists' optimization can  be applied to it,
2555       the re-evaluation of the pushdown predicates is not needed.
2556     */
2557     for (JOIN_TAB *tab= first_inner; tab <= join_tab; tab++)
2558     {
2559       if (tab->select && tab->select->skip_record(join->thd) <= 0)
2560         DBUG_RETURN(FALSE);
2561     }
2562   }
2563   while ((first_inner= first_inner->first_upper) &&
2564          first_inner->last_inner == join_tab);
2565     DBUG_RETURN(TRUE);
2566 }
2567 
2568 
2569 /*
2570   Add null complements for unmatched outer records from join buffer
2571 
2572   SYNOPSIS
2573     join_null_complements()
2574       skip_last    do not add null complements for the last record
2575 
2576   DESCRIPTION
2577     This function is called only for inner tables of outer joins.
2578     The function retrieves all rows from the join buffer and adds null
2579     complements for those of them that do not have matches for outer
2580     table records.
2581     If the 'join_tab' is the last inner table of the embedding outer
2582     join and the null complemented record satisfies the outer join
2583     condition then the corresponding match flag is turned on
2584     unless it has been set earlier. This setting may trigger
2585     re-evaluation of pushdown conditions for the record.
2586 
2587   NOTES
2588     The same implementation of the virtual method join_null_complements
2589     is used for the BNL/BNLH/BKA/BKAH join algorithms.
2590 
2591   RETURN VALUE
2592     return one of enum_nested_loop_state.
2593 */
2594 
2595 enum_nested_loop_state JOIN_CACHE::join_null_complements(bool skip_last)
2596 {
2597   ulonglong cnt;
2598   enum_nested_loop_state rc= NESTED_LOOP_OK;
2599   bool is_first_inner= join_tab == join_tab->first_unmatched;
2600   DBUG_ENTER("JOIN_CACHE::join_null_complements");
2601 
2602   /* Return at once if there are no records in the join buffer */
2603   if (!records)
2604     DBUG_RETURN(NESTED_LOOP_OK);
2605 
2606   cnt= records - (is_key_access() ? 0 : MY_TEST(skip_last));
2607 
2608   /* This function may be called only for inner tables of outer joins */
2609   DBUG_ASSERT(join_tab->first_inner);
2610 
2611   for ( ; cnt; cnt--)
2612   {
2613     if (unlikely(join->thd->check_killed()))
2614     {
2615       /* The user has aborted the execution of the query */
2616       rc= NESTED_LOOP_KILLED;
2617       goto finish;
2618     }
2619     /* Just skip the whole record if a match for it has been already found */
2620     if (!is_first_inner || !skip_if_matched())
2621     {
2622       get_record();
2623       /* The outer row is complemented by nulls for each inner table */
2624       restore_record(join_tab->table, s->default_values);
2625       mark_as_null_row(join_tab->table);
2626       rc= generate_full_extensions(get_curr_rec());
2627       if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
2628         goto finish;
2629     }
2630   }
2631 
2632 finish:
2633   DBUG_RETURN(rc);
2634 }
2635 
2636 
2637 /*
2638   Save data on the join algorithm employed by the join cache
2639 
2640   SYNOPSIS
2641     save_explain_data()
2642       explain  the EXPLAIN_BKA_TYPE structure to store the info in
2643 
2644   DESCRIPTION
2645     This function puts info about the type of the used join buffer (flat or
2646     incremental) and on the type of the employed join algorithm (BNL,
2647     BNLH, BKA or BKAH) to the data structure
2648 
2649   RETURN VALUE
2650    0 ok
2651    1 error
2652 */
2653 
2654 bool JOIN_CACHE::save_explain_data(EXPLAIN_BKA_TYPE *explain)
2655 {
2656   explain->incremental= MY_TEST(prev_cache);
2657 
2658   explain->join_buffer_size= get_join_buffer_size();
2659 
2660   switch (get_join_alg()) {
2661   case BNL_JOIN_ALG:
2662     explain->join_alg= "BNL";
2663     break;
2664   case BNLH_JOIN_ALG:
2665     explain->join_alg= "BNLH";
2666     break;
2667   case BKA_JOIN_ALG:
2668     explain->join_alg= "BKA";
2669     break;
2670   case BKAH_JOIN_ALG:
2671     explain->join_alg= "BKAH";
2672     break;
2673   default:
2674     DBUG_ASSERT(0);
2675   }
2676   return 0;
2677 }
2678 
2679 /**
2680   get thread handle.
2681 */
2682 
2683 THD *JOIN_CACHE::thd()
2684 {
2685   return join->thd;
2686 }
2687 
2688 
2689 static bool add_mrr_explain_info(String *str, uint mrr_mode, handler *file)
2690 {
2691   char mrr_str_buf[128]={0};
2692   int len;
2693   len= file->multi_range_read_explain_info(mrr_mode, mrr_str_buf,
2694                                            sizeof(mrr_str_buf));
2695   if (len > 0)
2696   {
2697     if (str->length())
2698     {
2699       if (str->append(STRING_WITH_LEN("; ")))
2700         return 1;
2701     }
2702     if (str->append(mrr_str_buf, len))
2703       return 1;
2704   }
2705   return 0;
2706 }
2707 
2708 
2709 bool JOIN_CACHE_BKA::save_explain_data(EXPLAIN_BKA_TYPE *explain)
2710 {
2711   if (JOIN_CACHE::save_explain_data(explain))
2712     return 1;
2713   return add_mrr_explain_info(&explain->mrr_type, mrr_mode, join_tab->table->file);
2714 }
2715 
2716 
2717 bool JOIN_CACHE_BKAH::save_explain_data(EXPLAIN_BKA_TYPE *explain)
2718 {
2719   if (JOIN_CACHE::save_explain_data(explain))
2720     return 1;
2721   return add_mrr_explain_info(&explain->mrr_type, mrr_mode, join_tab->table->file);
2722 }
2723 
2724 
2725 /*
2726   Initialize a hashed join cache
2727 
2728   SYNOPSIS
2729     init()
2730       for_explain       join buffer is initialized for explain only
2731 
2732   DESCRIPTION
2733     The function initializes the cache structure with a hash table in it.
2734     The hash table will be used to store key values for the records from
2735     the join buffer.
2736     The function allocates memory for the join buffer and for descriptors of
2737     the record fields stored in the buffer.
2738     The function also initializes a hash table for record keys within the join
2739     buffer space.
2740 
2741   NOTES
2742     The function is supposed to be called by the init methods of the classes
2743     derived from JOIN_CACHE_HASHED.
2744 
2745   RETURN VALUE
2746     0   initialization with buffer allocations has succeeded
2747     1   otherwise
2748 */
2749 
2750 int JOIN_CACHE_HASHED::init(bool for_explain)
2751 {
2752   int rc= 0;
2753   TABLE_REF *ref= &join_tab->ref;
2754 
2755   DBUG_ENTER("JOIN_CACHE_HASHED::init");
2756 
2757   hash_table= 0;
2758   key_entries= 0;
2759 
2760   key_length= ref->key_length;
2761 
2762   if ((rc= JOIN_CACHE::init(for_explain)) || for_explain)
2763     DBUG_RETURN (rc);
2764 
2765   if (!(key_buff= (uchar*) join->thd->alloc(key_length)))
2766     DBUG_RETURN(1);
2767 
2768   /* Take into account a reference to the next record in the key chain */
2769   pack_length+= get_size_of_rec_offset();
2770   pack_length_with_blob_ptrs+= get_size_of_rec_offset();
2771 
2772   ref_key_info= join_tab->get_keyinfo_by_key_no(join_tab->ref.key);
2773   ref_used_key_parts= join_tab->ref.key_parts;
2774 
2775   hash_func= &JOIN_CACHE_HASHED::get_hash_idx_simple;
2776   hash_cmp_func= &JOIN_CACHE_HASHED::equal_keys_simple;
2777 
2778   KEY_PART_INFO *key_part= ref_key_info->key_part;
2779   KEY_PART_INFO *key_part_end= key_part+ref_used_key_parts;
2780   for ( ; key_part < key_part_end; key_part++)
2781   {
2782     if (!key_part->field->eq_cmp_as_binary())
2783     {
2784       hash_func= &JOIN_CACHE_HASHED::get_hash_idx_complex;
2785       hash_cmp_func= &JOIN_CACHE_HASHED::equal_keys_complex;
2786       break;
2787     }
2788   }
2789 
2790   init_hash_table();
2791 
2792   rec_fields_offset= get_size_of_rec_offset()+get_size_of_rec_length()+
2793                      (prev_cache ? prev_cache->get_size_of_rec_offset() : 0);
2794 
2795   data_fields_offset= 0;
2796   if (use_emb_key)
2797   {
2798     CACHE_FIELD *copy= field_descr;
2799     CACHE_FIELD *copy_end= copy+flag_fields;
2800     for ( ; copy < copy_end; copy++)
2801       data_fields_offset+= copy->length;
2802   }
2803 
2804   DBUG_RETURN(0);
2805 }
2806 
2807 
2808 /*
2809   Initialize the hash table of a hashed join cache
2810 
2811   SYNOPSIS
2812     init_hash_table()
2813 
2814   DESCRIPTION
2815     The function estimates the number of hash table entries in the hash
2816     table to be used and initializes this hash table within the join buffer
2817     space.
2818 
2819   RETURN VALUE
2820     Currently the function always returns 0;
2821 */
2822 
2823 int JOIN_CACHE_HASHED::init_hash_table()
2824 {
2825   hash_table= 0;
2826   key_entries= 0;
2827 
2828   /* Calculate the minimal possible value of size_of_key_ofs greater than 1 */
2829   uint max_size_of_key_ofs= MY_MAX(2, get_size_of_rec_offset());
2830   for (size_of_key_ofs= 2;
2831        size_of_key_ofs <= max_size_of_key_ofs;
2832        size_of_key_ofs+= 2)
2833   {
2834     key_entry_length= get_size_of_rec_offset() + // key chain header
2835                       size_of_key_ofs +          // reference to the next key
2836                       (use_emb_key ?  get_size_of_rec_offset() : key_length);
2837 
2838     size_t space_per_rec= avg_record_length +
2839                          avg_aux_buffer_incr +
2840                          key_entry_length+size_of_key_ofs;
2841     size_t n= buff_size / space_per_rec;
2842 
2843     /*
2844       TODO: Make a better estimate for this upper bound of
2845             the number of records in the join buffer.
2846     */
2847     size_t max_n= buff_size / (pack_length-length+
2848                              key_entry_length+size_of_key_ofs);
2849 
2850     hash_entries= (uint) (n / 0.7);
2851     set_if_bigger(hash_entries, 1);
2852 
2853     if (offset_size((uint)(max_n*key_entry_length)) <=
2854         size_of_key_ofs)
2855       break;
2856   }
2857 
2858   /* Initialize the hash table */
2859   hash_table= buff + (buff_size-hash_entries*size_of_key_ofs);
2860   cleanup_hash_table();
2861   curr_key_entry= hash_table;
2862 
2863   return 0;
2864 }
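
/*
  A numeric illustration of the estimate above (all numbers are made up):
  with buff_size = 256*1024, avg_record_length = 100, avg_aux_buffer_incr = 0,
  key_entry_length = 12 and size_of_key_ofs = 2 we get

    space_per_rec = 100 + 0 + 12 + 2 = 114
    n             = 262144 / 114     = 2299
    hash_entries  = 2299 / 0.7       ~ 3284

  so roughly 3284 * 2 bytes at the end of the buffer are reserved for the
  hash table itself.
*/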
2865 
2866 
2867 /*
2868   Reallocate the join buffer of a hashed join cache
2869 
2870   SYNOPSIS
2871     realloc_buffer()
2872 
2873   DESCRIPTION
2874     The function reallocates the join buffer of the hashed join cache.
2875     After this it initializes a hash table within the buffer space and
2876     resets the join cache for writing.
2877 
2878   NOTES
2879     The function assumes that buff_size contains the new value for the join
2880     buffer size.
2881 
2882   RETURN VALUE
2883     0   if the buffer has been successfully reallocated
2884     1   otherwise
2885 */
2886 
2887 int JOIN_CACHE_HASHED::realloc_buffer()
2888 {
2889   free();
2890   buff= (uchar*) my_malloc(key_memory_JOIN_CACHE, buff_size,
2891                                          MYF(MY_THREAD_SPECIFIC));
2892   init_hash_table();
2893   reset(TRUE);
2894   return buff == NULL;
2895 }
2896 
2897 
2898 /*
2899   Get maximum size of the additional space per record used for record keys
2900 
2901   SYNOPSIS
2902     get_max_key_addon_space_per_record()
2903 
2904   DESCRIPTION
2905     The function returns the size of the space occupied by one key entry
2906     and one hash table entry.
2907 
2908   RETURN VALUE
2909     maximum size of the additional space per record that is used to store
2910     record keys in the hash table
2911 */
2912 
2913 uint JOIN_CACHE_HASHED::get_max_key_addon_space_per_record()
2914 {
2915   ulong len;
2916   TABLE_REF *ref= &join_tab->ref;
2917   /*
2918     The total number of hash entries in the hash tables is bounded by
2919     ceiling(N/0.7) where N is the maximum number of records in the buffer.
2920     That's why the multiplier 2 is used in the formula below.
2921   */
2922   len= (use_emb_key ?  get_size_of_rec_offset() : ref->key_length) +
2923         size_of_rec_ofs +    // size of the key chain header
2924         size_of_rec_ofs +    // >= size of the reference to the next key
2925         2*size_of_rec_ofs;   // >= 2*( size of hash table entry)
2926   return len;
2927 }
2928 
2929 
2930 /*
2931   Reset the buffer of a hashed join cache for reading/writing
2932 
2933   SYNOPSIS
2934     reset()
2935       for_writing  if it's TRUE the function reset the buffer for writing
2936 
2937   DESCRIPTION
2938     This implementation of the virtual function reset() resets the join buffer
2939     of the JOIN_CACHE_HASHED class for reading or writing.
2940     In addition to what the default implementation does, this function
2941     cleans up the hash table allocated within the buffer.
2942 
2943   RETURN VALUE
2944     none
2945 */
2946 
2947 void JOIN_CACHE_HASHED::reset(bool for_writing)
2948 {
2949   this->JOIN_CACHE::reset(for_writing);
2950   if (for_writing && hash_table)
2951     cleanup_hash_table();
2952   curr_key_entry= hash_table;
2953 }
2954 
2955 
2956 /*
2957   Add a record into the buffer of a hashed join cache
2958 
2959   SYNOPSIS
2960     put_record()
2961 
2962   DESCRIPTION
2963     This implementation of the virtual function put_record writes the next
2964     matching record into the join buffer of the JOIN_CACHE_HASHED class.
2965     In addition to what the default implementation does, this function
2966     performs the following.
2967     It extracts from the record the key value used in lookups for matching
2968     records and searches for this key in the hash tables from the join cache.
2969     If it finds the key in the hash table it joins the record to the chain
2970     of records with this key. If the key is not found in the hash table the
2971     key is placed into it and a chain containing only the newly added record
2972     is attached to the key entry. The key value is either placed in the hash
2973     element added for the key or, if the use_emb_key flag is set, remains in
2974     the record from the partial join.
2975     If the match flag field of a record contains MATCH_IMPOSSIBLE the key is
2976     not created for this record.
2977 
2978   RETURN VALUE
2979     TRUE    if it has been decided that it should be the last record
2980             in the join buffer,
2981     FALSE   otherwise
2982 */
2983 
2984 bool JOIN_CACHE_HASHED::put_record()
2985 {
2986   bool is_full;
2987   uchar *key;
2988   uint key_len= key_length;
2989   uchar *key_ref_ptr;
2990   uchar *link= 0;
2991   TABLE_REF *ref= &join_tab->ref;
2992   uchar *next_ref_ptr= pos;
2993 
2994   pos+= get_size_of_rec_offset();
2995   /* Write the record into the join buffer */
2996   if (prev_cache)
2997     link= prev_cache->get_curr_rec_link();
2998   write_record_data(link, &is_full);
2999 
3000   if (last_written_is_null_compl)
3001     return is_full;
3002 
3003   if (use_emb_key)
3004     key= get_curr_emb_key();
3005   else
3006   {
3007     /* Build the key over the fields read into the record buffers */
3008     cp_buffer_from_ref(join->thd, join_tab->table, ref);
3009     key= ref->key_buff;
3010   }
3011 
3012   /* Look for the key in the hash table */
3013   if (key_search(key, key_len, &key_ref_ptr))
3014   {
3015     uchar *last_next_ref_ptr;
3016     /*
3017       The key is found in the hash table.
3018       Add the record to the circular list of the records attached to this key.
3019       Below 'rec' is the record to be added into the record chain for the found
3020       key, 'key_ref' points to a flattened representation of the st_key_entry
3021       structure that contains the key and the head of the record chain.
3022     */
3023     last_next_ref_ptr= get_next_rec_ref(key_ref_ptr+get_size_of_key_offset());
3024     /* rec->next_rec= key_entry->last_rec->next_rec */
3025     memcpy(next_ref_ptr, last_next_ref_ptr, get_size_of_rec_offset());
3026     /* key_entry->last_rec->next_rec= rec */
3027     store_next_rec_ref(last_next_ref_ptr, next_ref_ptr);
3028     /* key_entry->last_rec= rec */
3029     store_next_rec_ref(key_ref_ptr+get_size_of_key_offset(), next_ref_ptr);
3030   }
3031   else
3032   {
3033     /*
3034       The key is not found in the hash table.
3035       Put the key into the join buffer linking it with the keys for the
3036       corresponding hash entry. Create a circular list with one element
3037       referencing the record and attach the list to the key in the buffer.
3038     */
3039     uchar *cp= last_key_entry;
3040     cp-= get_size_of_rec_offset()+get_size_of_key_offset();
3041     store_next_key_ref(key_ref_ptr, cp);
3042     store_null_key_ref(cp);
3043     store_next_rec_ref(next_ref_ptr, next_ref_ptr);
3044     store_next_rec_ref(cp+get_size_of_key_offset(), next_ref_ptr);
3045     if (use_emb_key)
3046     {
3047       cp-= get_size_of_rec_offset();
3048       store_emb_key_ref(cp, key);
3049     }
3050     else
3051     {
3052       cp-= key_len;
3053       memcpy(cp, key, key_len);
3054     }
3055     last_key_entry= cp;
3056     DBUG_ASSERT(last_key_entry >= end_pos);
3057     /* Increment the counter of key_entries in the hash table */
3058     key_entries++;
3059   }
3060   return is_full;
3061 }
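
/*
  An informal picture of the buffer layout maintained by the code above for
  a non-embedded key: records (each prefixed by a next_rec_ref link) grow
  up from the beginning of the buffer, while key entries grow down from
  last_key_entry towards end_pos, each entry holding

    [key value][next_key_ref][reference to the last record with this key]

  and the hash table buckets occupy the very end of the buffer. The records
  sharing a key form a circular chain: the key entry references the last
  record added, and each record's next_rec_ref leads to the next one.
*/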
3062 
3063 
3064 /*
3065   Read the next record from the buffer of a hashed join cache
3066 
3067   SYNOPSIS
3068     get_record()
3069 
3070   DESCRIPTION
3071     In addition to what the default implementation of the virtual
3072     function get_record does, this implementation skips the link element
3073     used to connect the records with the same key into a chain.
3074 
3075   RETURN VALUE
3076     TRUE    there are no more records to read from the join buffer
3077     FALSE   otherwise
3078 */
3079 
3080 bool JOIN_CACHE_HASHED::get_record()
3081 {
3082   pos+= get_size_of_rec_offset();
3083   return this->JOIN_CACHE::get_record();
3084 }
3085 
3086 
3087 /*
3088   Skip record from a hashed join buffer if its match flag is set to MATCH_FOUND
3089 
3090   SYNOPSIS
3091     skip_if_matched()
3092 
3093   DESCRIPTION
3094     This implementation of the virtual function skip_if_matched does
3095     the same as the default implementation does, but it takes into account
3096     the link element used to connect the records with the same key into a chain.
3097 
3098   RETURN VALUE
3099     TRUE    the match flag is MATCH_FOUND  and the record has been skipped
3100     FALSE   otherwise
3101 */
3102 
3103 bool JOIN_CACHE_HASHED::skip_if_matched()
3104 {
3105   uchar *save_pos= pos;
3106   pos+= get_size_of_rec_offset();
3107   if (!this->JOIN_CACHE::skip_if_matched())
3108   {
3109     pos= save_pos;
3110     return FALSE;
3111   }
3112   return TRUE;
3113 }
3114 
3115 
3116 /*
3117   Skip record from a hashed join buffer if its match flag dictates to do so
3118 
3119   SYNOPSIS
3120     skip_if_not_needed_match()
3121 
3122   DESCRIPTION
3123     This implementation of the virtual function skip_if_not_needed_match does
3124     the same as the default implementation does, but it takes into account
3125     the link element used to connect the records with the same key into a chain.
3126 
3127   RETURN VALUE
3128     TRUE    the match flag dictates to skip the record
3129     FALSE   the match flag is off
3130 */
3131 
3132 bool JOIN_CACHE_HASHED::skip_if_not_needed_match()
3133 {
3134   uchar *save_pos= pos;
3135   pos+= get_size_of_rec_offset();
3136   if (!this->JOIN_CACHE::skip_if_not_needed_match())
3137   {
3138     pos= save_pos;
3139     return FALSE;
3140   }
3141   return TRUE;
3142 }
3143 
3144 
3145 /*
3146   Search for a key in the hash table of the join buffer
3147 
3148   SYNOPSIS
3149     key_search()
3150       key             pointer to the key value
3151       key_len         key value length
3152       key_ref_ptr OUT position of the reference to the next key from
3153                       the hash element for the found key, or
3154                       the position where the reference to the hash
3155                       element for the key is to be added in the
3156                       case when the key has not been found
3157 
3158   DESCRIPTION
3159     The function looks for a key in the hash table of the join buffer.
3160     If the key is found the function returns the position of the reference
3161     to the next key in the hash element for the given key.
3162     Otherwise the function returns the position where the reference to the
3163     newly created hash element for the given key is to be added.
3164 
3165   RETURN VALUE
3166     TRUE    the key is found in the hash table
3167     FALSE   otherwise
3168 */
3169 
3170 bool JOIN_CACHE_HASHED::key_search(uchar *key, uint key_len,
3171                                    uchar **key_ref_ptr)
3172 {
3173   bool is_found= FALSE;
3174   uint idx= (this->*hash_func)(key, key_length);
3175   uchar *ref_ptr= hash_table+size_of_key_ofs*idx;
3176   while (!is_null_key_ref(ref_ptr))
3177   {
3178     uchar *next_key;
3179     ref_ptr= get_next_key_ref(ref_ptr);
3180     next_key= use_emb_key ? get_emb_key(ref_ptr-get_size_of_rec_offset()) :
3181                             ref_ptr-key_length;
3182 
3183     if ((this->*hash_cmp_func)(next_key, key, key_len))
3184     {
3185       is_found= TRUE;
3186       break;
3187     }
3188   }
3189   *key_ref_ptr= ref_ptr;
3190   return is_found;
3191 }
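
/*
  Illustration (not part of the server code): the lookup above is an
  ordinary separate-chaining hash search. A standalone sketch with an array
  of bucket heads and per-entry 'next' links, assuming fixed-length
  byte-array keys and a caller-supplied hash function:

    #include <string.h>

    struct key_entry { unsigned char *key; struct key_entry *next_key; };

    static struct key_entry *
    find_key(struct key_entry **buckets, unsigned n_buckets,
             unsigned (*hash)(const unsigned char *, unsigned),
             const unsigned char *key, unsigned key_len)
    {
      struct key_entry *e;
      for (e= buckets[hash(key, key_len) % n_buckets]; e; e= e->next_key)
      {
        if (memcmp(e->key, key, key_len) == 0)
          return e;                 // the key is already in this bucket
      }
      return NULL;                  // not found: the caller may insert it here
    }
*/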
3192 
3193 
3194 /*
3195   Hash function that considers a key in the hash table as byte array
3196 
3197   SYNOPSIS
3198     get_hash_idx_simple()
3199       key             pointer to the key value
3200       key_len         key value length
3201 
3202   DESCRIPTION
3203     The function calculates an index of the hash entry in the hash table
3204     of the join buffer for the given key. It considers the key just as
3205     a sequence of bytes of the length key_len.
3206 
3207   RETURN VALUE
3208     the calculated index of the hash entry for the given key
3209 */
3210 
3211 inline
3212 uint JOIN_CACHE_HASHED::get_hash_idx_simple(uchar* key, uint key_len)
3213 {
3214   ulong nr= 1;
3215   ulong nr2= 4;
3216   uchar *pos= key;
3217   uchar *end= key+key_len;
3218   for (; pos < end ; pos++)
3219   {
3220     nr^= (ulong) ((((uint) nr & 63)+nr2)*((uint) *pos))+ (nr << 8);
3221     nr2+= 3;
3222   }
3223   return nr % hash_entries;
3224 }
3225 
3226 
3227 /*
3228   Hash function that takes into account collations of the components of the key
3229 
3230   SYNOPSIS
3231     get_hash_idx_complex()
3232       key             pointer to the key value
3233       key_len         key value length
3234 
3235   DESCRIPTION
3236     The function calculates an index of the hash entry in the hash table
3237     of the join buffer for the given key. It takes into account that the
3238     components of the key may be of a varchar type with different collations.
3239     The function guarantees the same hash value for any two equal
3240     keys even if they differ as byte sequences.
3241     The function takes the info about the components of the key, their
3242     types and used collations from the class member ref_key_info containing
3243     a pointer to the descriptor of the index that can be used for the join
3244     operation.
3245 
3246   RETURN VALUE
3247     the calculated index of the hash entry for the given key
3248 */
3249 
3250 inline
3251 uint JOIN_CACHE_HASHED::get_hash_idx_complex(uchar *key, uint key_len)
3252 {
3253   return
3254     (uint) (key_hashnr(ref_key_info, ref_used_key_parts, key) % hash_entries);
3255 }
3256 
3257 
3258 /*
3259   Compare two key entries in the hash table as sequence of bytes
3260 
3261   SYNOPSIS
3262     equal_keys_simple()
3263       key1            pointer to the first key entry
3264       key2            pointer to the second key entry
3265       key_len         the length of the key values
3266 
3267   DESCRIPTION
3268     The function compares two key entries in the hash table key1 and key2
3269     as two byte sequences of length key_len
3270 
3271   RETURN VALUE
3272     TRUE       key1 coincides with key2
3273     FALSE      otherwise
3274 */
3275 
3276 inline
3277 bool JOIN_CACHE_HASHED::equal_keys_simple(uchar *key1, uchar *key2,
3278                                           uint key_len)
3279 {
3280   return memcmp(key1, key2, key_len) == 0;
3281 }
3282 
3283 
3284 /*
3285   Compare two key entries taking into account the used collation
3286 
3287   SYNOPSIS
3288     equal_keys_complex()
3289       key1            pointer to the first key entry
3290       key2            pointer to the second key entry
3291       key_len         the length of the key values
3292 
3293   DESCRIPTION
3294     The function checks whether two key entries in the hash table
3295     key1 and key2 are equal as, possibly, compound keys of a certain
3296     structure whose components may be of a varchar type and may
3297     employ different collations.
3298     The descriptor of the key structure is taken from the class
3299     member ref_key_info.
3300 
3301   RETURN VALUE
3302     TRUE       key1 is equal to key2
3303     FALSE      otherwise
3304 */
3305 
3306 inline
3307 bool JOIN_CACHE_HASHED::equal_keys_complex(uchar *key1, uchar *key2,
3308                                           uint key_len)
3309 {
3310   return key_buf_cmp(ref_key_info, ref_used_key_parts, key1, key2) == 0;
3311 }
3312 
3313 
3314 /*
3315   Clean up the hash table of the join buffer
3316 
3317   SYNOPSIS
3318     cleanup_hash_table()
3321 
3322   DESCRIPTION
3323     The function cleans up the hash table in the join buffer removing all
3324     hash elements from the table.
3325 
3326   RETURN VALUE
3327     none
3328 */
3329 
3330 void JOIN_CACHE_HASHED::cleanup_hash_table()
3331 {
3332   last_key_entry= hash_table;
3333   bzero(hash_table, (buff+buff_size)-hash_table);
3334   key_entries= 0;
3335 }
3336 
3337 
3338 /*
3339   Check whether all records in a key chain have their match flags set on
3340 
3341   SYNOPSIS
3342     check_all_match_flags_for_key()
3343       key_chain_ptr
3344 
3345   DESCRIPTION
3346     This function retrieves records in the given circular chain and checks
3347     whether their match flags are set on. The parameter key_chain_ptr shall
3348     point to the position in the join buffer storing the reference to the
3349     last element of this chain.
3350 
3351   RETURN VALUE
3352     TRUE   if each retrieved record has its match flag set to MATCH_FOUND
3353     FALSE  otherwise
3354 */
3355 
3356 bool JOIN_CACHE_HASHED::check_all_match_flags_for_key(uchar *key_chain_ptr)
3357 {
3358   uchar *last_rec_ref_ptr= get_next_rec_ref(key_chain_ptr);
3359   uchar *next_rec_ref_ptr= last_rec_ref_ptr;
3360   do
3361   {
3362     next_rec_ref_ptr= get_next_rec_ref(next_rec_ref_ptr);
3363     uchar *rec_ptr= next_rec_ref_ptr+rec_fields_offset;
3364     if (get_match_flag_by_pos(rec_ptr) != MATCH_FOUND)
3365       return FALSE;
3366   }
3367   while (next_rec_ref_ptr != last_rec_ref_ptr);
3368   return TRUE;
3369 }
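
/*
  Illustration (not part of the server code): the do/while loop above visits
  every element of a circular chain exactly once by starting at the element
  after 'last' (i.e. the first record) and stopping once 'last' itself has
  been processed. A standalone sketch of the same "check all elements"
  traversal:

    struct rec_link { struct rec_link *next; bool matched; };

    static bool all_matched(struct rec_link *last)
    {
      struct rec_link *cur= last;
      do
      {
        cur= cur->next;             // the first iteration visits the first record
        if (!cur->matched)
          return false;
      } while (cur != last);
      return true;
    }
*/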
3370 
3371 
3372 /*
3373   Get the next key built for the records from the buffer of a hashed join cache
3374 
3375   SYNOPSIS
3376     get_next_key()
3377       key    pointer to the buffer where the key value is to be placed
3378 
3379   DESCRIPTION
3380     The function reads the next key value stored in the hash table of the
3381     join buffer. Depending on the value of the use_emb_key flag of the
3382     join cache the value is read either from the hash table itself or from
3383     the record field where it occurs.
3384 
3385   RETURN VALUE
3386     length of the key value - if the starting value of 'curr_key_entry' refers
3387     to the position after that referred to by the value of 'last_key_entry',
3388     0 - otherwise.
3389 */
3390 
3391 uint JOIN_CACHE_HASHED::get_next_key(uchar ** key)
3392 {
3393   if (curr_key_entry == last_key_entry)
3394     return 0;
3395 
3396   curr_key_entry-= key_entry_length;
3397 
3398   *key = use_emb_key ? get_emb_key(curr_key_entry) : curr_key_entry;
3399 
3400   DBUG_ASSERT(*key >= buff && *key < hash_table);
3401 
3402   return key_length;
3403 }
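
/*
  Illustration (not part of the server code): key entries are laid out
  downwards from the end of the join buffer, so iterating over them means
  stepping the cursor back by a fixed entry length until it reaches the
  lowest (most recently written) entry. A standalone sketch of this
  backward iteration:

    static const unsigned char *
    next_entry_down(const unsigned char **cursor,
                    const unsigned char *lowest_entry,
                    size_t entry_length)
    {
      if (*cursor == lowest_entry)
        return NULL;                // no more key entries
      *cursor-= entry_length;       // step down to the next entry
      return *cursor;
    }
*/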
3404 
3405 
3406 /*
3407   Initiate an iteration process over records in the joined table
3408 
3409   SYNOPSIS
3410     open()
3411 
3412   DESCRIPTION
3413     The function initiates the process of iteration over records from the
3414     joined table that is repeatedly performed by the BNL/BNLH join algorithms.
3415 
3416   RETURN VALUE
3417     0            the initiation is a success
3418     error code   otherwise
3419 */
3420 
3421 int JOIN_TAB_SCAN::open()
3422 {
3423   save_or_restore_used_tabs(join_tab, FALSE);
3424   is_first_record= TRUE;
3425   join_tab->tracker->r_scans++;
3426   return join_init_read_record(join_tab);
3427 }
3428 
3429 
3430 /*
3431   Read the next record that can match while scanning the joined table
3432 
3433   SYNOPSIS
3434     next()
3435 
3436   DESCRIPTION
3437     The function reads the next record from the joined table that can
3438     match some records in the buffer of the join cache 'cache'. To do
3439     this the function calls the function that scans table records and
3440     looks for the next one that meets the condition pushed to the
3441     joined table join_tab.
3442 
3443   NOTES
3444     The function catches the signal that kills the query.
3445 
3446   RETURN VALUE
3447     0            the next record exists and has been successfully read
3448     error code   otherwise
3449 */
3450 
3451 int JOIN_TAB_SCAN::next()
3452 {
3453   int err= 0;
3454   int skip_rc;
3455   READ_RECORD *info= &join_tab->read_record;
3456   SQL_SELECT *select= join_tab->cache_select;
3457   THD *thd= join->thd;
3458 
3459   if (is_first_record)
3460     is_first_record= FALSE;
3461   else
3462     err= info->read_record();
3463 
3464   if (!err)
3465   {
3466     join_tab->tracker->r_rows++;
3467   }
3468 
3469   while (!err && select && (skip_rc= select->skip_record(thd)) <= 0)
3470   {
3471     if (unlikely(thd->check_killed()) || skip_rc < 0)
3472       return 1;
3473     /*
3474       Move to the next record if the last retrieved record does not
3475       meet the condition pushed to the table join_tab.
3476     */
3477     err= info->read_record();
3478     if (!err)
3479     {
3480       join_tab->tracker->r_rows++;
3481     }
3482   }
3483 
3484   if (!err)
3485     join_tab->tracker->r_rows_after_where++;
3486   return err;
3487 }
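
/*
  Illustration (not part of the server code): the loop above follows a
  common "read and filter" pattern: fetch rows one by one and keep skipping
  them until one passes the pushed-down condition, aborting early when the
  query has been killed. A minimal standalone sketch of the same control
  flow, with hypothetical read_row(), row_matches() and killed() callbacks
  standing in for the handler and THD calls:

    static int next_matching_row(int (*read_row)(void *),
                                 bool (*row_matches)(void *),
                                 bool (*killed)(void *),
                                 void *ctx)
    {
      int err;
      while (!(err= read_row(ctx)))
      {
        if (killed(ctx))
          return 1;                 // treat a killed query as an error
        if (row_matches(ctx))
          return 0;                 // a row passing the condition was found
      }
      return err;                   // end of scan or a read error
    }
*/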
3488 
3489 
3490 /*
3491   Walk back in join order from join_tab until we encounter a join tab with
3492   tab->cache!=NULL, and save/restore tab->table->status along the way.
3493 
3494   @param save TRUE   save
3495               FALSE  restore
3496 */
3497 
3498 static void save_or_restore_used_tabs(JOIN_TAB *join_tab, bool save)
3499 {
3500   JOIN_TAB *first= join_tab->bush_root_tab?
3501                      join_tab->bush_root_tab->bush_children->start :
3502                      join_tab->join->join_tab + join_tab->join->const_tables;
3503 
3504   for (JOIN_TAB *tab= join_tab-1; tab != first && !tab->cache; tab--)
3505   {
3506     if (tab->bush_children)
3507     {
3508       for (JOIN_TAB *child= tab->bush_children->start;
3509            child != tab->bush_children->end;
3510            child++)
3511       {
3512         if (save)
3513           child->table->status= child->status;
3514         else
3515         {
3516           tab->status= tab->table->status;
3517           tab->table->status= 0;
3518         }
3519       }
3520     }
3521 
3522     if (save)
3523       tab->table->status= tab->status;
3524     else
3525     {
3526       tab->status= tab->table->status;
3527       tab->table->status= 0;
3528     }
3529   }
3530 }
3531 
3532 
3533 /*
3534   Perform finalizing actions for a scan over the table records
3535 
3536   SYNOPSIS
3537     close()
3538 
3539   DESCRIPTION
3540     The function performs the necessary restoring actions after
3541     the table scan over the joined table has been finished.
3542 
3543   RETURN VALUE
3544     none
3545 */
3546 
3547 void JOIN_TAB_SCAN::close()
3548 {
3549   save_or_restore_used_tabs(join_tab, TRUE);
3550 }
3551 
3552 
3553 /*
3554   Prepare to iterate over the BNL join cache buffer to look for matches
3555 
3556   SYNOPSIS
3557     prepare_look_for_matches()
3558       skip_last   <-> ignore the last record in the buffer
3559 
3560   DESCRIPTION
3561     The function prepares the join cache for an iteration over the
3562     records in the join buffer. The iteration is performed when looking
3563     for matches for the record from the joined table join_tab that
3564     has been placed into the record buffer of the joined table.
3565     If the value of the parameter skip_last is TRUE then the last
3566     record from the join buffer is ignored.
3567     The function initializes the counter of the records that have not
3568     been iterated over yet.
3569 
3570   RETURN VALUE
3571     TRUE    there are no records in the buffer to iterate over
3572     FALSE   otherwise
3573 */
3574 
3575 bool JOIN_CACHE_BNL::prepare_look_for_matches(bool skip_last)
3576 {
3577   if (!records)
3578     return TRUE;
3579   reset(FALSE);
3580   rem_records= (uint)records - MY_TEST(skip_last);
3581   return rem_records == 0;
3582 }
3583 
3584 
3585 /*
3586   Get next record from the BNL join cache buffer when looking for matches
3587 
3588   SYNOPSIS
3589     get_next_candidate_for_match
3590 
3591   DESCRIPTION
3592     This method is used for iterations over the records from the join
3593     cache buffer when looking for matches for records from join_tab.
3594     The method performs the necessary preparations to read the next record
3595     from the join buffer into the record buffer by the method
3596     read_next_candidate_for_match, or, to skip the next record from the join
3597     buffer by the method skip_next_candidate_for_match.
3598     This implementation of the virtual method get_next_candidate_for_match
3599     just decrements the counter of the records that are to be iterated over
3600     and returns the current value of the cursor 'pos' as the position of
3601     the record to be processed.
3602 
3603   RETURN VALUE
3604     pointer to the position right after the prefix of the current record
3605     in the join buffer if there is another record to iterate over,
3606     0 - otherwise.
3607 */
3608 
3609 uchar *JOIN_CACHE_BNL::get_next_candidate_for_match()
3610 {
3611   if (!rem_records)
3612     return 0;
3613   rem_records--;
3614   return pos+base_prefix_length;
3615 }
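
/*
  Illustration (not part of the server code): the BNL iteration above boils
  down to "N records remain; hand out the current cursor position and
  decrement the counter". A standalone sketch of the same prepare/next pair
  over an array of buffered record pointers:

    struct bnl_iter { unsigned rem; const unsigned char **cur; };

    static bool bnl_prepare(struct bnl_iter *it, const unsigned char **recs,
                            unsigned count, bool skip_last)
    {
      it->cur= recs;
      it->rem= count ? count - (skip_last ? 1 : 0) : 0;
      return it->rem == 0;          // TRUE: nothing to iterate over
    }

    static const unsigned char *bnl_next(struct bnl_iter *it)
    {
      if (!it->rem)
        return NULL;
      it->rem--;
      return *it->cur++;            // current record; advance the cursor
    }
*/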
3616 
3617 
3618 /*
3619   Check whether the matching record from the BNL cache is to be skipped
3620 
3621   SYNOPSIS
3622     skip_next_candidate_for_match
3623     rec_ptr  pointer to the position in the join buffer right after the prefix
3624              of the current record
3625 
3626   DESCRIPTION
3627     This implementation of the virtual function just calls the
3628     method skip_if_not_needed_match to check whether the record referenced by
3629     rec_ptr has its match flag set either to MATCH_FOUND and join_tab is the
3630     first inner table of a semi-join, or it's set to MATCH_IMPOSSIBLE and
3631     join_tab is the first inner table of an outer join.
3632     If so, the function just skips this record setting the value of the
3633     cursor 'pos' to the position right after it.
3634 
3635   RETURN VALUE
3636     TRUE   the record referenced by rec_ptr has been skipped
3637     FALSE  otherwise
3638 */
3639 
3640 bool JOIN_CACHE_BNL::skip_next_candidate_for_match(uchar *rec_ptr)
3641 {
3642   pos= rec_ptr-base_prefix_length;
3643   return skip_if_not_needed_match();
3644 }
3645 
3646 
3647 /*
3648   Read next record from the BNL join cache buffer when looking for matches
3649 
3650   SYNOPSIS
3651     read_next_candidate_for_match
3652     rec_ptr  pointer to the position in the join buffer right after the prefix
3653              the current record.
3654 
3655   DESCRIPTION
3656     This implementation of the virtual method read_next_candidate_for_match
3657     calls the method get_record to read the record referenced by rec_ptr from
3658     the join buffer into the record buffer. If this record refers to the
3659     fields in the other join buffers the call of get_record ensures that
3660     these fields are read into the corresponding record buffers as well.
3661     This function is supposed to be called after a successful call of
3662     the method get_next_candidate_for_match.
3663 
3664   RETURN VALUE
3665     none
3666 */
3667 
3668 void JOIN_CACHE_BNL::read_next_candidate_for_match(uchar *rec_ptr)
3669 {
3670   pos= rec_ptr-base_prefix_length;
3671   get_record();
3672 }
3673 
3674 
3675 /*
3676   Initialize the BNL join cache
3677 
3678   SYNOPSIS
3679     init
3680       for_explain       join buffer is initialized for explain only
3681 
3682   DESCRIPTION
3683     The function initializes the cache structure. It is supposed to be called
3684     right after a constructor for the JOIN_CACHE_BNL.
3685 
3686   NOTES
3687     The function first constructs a companion object of the type JOIN_TAB_SCAN,
3688     then it calls the init method of the parent class.
3689 
3690   RETURN VALUE
3691     0   initialization with buffer allocations has succeeded
3692     1   otherwise
3693 */
3694 
3695 int JOIN_CACHE_BNL::init(bool for_explain)
3696 {
3697   DBUG_ENTER("JOIN_CACHE_BNL::init");
3698 
3699   if (!(join_tab_scan= new JOIN_TAB_SCAN(join, join_tab)))
3700     DBUG_RETURN(1);
3701 
3702   DBUG_RETURN(JOIN_CACHE::init(for_explain));
3703 }
3704 
3705 
3706 /*
3707   Get the chain of records from buffer matching the current candidate for join
3708 
3709   SYNOPSIS
3710     get_matching_chain_by_join_key()
3711 
3712   DESCRIPTION
3713     This function first builds a join key for the record of join_tab that
3714     currently is in the join buffer for this table. Then it looks for
3715     the key entry with this key in the hash table of the join cache.
3716     If such a key entry is found the function returns the pointer to
3717     the head of the chain of records in the join_buffer that match this
3718     key.
3719 
3720   RETURN VALUE
3721     The pointer to the corresponding circular list of records if
3722     the key entry with the join key is found, 0 - otherwise.
3723 */
3724 
3725 uchar *JOIN_CACHE_BNLH::get_matching_chain_by_join_key()
3726 {
3727   uchar *key_ref_ptr;
3728   TABLE *table= join_tab->table;
3729   TABLE_REF *ref= &join_tab->ref;
3730   KEY *keyinfo= join_tab->get_keyinfo_by_key_no(ref->key);
3731   /* Build the join key value out of the record in the record buffer */
3732   key_copy(key_buff, table->record[0], keyinfo, key_length, TRUE);
3733   /* Look for this key in the join buffer */
3734   if (!key_search(key_buff, key_length, &key_ref_ptr))
3735     return 0;
3736   return key_ref_ptr+get_size_of_key_offset();
3737 }
3738 
3739 
3740 /*
3741   Prepare to iterate over the BNLH join cache buffer to look for matches
3742 
3743   SYNOPSIS
3744     prepare_look_for_matches()
3745       skip_last   <-> ignore the last record in the buffer
3746 
3747   DESCRIPTION
3748     The function prepares the join cache for an iteration over the
3749     records in the join buffer. The iteration is performed when looking
3750     for matches for the record from the joined table join_tab that
3751     has been placed into the record buffer of the joined table.
3752     If the value of the parameter skip_last is TRUE then the last
3753     record from the join buffer is ignored.
3754     The function builds the hashed key from the join fields of join_tab
3755     and uses this key to look in the hash table of the join cache for
3756     the chain of matching records in the join buffer. If it finds
3757     such a chain it sets the member last_matching_rec_ref_ptr to point to
3758     the last link of the chain while setting next_matching_rec_ref_ptr to 0.
3759 
3760   RETURN VALUE
3761     TRUE    there are no matching records in the buffer to iterate over
3762     FALSE   otherwise
3763 */
3764 
3765 bool JOIN_CACHE_BNLH::prepare_look_for_matches(bool skip_last)
3766 {
3767   uchar *curr_matching_chain;
3768   last_matching_rec_ref_ptr= next_matching_rec_ref_ptr= 0;
3769   if (!(curr_matching_chain= get_matching_chain_by_join_key()))
3770     return 1;
3771   last_matching_rec_ref_ptr= get_next_rec_ref(curr_matching_chain);
3772   return 0;
3773 }
3774 
3775 
3776 /*
3777   Get next record from the BNLH join cache buffer when looking for matches
3778 
3779   SYNOPSIS
3780     get_next_candidate_for_match
3781 
3782   DESCRIPTION
3783     This method is used for iterations over the records from the join
3784     cache buffer when looking for matches for records from join_tab.
3785     The method performs the necessary preparations to read the next record
3786     from the join buffer into the record buffer by the method
3787     read_next_candidate_for_match, or, to skip the next record from the join
3788     buffer by the method skip_next_candidate_for_match.
3789     This implementation of the virtual method moves to the next record
3790     in the chain of all records from the join buffer that are to be
3791     equi-joined with the current record from join_tab.
3792 
3793   RETURN VALUE
3794     pointer to the beginning of the record fields in the join buffer
3795     if there is another record to iterate over, 0 - otherwise.
3796 */
3797 
3798 uchar *JOIN_CACHE_BNLH::get_next_candidate_for_match()
3799 {
3800   if (next_matching_rec_ref_ptr == last_matching_rec_ref_ptr)
3801     return 0;
3802   next_matching_rec_ref_ptr= get_next_rec_ref(next_matching_rec_ref_ptr ?
3803                                                 next_matching_rec_ref_ptr :
3804                                                 last_matching_rec_ref_ptr);
3805   return next_matching_rec_ref_ptr+rec_fields_offset;
3806 }
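
/*
  Illustration (not part of the server code): walking the match chain above
  means starting at the element after 'last' (i.e. the first record of the
  chain) and stopping once 'last' itself has been handed out. A standalone
  sketch of the same cursor logic; '*cur' must be initialized to NULL before
  the first call:

    struct rec_link { struct rec_link *next; };

    static struct rec_link *chain_next(struct rec_link *last,
                                       struct rec_link **cur)
    {
      if (*cur == last)
        return NULL;                            // the whole chain has been visited
      *cur= *cur ? (*cur)->next : last->next;   // first call starts at the first record
      return *cur;
    }
*/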
3807 
3808 
3809 /*
3810   Check whether the matching record from the BNLH cache is to be skipped
3811 
3812   SYNOPSIS
3813     skip_next_candidate_for_match
3814     rec_ptr  pointer to the position in the join buffer right after
3815              the previous record
3816 
3817   DESCRIPTION
3818     This implementation of the virtual function just calls the
3819     method get_match_flag_by_pos to check whether the record referenced
3820     by rec_ptr has its match flag set to MATCH_FOUND.
3821 
3822   RETURN VALUE
3823     TRUE   the record referenced by rec_ptr has its match flag set to
3824            MATCH_FOUND
3825     FALSE  otherwise
3826 */
3827 
3828 bool JOIN_CACHE_BNLH::skip_next_candidate_for_match(uchar *rec_ptr)
3829 {
3830  return  join_tab->check_only_first_match() &&
3831           (get_match_flag_by_pos(rec_ptr) == MATCH_FOUND);
3832 }
3833 
3834 
3835 /*
3836   Read next record from the BNLH join cache buffer when looking for matches
3837 
3838   SYNOPSIS
3839     read_next_candidate_for_match
3840     rec_ptr  pointer to the position in the join buffer right after
3841              the previous record
3842 
3843   DESCRIPTION
3844     This implementation of the virtual method read_next_candidate_for_match
3845     calls the method get_record_by_pos to read the record referenced by rec_ptr
3846     from the join buffer into the record buffer. If this record refers to
3847     fields in the other join buffers the call of get_record_by_pos ensures that
3848     these fields are read into the corresponding record buffers as well.
3849     This function is supposed to be called after a successful call of
3850     the method get_next_candidate_for_match.
3851 
3852   RETURN VALUE
3853     none
3854 */
3855 
3856 void JOIN_CACHE_BNLH::read_next_candidate_for_match(uchar *rec_ptr)
3857 {
3858   get_record_by_pos(rec_ptr);
3859 }
3860 
3861 
3862 /*
3863   Initialize the BNLH join cache
3864 
3865   SYNOPSIS
3866     init
3867       for_explain       join buffer is initialized for explain only
3868 
3869   DESCRIPTION
3870     The function initializes the cache structure. It is supposed to be called
3871     right after a constructor for the JOIN_CACHE_BNLH.
3872 
3873   NOTES
3874     The function first constructs a companion object of the type JOIN_TAB_SCAN,
3875     then it calls the init method of the parent class.
3876 
3877   RETURN VALUE
3878     0   initialization with buffer allocations has succeeded
3879     1   otherwise
3880 */
3881 
3882 int JOIN_CACHE_BNLH::init(bool for_explain)
3883 {
3884   DBUG_ENTER("JOIN_CACHE_BNLH::init");
3885 
3886   if (!(join_tab_scan= new JOIN_TAB_SCAN(join, join_tab)))
3887     DBUG_RETURN(1);
3888 
3889   DBUG_RETURN(JOIN_CACHE_HASHED::init(for_explain));
3890 }
3891 
3892 
3893 /*
3894   Calculate the increment of the MRR buffer for a record write
3895 
3896   SYNOPSIS
3897     aux_buffer_incr()
3898 
3899   DESCRIPTION
3900     This implementation of the virtual function aux_buffer_incr determines
3901     by how much the size of the MRR buffer should be increased when another
3902     record is added to the cache.
3903 
3904   RETURN VALUE
3905     the increment of the size of the MRR buffer for the next record
3906 */
3907 
3908 uint JOIN_TAB_SCAN_MRR::aux_buffer_incr(size_t recno)
3909 {
3910   uint incr= 0;
3911   TABLE_REF *ref= &join_tab->ref;
3912   TABLE *tab= join_tab->table;
3913   ha_rows rec_per_key=
3914     (ha_rows) tab->key_info[ref->key].actual_rec_per_key(ref->key_parts-1);
3915   set_if_bigger(rec_per_key, 1);
3916   if (recno == 1)
3917     incr=  ref->key_length + tab->file->ref_length;
3918   incr+= (uint)(tab->file->stats.mrr_length_per_rec * rec_per_key);
3919   return incr;
3920 }
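
/*
  Worked example with hypothetical numbers: for ref->key_length= 8,
  file->ref_length= 6, mrr_length_per_rec= 10 and an estimated
  rec_per_key= 3 matching rows per key value, the first record added to the
  cache requests 8 + 6 + 10*3 = 44 bytes of MRR buffer, while every
  subsequent record requests 10*3 = 30 bytes.
*/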
3921 
3922 
3923 /*
3924   Initiate iteration over records returned by MRR for the current join buffer
3925 
3926   SYNOPSIS
3927     open()
3928 
3929   DESCRIPTION
3930     The function initiates the process of iteration over the records from
3931     join_tab returned by the MRR interface functions for records from
3932     the join buffer. Such an iteration is performed by the BKA/BKAH join
3933     algorithm for each new refill of the join buffer.
3934     The function calls the MRR handler function multi_range_read_init to
3935     initiate this process.
3936 
3937   RETURN VALUE
3938     0            the initiation is a success
3939     error code   otherwise
3940 */
3941 
3942 int JOIN_TAB_SCAN_MRR::open()
3943 {
3944   handler *file= join_tab->table->file;
3945 
3946   join_tab->table->null_row= 0;
3947 
3948 
3949   /* Dynamic range access is never used with BKA */
3950   DBUG_ASSERT(join_tab->use_quick != 2);
3951 
3952   join_tab->tracker->r_scans++;
3953   save_or_restore_used_tabs(join_tab, FALSE);
3954 
3955   init_mrr_buff();
3956 
3957   /*
3958     Prepare to iterate over keys from the join buffer and to get
3959     matching candidates obtained with MRR handler functions.
3960   */
3961   if (!file->inited)
3962     file->ha_index_init(join_tab->ref.key, 1);
3963   ranges= cache->get_number_of_ranges_for_mrr();
3964   if (!join_tab->cache_idx_cond)
3965     range_seq_funcs.skip_index_tuple= 0;
3966   return file->multi_range_read_init(&range_seq_funcs, (void*) cache,
3967                                      ranges, mrr_mode, &mrr_buff);
3968 }
3969 
3970 
3971 /*
3972   Read the next record returned by MRR for the current join buffer
3973 
3974   SYNOPSIS
3975     next()
3976 
3977   DESCRIPTION
3978     The function reads the next record from the joined table join_tab
3979     returned by the MRR handler function multi_range_read_next for
3980     the current refill of the join buffer. The record is read into
3981     the record buffer used for join_tab records in join operations.
3982 
3983   RETURN VALUE
3984     0            the next record exists and has been successfully read
3985     error code   otherwise
3986 */
3987 
3988 int JOIN_TAB_SCAN_MRR::next()
3989 {
3990   char **ptr= (char **) cache->get_curr_association_ptr();
3991 
3992   DBUG_ASSERT(sizeof(range_id_t) == sizeof(*ptr));
3993   int rc= join_tab->table->file->multi_range_read_next((range_id_t*)ptr) ? -1 : 0;
3994   if (!rc)
3995   {
3996     join_tab->tracker->r_rows++;
3997     join_tab->tracker->r_rows_after_where++;
3998     /*
3999       If a record in an incremental cache contains no fields then the
4000       association for the last record in cache will be equal to cache->end_pos
4001     */
4002     /*
4003     psergey: this makes no sense where HA_MRR_NO_ASSOC is used.
4004     DBUG_ASSERT(cache->buff <= (uchar *) (*ptr) &&
4005                 (uchar *) (*ptr) <= cache->end_pos);
4006     */
4007   }
4008   return rc;
4009 }
4010 
4011 
4012 static
4013 void bka_range_seq_key_info(void *init_params, uint *length,
4014                           key_part_map *map)
4015 {
4016   TABLE_REF *ref= &(((JOIN_CACHE*)init_params)->join_tab->ref);
4017   *length= ref->key_length;
4018   *map= (key_part_map(1) << ref->key_parts) - 1;
4019 }
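
/*
  Example: for ref->key_parts == 3 the expression above yields
  (1 << 3) - 1 == 7 (binary 111), i.e. a key_part_map covering the first
  three key parts of the index used for the lookups.
*/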
4020 
4021 
4022 /*
4023 Initialize retrieval of range sequence for BKA join algorithm
4024 
4025 SYNOPSIS
4026   bka_range_seq_init()
4027    init_params   pointer to the BKA join cache object
4028    n_ranges      the number of ranges obtained
4029    flags         combination of MRR flags
4030 
4031 DESCRIPTION
4032   The function interprets init_param as a pointer to a JOIN_CACHE_BKA
4033   object. The function prepares for an iteration over the join keys
4034   built for all records from the cache join buffer.
4035 
4036 NOTE
4037   This function is used only as a callback function.
4038 
4039 RETURN VALUE
4040   init_param value that is to be used as a parameter of bka_range_seq_next()
4041 */
4042 
4043 static
4044 range_seq_t bka_range_seq_init(void *init_param, uint n_ranges, uint flags)
4045 {
4046   DBUG_ENTER("bka_range_seq_init");
4047   JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) init_param;
4048   cache->reset(0);
4049   DBUG_RETURN((range_seq_t) init_param);
4050 }
4051 
4052 
4053 /*
4054 Get the next range/key over records from the join buffer used by a BKA cache
4055 
4056 SYNOPSIS
4057   bka_range_seq_next()
4058     seq        the value returned by  bka_range_seq_init
4059     range  OUT reference to the next range
4060 
4061 DESCRIPTION
4062   The function interprets seq as a pointer to a JOIN_CACHE_BKA
4063   object. The function returns a pointer to the range descriptor
4064   for the key built over the next record from the join buffer.
4065 
4066 NOTE
4067   This function is used only as a callback function.
4068 
4069 RETURN VALUE
4070   FALSE   ok, the range structure filled with info about the next range/key
4071   TRUE    no more ranges
4072 */
4073 
4074 static
4075 bool bka_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
4076 {
4077   DBUG_ENTER("bka_range_seq_next");
4078   JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) rseq;
4079   TABLE_REF *ref= &cache->join_tab->ref;
4080   key_range *start_key= &range->start_key;
4081   if ((start_key->length= cache->get_next_key((uchar **) &start_key->key)))
4082   {
4083     start_key->keypart_map= (1 << ref->key_parts) - 1;
4084     start_key->flag= HA_READ_KEY_EXACT;
4085     range->end_key= *start_key;
4086     range->end_key.flag= HA_READ_AFTER_KEY;
4087     range->ptr= (char *) cache->get_curr_rec();
4088     range->range_flag= EQ_RANGE;
4089     DBUG_RETURN(0);
4090   }
4091   DBUG_RETURN(1);
4092 }
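
/*
  Illustration (not part of the server code): a simplified sketch of how an
  MRR implementation could drive the range-sequence callbacks set up for
  this cache; it is not the actual handler code. The sequence is opened once
  per refill of the join buffer and then asked for ranges until the next()
  callback reports that no more keys are left:

    static void drive_range_sequence(RANGE_SEQ_IF *seq_if,
                                     void *seq_init_param,
                                     uint n_ranges, uint mrr_flags)
    {
      KEY_MULTI_RANGE range;
      range_seq_t seq= seq_if->init(seq_init_param, n_ranges, mrr_flags);
      while (!seq_if->next(seq, &range))
      {
        // ... perform an index lookup with range.start_key and keep
        // range.ptr so that the rows found can be associated with the
        // cached record that produced this key ...
      }
    }
*/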
4093 
4094 
4095 /*
4096 Check whether range_info orders to skip the next record from BKA buffer
4097 
4098 SYNOPSIS
4099   bka_range_seq_skip_record()
4100     seq              value returned by bka_range_seq_init()
4101     range_info       information about the next range
4102     rowid [NOT USED] rowid of the record to be checked
4103 
4104 
4105 DESCRIPTION
4106   The function interprets seq as a pointer to a JOIN_CACHE_BKA object.
4107   The function returns TRUE if the record with this range_info
4108   is to be filtered out from the stream of records returned by
4109   multi_range_read_next().
4110 
4111 NOTE
4112   This function is used only as a callback function.
4113 
4114 RETURN VALUE
4115   1    record with this range_info is to be filtered out from the stream
4116        of records returned by multi_range_read_next()
4117   0    the record is to be left in the stream
4118 */
4119 
4120 static
4121 bool bka_range_seq_skip_record(range_seq_t rseq, range_id_t range_info, uchar *rowid)
4122 {
4123   DBUG_ENTER("bka_range_seq_skip_record");
4124   JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) rseq;
4125   bool res= cache->get_match_flag_by_pos((uchar *) range_info) ==
4126     JOIN_CACHE::MATCH_FOUND;
4127   DBUG_RETURN(res);
4128 }
4129 
4130 
4131 /*
4132 Check if the record combination from BKA cache matches the index condition
4133 
4134 SYNOPSIS
4135   bka_skip_index_tuple()
4136     rseq             value returned by bka_range_seq_init()
4137     range_info       record chain for the next range/key returned by MRR
4138 
4139 DESCRIPTION
4140   This is a wrapper for the JOIN_CACHE_BKA::skip_index_tuple method,
4141   see comments there.
4142 
4143 NOTE
4144   This function is used as a RANGE_SEQ_IF::skip_index_tuple callback.
4145 
4146 RETURN VALUE
4147   0    The record combination satisfies the index condition
4148   1    Otherwise
4149 */
4150 
4151 static
4152 bool bka_skip_index_tuple(range_seq_t rseq, range_id_t range_info)
4153 {
4154   DBUG_ENTER("bka_skip_index_tuple");
4155   JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) rseq;
4156   THD *thd= cache->thd();
4157   bool res;
4158   status_var_increment(thd->status_var.ha_icp_attempts);
4159   if (!(res= cache->skip_index_tuple(range_info)))
4160     status_var_increment(thd->status_var.ha_icp_match);
4161   DBUG_RETURN(res);
4162 }
4163 
4164 
4165 /*
4166 Prepare to read the record from BKA cache matching the current joined record
4167 
4168 SYNOPSIS
4169   prepare_look_for_matches()
4170     skip_last <-> ignore the last record in the buffer (always unused here)
4171 
4172 DESCRIPTION
4173   The function prepares to iterate over records in the join cache buffer
4174   matching the record loaded into the record buffer for join_tab when
4175   performing join operation by BKA join algorithm. With BKA algorithms the
4176   record loaded into the record buffer for join_tab always has a direct
4177   reference to the matching records from the join buffer. When the regular
4178   BKA join algorithm is employed the record from join_tab can refer to
4179   only one such record.
4180   The function sets the counter of the remaining records from the cache
4181   buffer that would match the current join_tab record to 1.
4182 
4183 RETURN VALUE
4184   TRUE    there are no records in the buffer to iterate over
4185   FALSE   otherwise
4186 */
4187 
4188 bool JOIN_CACHE_BKA::prepare_look_for_matches(bool skip_last)
4189 {
4190   if (!records)
4191     return TRUE;
4192   rem_records= 1;
4193   return FALSE;
4194 }
4195 
4196 
4197 /*
4198 Get the record from the BKA cache matching the current joined record
4199 
4200 SYNOPSIS
4201   get_next_candidate_for_match
4202 
4203 DESCRIPTION
4204   This method is used for iterations over the records from the join
4205   cache buffer when looking for matches for records from join_tab.
4206   The method performs the necessary preparations to read the next record
4207   from the join buffer into the record buffer by the method
4208   read_next_candidate_for_match, or, to skip the next record from the join
4209   buffer by the method skip_if_not_needed_match.
4210   This implementation of the virtual method get_next_candidate_for_match
4211   just decrements the counter of the records that are to be iterated over
4212   and returns the value of curr_association as a reference to the position
4213   of the beginning of the record fields in the buffer.
4214 
4215 RETURN VALUE
4216   pointer to the start of the record fields in the join buffer
4217   if there is another record to iterate over, 0 - otherwise.
4218 */
4219 
4220 uchar *JOIN_CACHE_BKA::get_next_candidate_for_match()
4221 {
4222   if (!rem_records)
4223     return 0;
4224   rem_records--;
4225   return curr_association;
4226 }
4227 
4228 
4229 /*
4230 Check whether the matching record from the BKA cache is to be skipped
4231 
4232 SYNOPSIS
4233   skip_next_candidate_for_match
4234   rec_ptr  pointer to the position in the join buffer right after
4235            the previous record
4236 
4237 DESCRIPTION
4238   This implementation of the virtual function just calls the
4239   method get_match_flag_by_pos to check whether the record referenced
4240   by rec_ptr has its match flag set to MATCH_FOUND.
4241 
4242 RETURN VALUE
4243   TRUE   the record referenced by rec_ptr has its match flag set to
4244          MATCH_FOUND
4245   FALSE  otherwise
4246 */
4247 
4248 bool JOIN_CACHE_BKA::skip_next_candidate_for_match(uchar *rec_ptr)
4249 {
4250   return join_tab->check_only_first_match() &&
4251     (get_match_flag_by_pos(rec_ptr) == MATCH_FOUND);
4252 }
4253 
4254 
4255 /*
4256 Read the next record from the BKA join cache buffer when looking for matches
4257 
4258 SYNOPSIS
4259   read_next_candidate_for_match
4260   rec_ptr  pointer to the position in the join buffer right after
4261            the previous record
4262 
4263 DESCRIPTION
4264   This implementation of the virtual method read_next_candidate_for_match
4265   calls the method get_record_by_pos to read the record referenced by rec_ptr
4266   from the join buffer into the record buffer. If this record refers to
4267   fields in the other join buffers the call of get_record_by_pos ensures that
4268   these fields are read into the corresponding record buffers as well.
4269   This function is supposed to be called after a successful call of
4270   the method get_next_candidate_for_match.
4271 
4272 RETURN VALUE
4273   none
4274 */
4275 
4276 void JOIN_CACHE_BKA::read_next_candidate_for_match(uchar *rec_ptr)
4277 {
4278   get_record_by_pos(rec_ptr);
4279 }
4280 
4281 
4282 /*
4283 Initialize the BKA join cache
4284 
4285 SYNOPSIS
4286   init
4287     for_explain       join buffer is initialized for explain only
4288 
4289 
4290 DESCRIPTION
4291   The function initializes the cache structure. It is supposed to be called
4292   right after a constructor for the JOIN_CACHE_BKA.
4293 
4294 NOTES
4295   The function first constructs a companion object of the type
4296   JOIN_TAB_SCAN_MRR, then it calls the init method of the parent class.
4297 
4298 RETURN VALUE
4299   0   initialization with buffer allocations has succeeded
4300   1   otherwise
4301 */
4302 
4303 int JOIN_CACHE_BKA::init(bool for_explain)
4304 {
4305   int res;
4306   bool check_only_first_match= join_tab->check_only_first_match();
4307 
4308   RANGE_SEQ_IF rs_funcs= { bka_range_seq_key_info,
4309     bka_range_seq_init,
4310     bka_range_seq_next,
4311     check_only_first_match ?  bka_range_seq_skip_record : 0,
4312     bka_skip_index_tuple };
4313 
4314   DBUG_ENTER("JOIN_CACHE_BKA::init");
4315 
4316   JOIN_TAB_SCAN_MRR *jsm;
4317   if (!(join_tab_scan= jsm= new JOIN_TAB_SCAN_MRR(join, join_tab,
4318                                                   mrr_mode, rs_funcs)))
4319     DBUG_RETURN(1);
4320 
4321   if ((res= JOIN_CACHE::init(for_explain)))
4322     DBUG_RETURN(res);
4323 
4324   if (use_emb_key)
4325     jsm->mrr_mode |= HA_MRR_MATERIALIZED_KEYS;
4326 
4327   DBUG_RETURN(0);
4328 }
4329 
4330 
4331 /*
4332 Get the key built over the next record from BKA join buffer
4333 
4334 SYNOPSIS
4335   get_next_key()
4336     key    pointer to the buffer where the key value is to be placed
4337 
4338 DESCRIPTION
4339   The function reads key fields from the current record in the join buffer.
4340   and builds the key value out of these fields that will be used to access
4341   the 'join_tab' table. Some of key fields may belong to previous caches.
4342   They are accessed via record references to the record parts stored in the
4343   previous join buffers. The other key fields always are placed right after
4344   the flag fields of the record.
4345   If the key is embedded, which means that its value can be read directly
4346   from the join buffer, then *key is set to the beginning of the key in
4347   this buffer. Otherwise the key is built in the join_tab->ref->key_buff.
4348   The function returns the length of the key if it succeeds to read it.
4349   It is assumed that the function starts reading at the position of
4350   the record length which is provided for each record in a BKA cache.
4351   After the key is built the 'pos' value points to the first position after
4352   the current record.
4353   The function just skips the records with MATCH_IMPOSSIBLE in the
4354   match flag field if there is any.
4355   The function returns 0 if the initial position is after the beginning
4356   of the record fields for the last record from the join buffer.
4357 
4358 RETURN VALUE
4359   length of the key value - if the starting value of 'pos' points to
4360   the position before the fields for the last record,
4361   0 - otherwise.
4362 */
4363 
4364 uint JOIN_CACHE_BKA::get_next_key(uchar ** key)
4365 {
4366   uint len;
4367   uint32 rec_len;
4368   uchar *init_pos;
4369   JOIN_CACHE *cache;
4370 
4371 start:
4372 
4373   /* Any record in a BKA cache is prepended with its length */
4374   DBUG_ASSERT(with_length);
4375 
4376   if ((pos+size_of_rec_len) > last_rec_pos || !records)
4377     return 0;
4378 
4379   /* Read the length of the record */
4380   rec_len= get_rec_length(pos);
4381   pos+= size_of_rec_len;
4382   init_pos= pos;
4383 
4384   /* Read a reference to the previous cache if any */
4385   if (prev_cache)
4386     pos+= prev_cache->get_size_of_rec_offset();
4387 
4388   curr_rec_pos= pos;
4389 
4390   /* Read all flag fields of the record */
4391   read_flag_fields();
4392 
4393   if (with_match_flag &&
4394       (Match_flag) curr_rec_pos[0] == MATCH_IMPOSSIBLE )
4395   {
4396     pos= init_pos+rec_len;
4397     goto start;
4398   }
4399 
4400   if (use_emb_key)
4401   {
4402     /* An embedded key is taken directly from the join buffer */
4403     *key= pos;
4404     len= emb_key_length;
4405   }
4406   else
4407   {
4408     /* Read key arguments from previous caches if there are any such fields */
4409     if (external_key_arg_fields)
4410     {
4411       uchar *rec_ptr= curr_rec_pos;
4412       uint key_arg_count= external_key_arg_fields;
4413       CACHE_FIELD **copy_ptr= blob_ptr-key_arg_count;
4414       for (cache= prev_cache; key_arg_count; cache= cache->prev_cache)
4415       {
4416         uint len= 0;
4417         DBUG_ASSERT(cache);
4418         rec_ptr= cache->get_rec_ref(rec_ptr);
4419         while (!cache->referenced_fields)
4420         {
4421           cache= cache->prev_cache;
4422           DBUG_ASSERT(cache);
4423           rec_ptr= cache->get_rec_ref(rec_ptr);
4424         }
4425         while (key_arg_count &&
4426                cache->read_referenced_field(*copy_ptr, rec_ptr, &len))
4427         {
4428           copy_ptr++;
4429           --key_arg_count;
4430         }
4431       }
4432     }
4433 
4434     /*
4435       Read the other key arguments from the current record. The fields for
4436       these arguments are always first in the sequence of the record's fields.
4437     */
4438     CACHE_FIELD *copy= field_descr+flag_fields;
4439     CACHE_FIELD *copy_end= copy+local_key_arg_fields;
4440     bool blob_in_rec_buff= blob_data_is_in_rec_buff(curr_rec_pos);
4441     for ( ; copy < copy_end; copy++)
4442       read_record_field(copy, blob_in_rec_buff);
4443 
4444     /* Build the key over the fields read into the record buffers */
4445     TABLE_REF *ref= &join_tab->ref;
4446     cp_buffer_from_ref(join->thd, join_tab->table, ref);
4447     *key= ref->key_buff;
4448     len= ref->key_length;
4449   }
4450 
4451   pos= init_pos+rec_len;
4452 
4453   return len;
4454 }
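
/*
  Illustration (not part of the server code): apart from the key
  construction, the iteration above is a walk over length-prefixed records
  packed into a byte buffer. A standalone sketch of that walk, assuming a
  fixed-size unsigned length prefix in front of every record:

    #include <string.h>

    static const unsigned char *
    next_prefixed_rec(const unsigned char **pos, const unsigned char *end,
                      unsigned *rec_len)
    {
      unsigned len;
      const unsigned char *rec;
      if (*pos + sizeof(len) > end)
        return NULL;                // no complete length prefix is left
      memcpy(&len, *pos, sizeof(len));
      rec= *pos + sizeof(len);
      *pos= rec + len;              // advance to the next record
      *rec_len= len;
      return rec;
    }
*/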
4455 
4456 
4457 /*
4458 Check the index condition of the joined table for a record from the BKA cache
4459 
4460 SYNOPSIS
4461   skip_index_tuple()
4462     range_info       pointer to the record returned by MRR
4463 
4464 DESCRIPTION
4465   This function is invoked from MRR implementation to check if an index
4466   tuple matches the index condition. It is used in the case where the index
4467   condition actually depends on both columns of the used index and columns
4468   from previous tables.
4469 
4470 NOTES
4471   Accessing columns of the previous tables requires special handling with
4472   BKA. The idea of BKA is to collect record combinations in a buffer and
4473   then do a batch of ref access lookups, i.e. by the time we're doing a
4474   lookup its previous-records-combination is not in prev_table->record[0]
4475   but somewhere in the join buffer.
4476   We need to get it from there back into prev_table(s)->record[0] before we
4477   can evaluate the index condition, and that's why we need this function
4478   instead of regular IndexConditionPushdown.
4479 
4480 NOTES
4481   Possible optimization:
4482   Before we unpack the record from a previous table
4483   check if this table is used in the condition.
4484   If so then unpack the record otherwise skip the unpacking.
4485   This should be done by a special virtual method
4486   get_partial_record_by_pos().
4487 
4488 RETURN VALUE
4489   1    the record combination does not satisfy the index condition
4490   0    otherwise
4491 */
4492 
4493 bool JOIN_CACHE_BKA::skip_index_tuple(range_id_t range_info)
4494 {
4495   DBUG_ENTER("JOIN_CACHE_BKA::skip_index_tuple");
4496   get_record_by_pos((uchar*)range_info);
4497   DBUG_RETURN(!join_tab->cache_idx_cond->val_int());
4498 }
4499 
4500 
4501 
4502 /*
4503 Initialize retrieval of range sequence for the BKAH join algorithm
4504 
4505 SYNOPSIS
4506   bkah_range_seq_init()
4507     init_params   pointer to the BKAH join cache object
4508     n_ranges      the number of ranges obtained
4509     flags         combination of MRR flags
4510 
4511 DESCRIPTION
4512   The function interprets init_param as a pointer to a JOIN_CACHE_BKAH
4513   object. The function prepares for an iteration over distinct join keys
4514   built over the records from the cache join buffer.
4515 
4516 NOTE
4517   This function is used only as a callback function.
4518 
4519 RETURN VALUE
4520   init_param    value that is to be used as a parameter of
4521                 bkah_range_seq_next()
4522 */
4523 
4524 static
4525 range_seq_t bkah_range_seq_init(void *init_param, uint n_ranges, uint flags)
4526 {
4527   DBUG_ENTER("bkah_range_seq_init");
4528   JOIN_CACHE_BKAH *cache= (JOIN_CACHE_BKAH *) init_param;
4529   cache->reset(0);
4530   DBUG_RETURN((range_seq_t) init_param);
4531 }
4532 
4533 
4534 /*
4535 Get the next range/key over records from the join buffer of a BKAH cache
4536 
4537 SYNOPSIS
4538   bkah_range_seq_next()
4539     seq        value returned by  bkah_range_seq_init()
4540     range  OUT reference to the next range
4541 
4542 DESCRIPTION
4543   The function interprets seq as a pointer to a JOIN_CACHE_BKAH
4544   object. The function returns a pointer to the range descriptor
4545   for the next unique key built over records from the join buffer.
4546 
4547 NOTE
4548   This function is used only as a callback function.
4549 
4550 RETURN VALUE
4551   FALSE  ok, the range structure filled with info about the next range/key
4552   TRUE   no more ranges
4553 */
4554 
4555 static
4556 bool bkah_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
4557 {
4558   DBUG_ENTER("bkah_range_seq_next");
4559   JOIN_CACHE_BKAH *cache= (JOIN_CACHE_BKAH *) rseq;
4560   TABLE_REF *ref= &cache->join_tab->ref;
4561   key_range *start_key= &range->start_key;
4562   if ((start_key->length= cache->get_next_key((uchar **) &start_key->key)))
4563   {
4564     start_key->keypart_map= (1 << ref->key_parts) - 1;
4565     start_key->flag= HA_READ_KEY_EXACT;
4566     range->end_key= *start_key;
4567     range->end_key.flag= HA_READ_AFTER_KEY;
4568     range->ptr= (char *) cache->get_curr_key_chain();
4569     range->range_flag= EQ_RANGE;
4570     DBUG_RETURN(0);
4571   }
4572   DBUG_RETURN(1);
4573 }
4574 
4575 
4576 /*
4577 Check whether range_info orders to skip the next record from BKAH join buffer
4578 
4579 SYNOPSIS
4580   bkah_range_seq_skip_record()
4581     seq              value returned by bkah_range_seq_init()
4582     range_info       information about the next range/key returned by MRR
4583     rowid [NOT USED] rowid of the record to be checked (not used)
4584 
4585 DESCRIPTION
4586   The function interprets seq as a pointer to a JOIN_CACHE_BKAH
4587   object. The function returns TRUE if the record with this range_info
4588   is to be filtered out from the stream of records returned by
4589   multi_range_read_next().
4590 
4591 NOTE
4592   This function is used only as a callback function.
4593 
4594 RETURN VALUE
4595   1    record with this range_info is to be filtered out from the stream
4596        of records returned by multi_range_read_next()
4597   0    the record is to be left in the stream
4598 */
4599 
4600 static
4601 bool bkah_range_seq_skip_record(range_seq_t rseq, range_id_t range_info, uchar *rowid)
4602 {
4603   DBUG_ENTER("bkah_range_seq_skip_record");
4604   JOIN_CACHE_BKAH *cache= (JOIN_CACHE_BKAH *) rseq;
4605   bool res= cache->check_all_match_flags_for_key((uchar *) range_info);
4606   DBUG_RETURN(res);
4607 }
4608 
4609 
4610 /*
4611 Check if the record combination from BKAH cache matches the index condition
4612 
4613 SYNOPSIS
4614   bkah_skip_index_tuple()
4615     rseq             value returned by bka_range_seq_init()
4616     range_info       record chain for the next range/key returned by MRR
4617 
4618 DESCRIPTION
4619   This is a wrapper for the JOIN_CACHE_BKAH::skip_index_tuple method,
4620   see comments there.
4621 
4622 NOTE
4623   This function is used as a RANGE_SEQ_IF::skip_index_tuple callback.
4624 
4625 RETURN VALUE
4626   0    some records from the chain satisfy the index condition
4627   1    otherwise
4628 */
4629 
4630 static
4631 bool bkah_skip_index_tuple(range_seq_t rseq, range_id_t range_info)
4632 {
4633   DBUG_ENTER("bka_unique_skip_index_tuple");
4634   JOIN_CACHE_BKAH *cache= (JOIN_CACHE_BKAH *) rseq;
4635   THD *thd= cache->thd();
4636   bool res;
4637   status_var_increment(thd->status_var.ha_icp_attempts);
4638   if (!(res= cache->skip_index_tuple(range_info)))
4639     status_var_increment(thd->status_var.ha_icp_match);
4640   DBUG_RETURN(res);
4641 }
4642 
4643 
4644 /*
4645 Prepare to read record from BKAH cache matching the current joined record
4646 
4647 SYNOPSIS
4648   prepare_look_for_matches()
4649     skip_last <-> ignore the last record in the buffer (always unused here)
4650 
4651 DESCRIPTION
4652   The function prepares to iterate over records in the join cache buffer
4653   matching the record loaded into the record buffer for join_tab when
4654   performing join operation by BKAH join algorithm. With BKAH algorithm, if
4655   association labels are used, then the record loaded into the record buffer
4656   for join_tab always has a direct reference to the chain of the matching
4657   records from the join buffer. If association labels are not used then
4658   the chain of the matching records is obtained by the call of the
4659   get_matching_chain_by_join_key function.
4660 
4661 RETURN VALUE
4662   TRUE    there are no records in the buffer to iterate over
4663   FALSE   otherwise
4664 */
4665 
4666 bool JOIN_CACHE_BKAH::prepare_look_for_matches(bool skip_last)
4667 {
4668   last_matching_rec_ref_ptr= next_matching_rec_ref_ptr= 0;
4669   if (no_association &&
4670       !(curr_matching_chain= get_matching_chain_by_join_key())) //psergey: added '!'
4671     return 1;
4672   last_matching_rec_ref_ptr= get_next_rec_ref(curr_matching_chain);
4673   return 0;
4674 }
4675 
4676 /*
4677   Initialize the BKAH join cache
4678 
4679   SYNOPSIS
4680     init
4681       for_explain       join buffer is initialized for explain only
4682 
4683   DESCRIPTION
4684     The function initializes the cache structure. It is supposed to be called
4685     right after a constructor for the JOIN_CACHE_BKAH.
4686 
4687   NOTES
4688     The function first constructs a companion object of the type
4689     JOIN_TAB_SCAN_MRR, then it calls the init method of the parent class.
4690 
4691   RETURN VALUE
4692     0   initialization with buffer allocations has succeeded
4693     1   otherwise
4694 */
4695 
4696 int JOIN_CACHE_BKAH::init(bool for_explain)
4697 {
4698   bool check_only_first_match= join_tab->check_only_first_match();
4699 
4700   no_association= MY_TEST(mrr_mode & HA_MRR_NO_ASSOCIATION);
4701 
4702   RANGE_SEQ_IF rs_funcs= { bka_range_seq_key_info,
4703                            bkah_range_seq_init,
4704                            bkah_range_seq_next,
4705                            check_only_first_match && !no_association ?
4706                              bkah_range_seq_skip_record : 0,
4707                            bkah_skip_index_tuple };
4708 
4709   DBUG_ENTER("JOIN_CACHE_BKAH::init");
4710 
4711   if (!(join_tab_scan= new JOIN_TAB_SCAN_MRR(join, join_tab,
4712                                              mrr_mode, rs_funcs)))
4713     DBUG_RETURN(1);
4714 
4715   DBUG_RETURN(JOIN_CACHE_HASHED::init(for_explain));
4716 }
4717 
4718 
4719 /*
4720   Check the index condition of the joined table for a record from the BKA cache
4721 
4722   SYNOPSIS
4723     skip_index_tuple()
4724       range_info       record chain returned by MRR
4725 
4726   DESCRIPTION
4727     See JOIN_CACHE_BKA::skip_index_tuple().
4728     This function is the variant for use with the class JOIN_CACHE_BKAH.
4729     The difference from JOIN_CACHE_BKA case is that there may be multiple
4730     previous table record combinations that share the same key(MRR range).
4731     As a consequence, we need to loop through the chain of all table record
4732     combinations that match the given MRR range key range_info until we find
4733     one that satisfies the index condition.
4734 
4735   NOTE
4736     Possible optimization:
4737     Before we unpack the record from a previous table
4738     check if this table is used in the condition.
4739     If so then unpack the record otherwise skip the unpacking.
4740     This should be done by a special virtual method
4741     get_partial_record_by_pos().
4742 
4743   RETURN VALUE
4744     1    none of the record combinations from the chain referred to by
4745          range_info satisfies the index condition
4746     0    otherwise
4747 
4748 
4749 */
4750 
4751 bool JOIN_CACHE_BKAH::skip_index_tuple(range_id_t range_info)
4752 {
4753   uchar *last_rec_ref_ptr= get_next_rec_ref((uchar*) range_info);
4754   uchar *next_rec_ref_ptr= last_rec_ref_ptr;
4755   DBUG_ENTER("JOIN_CACHE_BKAH::skip_index_tuple");
4756   do
4757   {
4758     next_rec_ref_ptr= get_next_rec_ref(next_rec_ref_ptr);
4759     uchar *rec_ptr= next_rec_ref_ptr + rec_fields_offset;
4760     get_record_by_pos(rec_ptr);
4761     if (join_tab->cache_idx_cond->val_int())
4762       DBUG_RETURN(FALSE);
4763   } while(next_rec_ref_ptr != last_rec_ref_ptr);
4764   DBUG_RETURN(TRUE);
4765 }
4766