1 /* Copyright (C) 2000-2006 MySQL AB
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; version 2 of the License.
6 
7    This program is distributed in the hope that it will be useful,
8    but WITHOUT ANY WARRANTY; without even the implied warranty of
9    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10    GNU General Public License for more details.
11 
12    You should have received a copy of the GNU General Public License
13    along with this program; if not, write to the Free Software
14    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
15 
16 /**
17   @file
18 
19   @brief
20   join cache optimizations
21 
22   @defgroup Query_Optimizer  Query Optimizer
23   @{
24 */
25 
26 #ifdef USE_PRAGMA_IMPLEMENTATION
27 #pragma implementation				// gcc: Class implementation
28 #endif
29 
30 #include "mariadb.h"
31 #include "key.h"
32 #include "sql_base.h"
33 #include "sql_select.h"
34 #include "opt_subselect.h"
35 
36 #define NO_MORE_RECORDS_IN_BUFFER  (uint)(-1)
37 
38 static void save_or_restore_used_tabs(JOIN_TAB *join_tab, bool save);
39 
40 /*****************************************************************************
41  *  Join cache module
42 ******************************************************************************/
43 
44 /*
45   Fill in the descriptor of a flag field associated with a join cache
46 
47   SYNOPSIS
48     add_field_flag_to_join_cache()
49       str           position in a record buffer to copy the field from/to
50       length        length of the field
51       field  IN/OUT pointer to the field descriptor to fill in
52 
53   DESCRIPTION
54     The function fill in the descriptor of a cache flag field to which
55     the parameter 'field' points to. The function uses the first two
56     parameters to set the position in the record buffer from/to which
57     the field value is to be copied and the length of the copied fragment.
58     Before returning the result the function increments the value of
59     *field by 1.
60     The function ignores the fields 'blob_length' and 'ofset' of the
61     descriptor.
62 
63   RETURN VALUE
64     the length of the field
65 */
66 
67 static
68 uint add_flag_field_to_join_cache(uchar *str, uint length, CACHE_FIELD **field)
69 {
70   CACHE_FIELD *copy= *field;
71   copy->str= str;
72   copy->length= length;
73   copy->type= 0;
74   copy->field= 0;
75   copy->referenced_field_no= 0;
76   (*field)++;
77   return length;
78 }
79 
80 
81 /*
82   Fill in the descriptors of table data fields associated with a join cache
83 
84   SYNOPSIS
85     add_table_data_fields_to_join_cache()
86       tab              descriptors of fields from this table are to be filled
87       field_set        descriptors for only these fields are to be created
88       field_cnt IN/OUT     counter of data fields
89       descr  IN/OUT        pointer to the first descriptor to be filled
90       field_ptr_cnt IN/OUT counter of pointers to the data fields
91       descr_ptr IN/OUT     pointer to the first pointer to blob descriptors
92 
93   DESCRIPTION
94     The function fills in the descriptors of cache data fields from the table
95     'tab'. The descriptors are filled only for the fields marked in the
96     bitmap 'field_set'.
97     The function fills the descriptors starting from the position pointed
98     by 'descr'. If an added field is of a BLOB type then a pointer to the
99     its descriptor is added to the array descr_ptr.
100     At the return 'descr' points to the position after the last added
101     descriptor  while 'descr_ptr' points to the position right after the
102     last added pointer.
103 
104   RETURN VALUE
105     the total length of the added fields
106 */
107 
108 static
109 uint add_table_data_fields_to_join_cache(JOIN_TAB *tab,
110                                          MY_BITMAP *field_set,
111                                          uint *field_cnt,
112                                          CACHE_FIELD **descr,
113                                          uint *field_ptr_cnt,
114                                          CACHE_FIELD ***descr_ptr)
115 {
116   Field **fld_ptr;
117   uint len= 0;
118   CACHE_FIELD *copy= *descr;
119   CACHE_FIELD **copy_ptr= *descr_ptr;
120   uint used_fields= bitmap_bits_set(field_set);
121   for (fld_ptr= tab->table->field; used_fields; fld_ptr++)
122   {
123     if (bitmap_is_set(field_set, (*fld_ptr)->field_index))
124     {
125       len+= (*fld_ptr)->fill_cache_field(copy);
126       if (copy->type == CACHE_BLOB)
127       {
128         *copy_ptr= copy;
129         copy_ptr++;
130         (*field_ptr_cnt)++;
131       }
132       copy->field= *fld_ptr;
133       copy->referenced_field_no= 0;
134       copy++;
135       (*field_cnt)++;
136       used_fields--;
137     }
138   }
139   *descr= copy;
140   *descr_ptr= copy_ptr;
141   return len;
142 }
143 
144 /*
145   Determine different counters of fields associated with a record in the cache
146 
147   SYNOPSIS
148     calc_record_fields()
149 
150   DESCRIPTION
151     The function counts the number of total fields stored in a record
152     of the cache and saves this number in the 'fields' member. It also
153     determines the number of flag fields and the number of blobs.
154     The function sets 'with_match_flag' on if 'join_tab' needs a match flag
155     i.e. if it is the first inner table of an outer join or a semi-join.
156 
157   RETURN VALUE
158     none
159 */
160 
161 void JOIN_CACHE::calc_record_fields()
162 {
163   JOIN_TAB *tab;
164 
165   if (prev_cache)
166     tab= prev_cache->join_tab;
167   else
168   {
169     if (join_tab->bush_root_tab)
170     {
171       /*
172         --ot1--SJM1--------------ot2--...
173                 |
174                 |
175                 +-it1--...--itN
176                         ^____________ this->join_tab is somewhere here,
177                                       inside an sjm nest.
178 
179         The join buffer should store the values of it1.*, it2.*, ..
180         It should not store values of ot1.*.
181       */
182       tab= join_tab->bush_root_tab->bush_children->start;
183     }
184     else
185     {
186       /*
187         -ot1--ot2--SJM1--SJM2--------------ot3--...--otN
188                     |     |                      ^
189                     |     +-it21--...--it2N      |
190                     |                            \-- we're somewhere here,
191                     +-it11--...--it1N                at the top level
192 
193         The join buffer should store the values of
194 
195           ot1.*, ot2.*, it1{i}, it2{j}.*, ot3.*, ...
196 
197         that is, we should start from the first non-const top-level table.
198 
199         We will need to store columns of SJ-inner tables (it_X_Y.*), but we're
200         not interested in storing the columns of materialization tables
201         themselves. Beause of that, if the first non-const top-level table is a
202         materialized table, we move to its bush_children:
203       */
204       tab= join->join_tab + join->const_tables;
205       if (tab->bush_children)
206         tab= tab->bush_children->start;
207     }
208   }
209   DBUG_ASSERT(!tab->bush_children);
210 
211   start_tab= tab;
212   fields= 0;
213   blobs= 0;
214   flag_fields= 0;
215   data_field_count= 0;
216   data_field_ptr_count= 0;
217   referenced_fields= 0;
218 
219   /*
220     The following loop will get inside SJM nests, because data may be unpacked
221     to sjm-inner tables.
222   */
223   for (; tab != join_tab ; tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
224   {
225     tab->calc_used_field_length(FALSE);
226     flag_fields+= MY_TEST(tab->used_null_fields || tab->used_uneven_bit_fields);
227     flag_fields+= MY_TEST(tab->table->maybe_null);
228     fields+= tab->used_fields;
229     blobs+= tab->used_blobs;
230   }
231   if ((with_match_flag= join_tab->use_match_flag()))
232     flag_fields++;
233   fields+= flag_fields;
234 }
235 
236 
237 /*
238   Collect information on join key arguments
239 
240   SYNOPSIS
241     collect_info_on_key_args()
242 
243   DESCRIPTION
244     The function traverses the ref expressions that are used to access the
245     joined table join_tab. For each table 'tab' whose fields are to be stored
246     in the join buffer of the cache the function finds the fields from 'tab'
247     that occur in the ref expressions and marks these fields in the bitmap
248     tab->table->tmp_set. The function counts the number of them stored
249     in this cache and the total number of them stored in the previous caches
250     and saves the results of the counting in 'local_key_arg_fields' and
251     'external_key_arg_fields' respectively.
252 
253   NOTES
254     The function does not do anything if no key is used to join the records
255     from join_tab.
256 
257   RETURN VALUE
258     none
259 */
260 
261 void JOIN_CACHE::collect_info_on_key_args()
262 {
263   JOIN_TAB *tab;
264   JOIN_CACHE *cache;
265   local_key_arg_fields= 0;
266   external_key_arg_fields= 0;
267 
268   if (!is_key_access())
269     return;
270 
271   TABLE_REF *ref= &join_tab->ref;
272   cache= this;
273   do
274   {
275     for (tab= cache->start_tab; tab != cache->join_tab;
276          tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
277     {
278       uint key_args;
279       bitmap_clear_all(&tab->table->tmp_set);
280       for (uint i= 0; i < ref->key_parts; i++)
281       {
282         Item *ref_item= ref->items[i];
283         if (!(tab->table->map & ref_item->used_tables()))
284 	  continue;
285 	 ref_item->walk(&Item::add_field_to_set_processor, 1, tab->table);
286       }
287       if ((key_args= bitmap_bits_set(&tab->table->tmp_set)))
288       {
289         if (cache == this)
290           local_key_arg_fields+= key_args;
291         else
292           external_key_arg_fields+= key_args;
293       }
294     }
295     cache= cache->prev_cache;
296   }
297   while (cache);
298 
299   return;
300 }
301 
302 
303 /*
304   Allocate memory for descriptors and pointers to them associated with the cache
305 
306   SYNOPSIS
307     alloc_fields()
308 
309   DESCRIPTION
310     The function allocates memory for the array of fields descriptors
311     and the array of pointers to the field descriptors used to copy
312     join record data from record buffers into the join buffer and
313     backward. Some pointers refer to the field descriptor associated
314     with previous caches. They are placed at the beginning of the array
315     of pointers and its total number is stored in external_key_arg_fields.
316     The pointer of the first array is assigned to field_descr and the number
317     of the elements in it is precalculated by the function calc_record_fields.
318     The allocated arrays are adjacent.
319 
320   NOTES
321     The memory is allocated in join->thd->mem_root
322 
323   RETURN VALUE
324     pointer to the first array
325 */
326 
327 int JOIN_CACHE::alloc_fields()
328 {
329   uint ptr_cnt= external_key_arg_fields+blobs+1;
330   uint fields_size= sizeof(CACHE_FIELD)*fields;
331   field_descr= (CACHE_FIELD*) join->thd->alloc(fields_size +
332                                                sizeof(CACHE_FIELD*)*ptr_cnt);
333   blob_ptr= (CACHE_FIELD **) ((uchar *) field_descr + fields_size);
334   return (field_descr == NULL);
335 }
336 
337 
338 /*
339   Create descriptors of the record flag fields stored in the join buffer
340 
341   SYNOPSIS
342     create_flag_fields()
343 
344   DESCRIPTION
345     The function creates descriptors of the record flag fields stored
346     in the join buffer. These are descriptors for:
347     - an optional match flag field,
348     - table null bitmap fields,
349     - table null row fields.
350     The match flag field is created when 'join_tab' is the first inner
351     table of an outer join our a semi-join. A null bitmap field is
352     created for any table whose fields are to be stored in the join
353     buffer if at least one of these fields is nullable or is a BIT field
354     whose bits are partially stored with null bits. A null row flag
355     is created for any table assigned to the cache if it is an inner
356     table of an outer join.
357     The descriptor for flag fields are placed one after another at the
358     beginning of the array of field descriptors 'field_descr' that
359     contains 'fields' elements. If there is a match flag field the
360     descriptor for it is always first in the sequence of flag fields.
361     The descriptors for other flag fields can follow in an arbitrary
362     order.
363     The flag field values follow in a record stored in the join buffer
364     in the same order as field descriptors, with the match flag always
365     following first.
366     The function sets the value of 'flag_fields' to the total number
367     of the descriptors created for the flag fields.
368     The function sets the value of 'length' to the total length of the
369     flag fields.
370 
371   RETURN VALUE
372     none
373 */
374 
375 void JOIN_CACHE::create_flag_fields()
376 {
377   CACHE_FIELD *copy;
378   JOIN_TAB *tab;
379 
380   copy= field_descr;
381 
382   length=0;
383 
384   /* If there is a match flag the first field is always used for this flag */
385   if (with_match_flag)
386     length+= add_flag_field_to_join_cache((uchar*) &join_tab->found,
387                                           sizeof(join_tab->found),
388 	                                  &copy);
389 
390   /* Create fields for all null bitmaps and null row flags that are needed */
391   for (tab= start_tab; tab != join_tab;
392        tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
393   {
394     TABLE *table= tab->table;
395 
396     /* Create a field for the null bitmap from table if needed */
397     if (tab->used_null_fields || tab->used_uneven_bit_fields)
398       length+= add_flag_field_to_join_cache(table->null_flags,
399                                             table->s->null_bytes,
400                                             &copy);
401 
402     /* Create table for the null row flag if needed */
403     if (table->maybe_null)
404       length+= add_flag_field_to_join_cache((uchar*) &table->null_row,
405                                             sizeof(table->null_row),
406                                             &copy);
407   }
408 
409   /* Theoretically the new value of flag_fields can be less than the old one */
410   flag_fields= (uint)(copy-field_descr);
411 }
412 
413 
414 /*
415   Create descriptors of the fields used to build access keys to the joined table
416 
417   SYNOPSIS
418     create_key_arg_fields()
419 
420   DESCRIPTION
421     The function creates descriptors of the record fields stored in the join
422     buffer that are used to build access keys to the joined table. These
423     fields are put into the buffer ahead of other records fields stored in
424     the buffer. Such placement helps to optimize construction of access keys.
425     For each field that is used to build access keys to the joined table but
426     is stored in some other join cache buffer the function saves a pointer
427     to the the field descriptor. The array of such pointers are placed in the
428     the join cache structure just before the array of pointers to the
429     blob fields blob_ptr.
430     Any field stored in a join cache buffer that is used to construct keys
431     to access tables associated with other join caches is called a referenced
432     field. It receives a unique number that is saved by the function in the
433     member 'referenced_field_no' of the CACHE_FIELD descriptor for the field.
434     This number is used as index to the array of offsets to the referenced
435     fields that are saved and put in the join cache buffer after all record
436     fields.
437     The function also finds out whether that the keys to access join_tab
438     can be considered as embedded and, if so, sets the flag 'use_emb_key' in
439     this join cache appropriately.
440 
441   NOTES.
442     When a key to access the joined table 'join_tab' is constructed the array
443     of pointers to the field descriptors for the external fields is looked
444     through. For each of this pointers we find out in what previous key cache
445     the referenced field is stored. The value of 'referenced_field_no'
446     provides us with the index into the array of offsets for referenced
447     fields stored in the join cache. The offset read by the the index allows
448     us to read the field without reading all other fields of the record
449     stored the join cache buffer. This optimizes the construction of keys
450     to access 'join_tab' when some key arguments are stored in the previous
451     join caches.
452 
453   NOTES
454     The function does not do anything if no key is used to join the records
455     from join_tab.
456 
457   RETURN VALUE
458     none
459 */
460 void JOIN_CACHE::create_key_arg_fields()
461 {
462   JOIN_TAB *tab;
463   JOIN_CACHE *cache;
464 
465   if (!is_key_access())
466     return;
467 
468   /*
469     Save pointers to the cache fields in previous caches
470     that  are used to build keys for this key access.
471   */
472   cache= this;
473   uint ext_key_arg_cnt= external_key_arg_fields;
474   CACHE_FIELD *copy;
475   CACHE_FIELD **copy_ptr= blob_ptr;
476   while (ext_key_arg_cnt)
477   {
478     cache= cache->prev_cache;
479     for (tab= cache->start_tab; tab != cache->join_tab;
480          tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
481     {
482       CACHE_FIELD *copy_end;
483       MY_BITMAP *key_read_set= &tab->table->tmp_set;
484       /* key_read_set contains the bitmap of tab's fields referenced by ref */
485       if (bitmap_is_clear_all(key_read_set))
486         continue;
487       copy_end= cache->field_descr+cache->fields;
488       for (copy= cache->field_descr+cache->flag_fields; copy < copy_end; copy++)
489       {
490         /*
491           (1) - when we store rowids for DuplicateWeedout, they have
492                 copy->field==NULL
493         */
494         if (copy->field &&  // (1)
495             copy->field->table == tab->table &&
496             bitmap_is_set(key_read_set, copy->field->field_index))
497         {
498           *copy_ptr++= copy;
499           ext_key_arg_cnt--;
500           if (!copy->referenced_field_no)
501           {
502             /*
503               Register the referenced field 'copy':
504               - set the offset number in copy->referenced_field_no,
505               - adjust the value of the flag 'with_length',
506               - adjust the values of 'pack_length' and
507                 of 'pack_length_with_blob_ptrs'.
508 	    */
509             copy->referenced_field_no= ++cache->referenced_fields;
510             if (!cache->with_length)
511             {
512               cache->with_length= TRUE;
513               uint sz= cache->get_size_of_rec_length();
514               cache->base_prefix_length+= sz;
515               cache->pack_length+= sz;
516               cache->pack_length_with_blob_ptrs+= sz;
517             }
518 	    cache->pack_length+= cache->get_size_of_fld_offset();
519             cache->pack_length_with_blob_ptrs+= cache->get_size_of_fld_offset();
520           }
521         }
522       }
523     }
524   }
525   /* After this 'blob_ptr' shall not be be changed */
526   blob_ptr= copy_ptr;
527 
528   /* Now create local fields that are used to build ref for this key access */
529   copy= field_descr+flag_fields;
530   for (tab= start_tab; tab != join_tab;
531        tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
532   {
533     length+= add_table_data_fields_to_join_cache(tab, &tab->table->tmp_set,
534                                                  &data_field_count, &copy,
535                                                  &data_field_ptr_count,
536                                                  &copy_ptr);
537   }
538 
539   use_emb_key= check_emb_key_usage();
540 
541   return;
542 }
543 
544 
545 /*
546   Create descriptors of all remaining data fields stored in the join buffer
547 
548   SYNOPSIS
549     create_remaining_fields()
550 
551   DESCRIPTION
552     The function creates descriptors for all remaining data fields of a
553     record from the join buffer. If the value returned by is_key_access() is
554     false the function creates fields for all read record fields that
555     comprise the partial join record joined with join_tab. Otherwise,
556     for each table tab, the set of the read fields for which the descriptors
557     have to be added is determined as the difference between all read fields
558     and and those for which the descriptors have been already created.
559     The latter are supposed to be marked in the bitmap tab->table->tmp_set.
560     The function increases the value of 'length' to the the total length of
561     the added fields.
562 
563   NOTES
564     If is_key_access() returns true the function modifies the value of
565     tab->table->tmp_set for a each table whose fields are stored in the cache.
566     The function calls the method Field::fill_cache_field to figure out
567     the type of the cache field and the maximal length of its representation
568     in the join buffer. If this is a blob field then additionally a pointer
569     to this field is added as an element of the array blob_ptr. For a blob
570     field only the size of the length of the blob data is taken into account.
571     It is assumed that 'data_field_count' contains the number of descriptors
572     for data fields that have been already created and 'data_field_ptr_count'
573     contains the number of the pointers to such descriptors having been
574     stored up to the moment.
575 
576   RETURN VALUE
577     none
578 */
579 
580 void JOIN_CACHE::create_remaining_fields()
581 {
582   JOIN_TAB *tab;
583   bool all_read_fields= !is_key_access();
584   CACHE_FIELD *copy= field_descr+flag_fields+data_field_count;
585   CACHE_FIELD **copy_ptr= blob_ptr+data_field_ptr_count;
586 
587   for (tab= start_tab; tab != join_tab;
588        tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
589   {
590     MY_BITMAP *rem_field_set;
591     TABLE *table= tab->table;
592 
593     if (all_read_fields)
594       rem_field_set= table->read_set;
595     else
596     {
597       bitmap_invert(&table->tmp_set);
598       bitmap_intersect(&table->tmp_set, table->read_set);
599       rem_field_set= &table->tmp_set;
600     }
601 
602     length+= add_table_data_fields_to_join_cache(tab, rem_field_set,
603                                                  &data_field_count, &copy,
604                                                  &data_field_ptr_count,
605                                                  &copy_ptr);
606 
607     /* SemiJoinDuplicateElimination: allocate space for rowid if needed */
608     if (tab->keep_current_rowid)
609     {
610       copy->str= table->file->ref;
611       if (copy->str)
612         copy->length= table->file->ref_length;
613       else
614       {
615         /* This may happen only for materialized derived tables and views */
616         copy->length= 0;
617         copy->str= (uchar *) table;
618       }
619       copy->type= CACHE_ROWID;
620       copy->field= 0;
621       copy->referenced_field_no= 0;
622       /*
623         Note: this may seem odd, but at this point we have
624         table->file->ref==NULL while table->file->ref_length is already set
625         to correct value.
626       */
627       length += table->file->ref_length;
628       data_field_count++;
629       copy++;
630     }
631   }
632 }
633 
634 
635 
636 /*
637   Calculate and set all cache constants
638 
639   SYNOPSIS
640     set_constants()
641 
642   DESCRIPTION
643     The function calculates and set all precomputed constants that are used
644     when writing records into the join buffer and reading them from it.
645     It calculates the size of offsets of a record within the join buffer
646     and of a field within a record. It also calculates the number of bytes
647     used to store record lengths.
648     The function also calculates the maximal length of the representation
649     of record in the cache excluding blob_data. This value is used when
650     making a dicision whether more records should be added into the join
651     buffer or not.
652 
653   RETURN VALUE
654     none
655 */
656 
657 void JOIN_CACHE::set_constants()
658 {
659   /*
660     Any record from a BKA cache is prepended with the record length.
661     We use the record length when reading the buffer and building key values
662     for each record. The length allows us not to read the fields that are
663     not needed for keys.
664     If a record has match flag it also may be skipped when the match flag
665     is on. It happens if the cache is used for a semi-join operation or
666     for outer join when the 'not exist' optimization can be applied.
667     If some of the fields are referenced from other caches then
668     the record length allows us to easily reach the saved offsets for
669     these fields since the offsets are stored at the very end of the record.
670     However at this moment we don't know whether we have referenced fields for
671     the cache or not. Later when a referenced field is registered for the cache
672     we adjust the value of the flag 'with_length'.
673   */
674   with_length= is_key_access() ||
675                join_tab->is_inner_table_of_semi_join_with_first_match() ||
676                join_tab->is_inner_table_of_outer_join();
677   /*
678      At this moment we don't know yet the value of 'referenced_fields',
679      but in any case it can't be greater than the value of 'fields'.
680   */
681   uint len= length + fields*sizeof(uint)+blobs*sizeof(uchar *) +
682             (prev_cache ? prev_cache->get_size_of_rec_offset() : 0) +
683             sizeof(ulong);
684   /*
685     The values of  size_of_rec_ofs, size_of_rec_len, size_of_fld_ofs,
686      base_prefix_length, pack_length,  pack_length_with_blob_ptrs
687      will be recalculated later in this function when we get the estimate
688      for the actual value of the join buffer size.
689   */
690   size_of_rec_ofs=  size_of_rec_len= size_of_fld_ofs= 4;
691   base_prefix_length= (with_length ? size_of_rec_len : 0) +
692                       (prev_cache ? prev_cache->get_size_of_rec_offset() : 0);
693   pack_length= (with_length ? size_of_rec_len : 0) +
694                (prev_cache ? prev_cache->get_size_of_rec_offset() : 0) +
695                length + fields*sizeof(uint);
696   pack_length_with_blob_ptrs= pack_length + blobs*sizeof(uchar *);
697   min_buff_size= 0;
698   min_records= 1;
699   buff_size= (size_t)MY_MAX(join->thd->variables.join_buff_size,
700                  get_min_join_buffer_size());
701   size_of_rec_ofs= offset_size(buff_size);
702   size_of_rec_len= blobs ? size_of_rec_ofs : offset_size(len);
703   size_of_fld_ofs= size_of_rec_len;
704   base_prefix_length= (with_length ? size_of_rec_len : 0) +
705                       (prev_cache ? prev_cache->get_size_of_rec_offset() : 0);
706   /*
707     The size of the offsets for referenced fields will be added later.
708     The values of 'pack_length' and 'pack_length_with_blob_ptrs' are adjusted
709     every time when the first reference to the referenced field is registered.
710   */
711   pack_length= (with_length ? size_of_rec_len : 0) +
712                (prev_cache ? prev_cache->get_size_of_rec_offset() : 0) +
713                length;
714   pack_length_with_blob_ptrs= pack_length + blobs*sizeof(uchar *);
715 }
716 
717 
718 /*
719   Get maximum total length of all affixes of a record in the join cache buffer
720 
721   SYNOPSIS
722     get_record_max_affix_length()
723 
724   DESCRIPTION
725     The function calculates the maximum possible total length of all affixes
726     of a record in the join cache buffer, that is made of:
727       - the length of all prefixes used in this cache,
728       - the length of the match flag if it's needed
729       - the total length of the maximum possible offsets to the fields of
730         a record in the buffer.
731 
732   RETURN VALUE
733     The maximum total length of all affixes of a record in the join buffer
734 */
735 
736 uint JOIN_CACHE::get_record_max_affix_length()
737 {
738   uint len= get_prefix_length() +
739             MY_TEST(with_match_flag) +
740             size_of_fld_ofs * data_field_count;
741   return len;
742 }
743 
744 
745 /*
746   Get the minimum possible size of the cache join buffer
747 
748   SYNOPSIS
749     get_min_join_buffer_size()
750 
751   DESCRIPTION
752     At the first its invocation for the cache the function calculates the
753     minimum possible size of the join buffer of the cache. This value depends
754     on the minimal number of records 'min_records' to be stored in the join
755     buffer. The number is supposed to be determined by the procedure that
756     chooses the best access path to the joined table join_tab in the execution
757     plan. After the calculation of the interesting size the function saves it
758     in the field 'min_buff_size' in order to use it directly at the next
759     invocations of the function.
760 
761   NOTES
762     Currently the number of minimal records is just set to 1.
763 
764   RETURN VALUE
765     The minimal possible size of the join buffer of this cache
766 */
767 
768 size_t JOIN_CACHE::get_min_join_buffer_size()
769 {
770   if (!min_buff_size)
771   {
772     size_t len= 0;
773     size_t len_last= 0;
774     for (JOIN_TAB *tab= start_tab; tab != join_tab;
775          tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
776     {
777       len+= tab->get_max_used_fieldlength();
778       len_last+= tab->get_used_fieldlength();
779     }
780     size_t len_addon= get_record_max_affix_length() +
781                       get_max_key_addon_space_per_record();
782     len+= len_addon;
783     len_last+= len_addon;
784     size_t min_sz= len*(min_records-1) + len_last;
785     min_sz+= pack_length_with_blob_ptrs;
786     size_t add_sz= 0;
787     for (uint i=0; i < min_records; i++)
788       add_sz+= join_tab_scan->aux_buffer_incr(i+1);
789     avg_aux_buffer_incr= add_sz/min_records;
790     min_sz+= add_sz;
791     set_if_bigger(min_sz, 1);
792     min_buff_size= min_sz;
793   }
794   return min_buff_size;
795 }
796 
797 
798 /*
799   Get the maximum possible size of the cache join buffer
800 
801   SYNOPSIS
802     get_max_join_buffer_size()
803 
804     optimize_buff_size  FALSE <-> do not take more memory than needed for
805                         the estimated number of records in the partial join
806 
807   DESCRIPTION
808     At the first its invocation for the cache the function calculates the
809     maximum possible size of join buffer for the cache. If the parameter
810     optimize_buff_size true then this value does not exceed the size of the
811     space needed for the estimated number of records 'max_records' in the
812     partial join that joins tables from the first one through join_tab. This
813     value is also capped off by the value of join_tab->join_buffer_size_limit,
814     if it has been set a to non-zero value, and by the value of the system
815     parameter join_buffer_size - otherwise. After the calculation of the
816     interesting size the function saves the value in the field 'max_buff_size'
817     in order to use it directly at the next  invocations of the function.
818 
819   NOTES
820     Currently the value of join_tab->join_buffer_size_limit is initialized
821     to 0 and is never reset.
822 
823   RETURN VALUE
824     The maximum possible size of the join buffer of this cache
825 */
826 
827 size_t JOIN_CACHE::get_max_join_buffer_size(bool optimize_buff_size)
828 {
829   if (!max_buff_size)
830   {
831     size_t max_sz;
832     size_t min_sz= get_min_join_buffer_size();
833     size_t len= 0;
834     for (JOIN_TAB *tab= start_tab; tab != join_tab;
835          tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
836     {
837       len+= tab->get_used_fieldlength();
838     }
839     len+= get_record_max_affix_length();
840     avg_record_length= len;
841     len+= get_max_key_addon_space_per_record() + avg_aux_buffer_incr;
842     space_per_record= len;
843 
844     size_t limit_sz= (size_t)join->thd->variables.join_buff_size;
845     if (join_tab->join_buffer_size_limit)
846       set_if_smaller(limit_sz, join_tab->join_buffer_size_limit);
847     if (!optimize_buff_size)
848       max_sz= limit_sz;
849     else
850     {
851       if (limit_sz / max_records > space_per_record)
852         max_sz= space_per_record * max_records;
853       else
854         max_sz= limit_sz;
855       max_sz+= pack_length_with_blob_ptrs;
856       set_if_smaller(max_sz, limit_sz);
857     }
858     set_if_bigger(max_sz, min_sz);
859     max_buff_size= max_sz;
860   }
861   return max_buff_size;
862 }
863 
864 
865 /*
866   Allocate memory for a join buffer
867 
868   SYNOPSIS
869     alloc_buffer()
870 
871   DESCRIPTION
872     The function allocates a lump of memory for the cache join buffer.
873     Initially the function sets the size of the buffer buff_size equal to
874     the value returned by get_max_join_buffer_size(). If the total size of
875     the space intended to be used for the join buffers employed by the
876     tables from the first one through join_tab exceeds the value of the
877     system parameter join_buff_space_limit, then the function first tries
878     to shrink the used buffers to make the occupied space fit the maximum
879     memory allowed to be used for all join buffers in total. After
880     this the function tries to allocate a join buffer for join_tab.
881     If it fails to do so, it decrements the requested size of the join
882     buffer, shrinks proportionally the join buffers used for the previous
883     tables and tries to allocate a buffer for join_tab. In the case of a
884     failure the function repeats its attempts with smaller and smaller
885     requested sizes of the buffer, but not more than 4 times.
886 
887   RETURN VALUE
888     0   if the memory has been successfully allocated
889     1   otherwise
890 */
891 
892 int JOIN_CACHE::alloc_buffer()
893 {
894   JOIN_TAB *tab;
895   JOIN_CACHE *cache;
896   ulonglong curr_buff_space_sz= 0;
897   ulonglong curr_min_buff_space_sz= 0;
898   ulonglong join_buff_space_limit=
899     join->thd->variables.join_buff_space_limit;
900   bool optimize_buff_size=
901          optimizer_flag(join->thd, OPTIMIZER_SWITCH_OPTIMIZE_JOIN_BUFFER_SIZE);
902   double partial_join_cardinality=  (join_tab-1)->get_partial_join_cardinality();
903   buff= NULL;
904   min_buff_size= 0;
905   max_buff_size= 0;
906   min_records= 1;
907   max_records= (size_t) (partial_join_cardinality <= join_buff_space_limit ?
908                  (ulonglong) partial_join_cardinality : join_buff_space_limit);
909   set_if_bigger(max_records, 10);
910   min_buff_size= get_min_join_buffer_size();
911   buff_size= get_max_join_buffer_size(optimize_buff_size);
912 
913   for (tab= start_tab; tab!= join_tab;
914        tab= next_linear_tab(join, tab, WITHOUT_BUSH_ROOTS))
915   {
916     cache= tab->cache;
917     if (cache)
918     {
919       curr_min_buff_space_sz+= cache->get_min_join_buffer_size();
920       curr_buff_space_sz+= cache->get_join_buffer_size();
921     }
922   }
923   curr_min_buff_space_sz+= min_buff_size;
924   curr_buff_space_sz+= buff_size;
925 
926   if (curr_min_buff_space_sz > join_buff_space_limit ||
927       (curr_buff_space_sz > join_buff_space_limit &&
928        (!optimize_buff_size ||
929         join->shrink_join_buffers(join_tab, curr_buff_space_sz,
930                                   join_buff_space_limit))))
931     goto fail;
932 
933   if (for_explain_only)
934     return 0;
935 
936   for (size_t buff_size_decr= (buff_size-min_buff_size)/4 + 1; ; )
937   {
938     size_t next_buff_size;
939 
940     if ((buff= (uchar*) my_malloc(buff_size, MYF(MY_THREAD_SPECIFIC))))
941       break;
942 
943     next_buff_size= buff_size > buff_size_decr ? buff_size-buff_size_decr : 0;
944     if (next_buff_size < min_buff_size ||
945         join->shrink_join_buffers(join_tab, curr_buff_space_sz,
946                                   curr_buff_space_sz-buff_size_decr))
947       goto fail;
948     buff_size= next_buff_size;
949 
950     curr_buff_space_sz= 0;
951     for (tab= join->join_tab+join->const_tables; tab <= join_tab; tab++)
952     {
953       cache= tab->cache;
954       if (cache)
955         curr_buff_space_sz+= cache->get_join_buffer_size();
956     }
957   }
958   return 0;
959 
960 fail:
961   buff_size= 0;
962   return 1;
963 }
964 
965 
966 /*
967   Shrink the size if the cache join buffer in a given ratio
968 
969   SYNOPSIS
970     shrink_join_buffer_in_ratio()
971       n           nominator of the ratio to shrink the buffer in
972       d           denominator if the ratio
973 
974   DESCRIPTION
975     The function first deallocates the join buffer of the cache. Then
976     it allocates a buffer that is (n/d) times smaller.
977 
978   RETURN VALUE
979     FALSE   on success with allocation of the smaller join buffer
980     TRUE    otherwise
981 */
982 
983 bool JOIN_CACHE::shrink_join_buffer_in_ratio(ulonglong n, ulonglong d)
984 {
985   size_t next_buff_size;
986   if (n < d)
987     return FALSE;
988   next_buff_size= (size_t) ((double) buff_size / n * d);
989   set_if_bigger(next_buff_size, min_buff_size);
990   buff_size= next_buff_size;
991   return realloc_buffer();
992 }
993 
994 
995 /*
996   Reallocate the join buffer of a join cache
997 
998   SYNOPSIS
999     realloc_buffer()
1000 
1001   DESCRITION
1002     The function reallocates the join buffer of the join cache. After this
1003     it resets the buffer for writing.
1004 
1005   NOTES
1006     The function assumes that buff_size contains the new value for the join
1007     buffer size.
1008 
1009   RETURN VALUE
1010     0   if the buffer has been successfully reallocated
1011     1   otherwise
1012 */
1013 
1014 int JOIN_CACHE::realloc_buffer()
1015 {
1016   int rc;
1017   free();
1018   rc= MY_TEST(!(buff= (uchar*) my_malloc(buff_size, MYF(MY_THREAD_SPECIFIC))));
1019   reset(TRUE);
1020   return rc;
1021 }
1022 
1023 
1024 /*
1025   Initialize a join cache
1026 
1027   SYNOPSIS
1028     init()
1029       for_explain       join buffer is initialized for explain only
1030 
1031   DESCRIPTION
1032     The function initializes the join cache structure. It supposed to be called
1033     by init methods for classes derived from the JOIN_CACHE.
1034     The function allocates memory for the join buffer and for descriptors of
1035     the record fields stored in the buffer.
1036 
1037   NOTES
1038     The code of this function should have been included into the constructor
1039     code itself. However the new operator for the class JOIN_CACHE would
1040     never fail while memory allocation for the join buffer is not absolutely
1041     unlikely to fail. That's why this memory allocation has to be placed in a
1042     separate function that is called in a couple with a cache constructor.
1043     It is quite natural to put almost all other constructor actions into
1044     this function.
1045 
1046   RETURN VALUE
1047     0   initialization with buffer allocations has been succeeded
1048     1   otherwise
1049 */
1050 
1051 int JOIN_CACHE::init(bool for_explain)
1052 {
1053   DBUG_ENTER("JOIN_CACHE::init");
1054 
1055   for_explain_only= for_explain;
1056 
1057   calc_record_fields();
1058 
1059   collect_info_on_key_args();
1060 
1061   if (alloc_fields())
1062     DBUG_RETURN(1);
1063 
1064   create_flag_fields();
1065 
1066   create_key_arg_fields();
1067 
1068   create_remaining_fields();
1069 
1070   set_constants();
1071 
1072   if (alloc_buffer())
1073     DBUG_RETURN(1);
1074 
1075   reset(TRUE);
1076 
1077   DBUG_RETURN(0);
1078 }
1079 
1080 
1081 /*
1082   Check the possibility to read the access keys directly from the join buffer
1083   SYNOPSIS
1084     check_emb_key_usage()
1085 
1086   DESCRIPTION
1087     The function checks some conditions at which the key values can be read
1088     directly from the join buffer. This is possible when the key values can be
1089     composed by concatenation of the record fields stored in the join buffer.
1090     Sometimes when the access key is multi-component the function has to re-order
1091     the fields written into the join buffer to make keys embedded. If key
1092     values for the key access are detected as embedded then 'use_emb_key'
1093     is set to TRUE.
1094 
1095   EXAMPLE
1096     Let table t2 has an index defined on the columns a,b . Let's assume also
1097     that the columns t2.a, t2.b as well as the columns t1.a, t1.b are all
1098     of the integer type. Then if the query
1099       SELECT COUNT(*) FROM t1, t2 WHERE t1.a=t2.a and t1.b=t2.b
1100     is executed with a join cache in such a way that t1 is the driving
1101     table then the key values to access table t2 can be read directly
1102     from the join buffer.
1103 
1104   NOTES
1105     In some cases key values could be read directly from the join buffer but
1106     we still do not consider them embedded. In the future we'll expand the
1107     the class of keys which we identify as embedded.
1108 
1109   NOTES
1110     The function returns FALSE if no key is used to join the records
1111     from join_tab.
1112 
1113   RETURN VALUE
1114     TRUE    key values will be considered as embedded,
1115     FALSE   otherwise.
1116 */
1117 
1118 bool JOIN_CACHE::check_emb_key_usage()
1119 {
1120 
1121   if (!is_key_access())
1122     return FALSE;
1123 
1124   uint i;
1125   Item *item;
1126   KEY_PART_INFO *key_part;
1127   CACHE_FIELD *copy;
1128   CACHE_FIELD *copy_end;
1129   uint len= 0;
1130   TABLE_REF *ref= &join_tab->ref;
1131   KEY *keyinfo= join_tab->get_keyinfo_by_key_no(ref->key);
1132 
1133   /*
1134     If some of the key arguments are not from the local cache the key
1135     is not considered as embedded.
1136     TODO:
1137     Expand it to the case when ref->key_parts=1 and local_key_arg_fields=0.
1138   */
1139   if (external_key_arg_fields != 0)
1140     return FALSE;
1141   /*
1142     If the number of the local key arguments is not equal to the number
1143     of key parts the key value cannot be read directly from the join buffer.
1144   */
1145   if (local_key_arg_fields != ref->key_parts)
1146     return FALSE;
1147 
1148   /*
1149     A key is not considered embedded if one of the following is true:
1150     - one of its key parts is not equal to a field
1151     - it is a partial key
1152     - definition of the argument field does not coincide with the
1153       definition of the corresponding key component
1154     - some of the key components are nullable
1155   */
1156   for (i=0; i < ref->key_parts; i++)
1157   {
1158     item= ref->items[i]->real_item();
1159     if (item->type() != Item::FIELD_ITEM)
1160       return FALSE;
1161     key_part= keyinfo->key_part+i;
1162     if (key_part->key_part_flag & HA_PART_KEY_SEG)
1163       return FALSE;
1164     if (!key_part->field->eq_def(((Item_field *) item)->field))
1165       return FALSE;
1166     if (key_part->field->maybe_null())
1167       return FALSE;
1168   }
1169 
1170   copy= field_descr+flag_fields;
1171   copy_end= copy+local_key_arg_fields;
1172   for ( ; copy < copy_end; copy++)
1173   {
1174     /*
1175       If some of the key arguments are of variable length the key
1176       is not considered as embedded.
1177     */
1178     if (copy->type != 0)
1179       return FALSE;
1180     /*
1181       If some of the key arguments are bit fields whose bits are partially
1182       stored with null bits the key is not considered as embedded.
1183     */
1184     if (copy->field->type() == MYSQL_TYPE_BIT &&
1185 	 ((Field_bit*) (copy->field))->bit_len)
1186       return FALSE;
1187     len+= copy->length;
1188   }
1189 
1190   emb_key_length= len;
1191 
1192   /*
1193     Make sure that key fields follow the order of the corresponding
1194     key components these fields are equal to. For this the descriptors
1195     of the fields that comprise the key might be re-ordered.
1196   */
1197   for (i= 0; i < ref->key_parts; i++)
1198   {
1199     uint j;
1200     Item *item= ref->items[i]->real_item();
1201     Field *fld= ((Item_field *) item)->field;
1202     CACHE_FIELD *init_copy= field_descr+flag_fields+i;
1203     for (j= i, copy= init_copy; j < local_key_arg_fields;  j++, copy++)
1204     {
1205       if (fld->eq(copy->field))
1206       {
1207         if (j != i)
1208         {
1209           CACHE_FIELD key_part_copy= *copy;
1210           *copy= *init_copy;
1211           *init_copy= key_part_copy;
1212         }
1213         break;
1214       }
1215     }
1216   }
1217 
1218   return TRUE;
1219 }
1220 
1221 
1222 /*
1223   Write record fields and their required offsets into the join cache buffer
1224 
1225   SYNOPSIS
1226     write_record_data()
1227       link        a reference to the associated info in the previous cache
1228       is_full OUT true if it has been decided that no more records will be
1229                   added to the join buffer
1230 
1231   DESCRIPTION
1232     This function put into the cache buffer the following info that it reads
1233     from the join record buffers or computes somehow:
1234     (1) the length of all fields written for the record (optional)
1235     (2) an offset to the associated info in the previous cache (if there is any)
1236         determined by the link parameter
1237     (3) all flag fields of the tables whose data field are put into the cache:
1238         - match flag (optional),
1239         - null bitmaps for all tables,
1240         - null row flags for all tables
1241     (4) values of all data fields including
1242         - full images of those fixed legth data fields that cannot have
1243           trailing spaces
1244         - significant part of fixed length fields that can have trailing spaces
1245           with the prepanded length
1246         - data of non-blob variable length fields with the prepanded data length
1247         - blob data from blob fields with the prepanded data length
1248     (5) record offset values for the data fields that are referred to from
1249         other caches
1250 
1251     The record is written at the current position stored in the field 'pos'.
1252     At the end of the function 'pos' points at the position right after the
1253     written record data.
1254     The function increments the number of records in the cache that is stored
1255     in the 'records' field by 1. The function also modifies the values of
1256     'curr_rec_pos' and 'last_rec_pos' to point to the written record.
1257     The 'end_pos' cursor is modified accordingly.
1258     The 'last_rec_blob_data_is_in_rec_buff' is set on if the blob data
1259     remains in the record buffers and not copied to the join buffer. It may
1260     happen only to the blob data from the last record added into the cache.
1261     If on_precond is attached to join_tab and it is not evaluated to TRUE
1262     then MATCH_IMPOSSIBLE is placed in the match flag field of the record
1263     written into the join buffer.
1264 
1265   RETURN VALUE
1266     length of the written record data
1267 */
1268 
1269 uint JOIN_CACHE::write_record_data(uchar * link, bool *is_full)
1270 {
1271   uint len;
1272   bool last_record;
1273   CACHE_FIELD *copy;
1274   CACHE_FIELD *copy_end;
1275   uchar *flags_pos;
1276   uchar *cp= pos;
1277   uchar *init_pos= cp;
1278   uchar *rec_len_ptr= 0;
1279   uint key_extra= extra_key_length();
1280 
1281   records++;  /* Increment the counter of records in the cache */
1282 
1283   len= pack_length + key_extra;
1284 
1285   /* Make an adjustment for the size of the auxiliary buffer if there is any */
1286   uint incr= aux_buffer_incr(records);
1287   size_t rem= rem_space();
1288   aux_buff_size+= len+incr < rem ? incr : rem;
1289 
1290   /*
1291     For each blob to be put into cache save its length and a pointer
1292     to the value in the corresponding element of the blob_ptr array.
1293     Blobs with null values are skipped.
1294     Increment 'len' by the total length of all these blobs.
1295   */
1296   if (blobs)
1297   {
1298     CACHE_FIELD **copy_ptr= blob_ptr;
1299     CACHE_FIELD **copy_ptr_end= copy_ptr+blobs;
1300     for ( ; copy_ptr < copy_ptr_end; copy_ptr++)
1301     {
1302       Field_blob *blob_field= (Field_blob *) (*copy_ptr)->field;
1303       if (!blob_field->is_null())
1304       {
1305         uint blob_len= blob_field->get_length();
1306         (*copy_ptr)->blob_length= blob_len;
1307         len+= blob_len;
1308         (*copy_ptr)->str= blob_field->get_ptr();
1309       }
1310     }
1311   }
1312 
1313   /*
1314     Check whether we won't be able to add any new record into the cache after
1315     this one because the cache will be full. Set last_record to TRUE if it's so.
1316     The assume that the cache will be full after the record has been written
1317     into it if either the remaining space of the cache is not big enough for the
1318     record's blob values or if there is a chance that not all non-blob fields
1319     of the next record can be placed there.
1320     This function is called only in the case when there is enough space left in
1321     the cache to store at least non-blob parts of the current record.
1322   */
1323   last_record= (len+pack_length_with_blob_ptrs+key_extra) > rem_space();
1324 
1325   /*
1326     Save the position for the length of the record in the cache if it's needed.
1327     The length of the record will be inserted here when all fields of the record
1328     are put into the cache.
1329   */
1330   if (with_length)
1331   {
1332     rec_len_ptr= cp;
1333     DBUG_ASSERT(cp + size_of_rec_len <= buff + buff_size);
1334     cp+= size_of_rec_len;
1335   }
1336 
1337   /*
1338     Put a reference to the fields of the record that are stored in the previous
1339     cache if there is any. This reference is passed by the 'link' parameter.
1340   */
1341   if (prev_cache)
1342   {
1343     DBUG_ASSERT(cp + prev_cache->get_size_of_rec_offset() <= buff + buff_size);
1344     cp+= prev_cache->get_size_of_rec_offset();
1345     prev_cache->store_rec_ref(cp, link);
1346   }
1347 
1348   curr_rec_pos= cp;
1349 
1350   /* If the there is a match flag set its value to 0 */
1351   copy= field_descr;
1352   if (with_match_flag)
1353     *copy[0].str= 0;
1354 
1355   /* First put into the cache the values of all flag fields */
1356   copy_end= field_descr+flag_fields;
1357   flags_pos= cp;
1358   for ( ; copy < copy_end; copy++)
1359   {
1360     DBUG_ASSERT(cp + copy->length <= buff + buff_size);
1361     memcpy(cp, copy->str, copy->length);
1362     cp+= copy->length;
1363   }
1364 
1365   /* Now put the values of the remaining fields as soon as they are not nulls */
1366   copy_end= field_descr+fields;
1367   for ( ; copy < copy_end; copy++)
1368   {
1369     Field *field= copy->field;
1370     if (field && field->maybe_null() && field->is_null())
1371     {
1372       if (copy->referenced_field_no)
1373         copy->offset= 0;
1374       continue;
1375     }
1376     /* Save the offset of the field to put it later at the end of the record */
1377     if (copy->referenced_field_no)
1378       copy->offset= (uint)(cp-curr_rec_pos);
1379 
1380     switch (copy->type) {
1381     case CACHE_BLOB:
1382     {
1383       Field_blob *blob_field= (Field_blob *) copy->field;
1384       if (last_record)
1385       {
1386         last_rec_blob_data_is_in_rec_buff= 1;
1387         /* Put down the length of the blob and the pointer to the data */
1388         DBUG_ASSERT(cp + copy->length + sizeof(char*) <= buff + buff_size);
1389 	blob_field->get_image(cp, copy->length+sizeof(char*),
1390                               blob_field->charset());
1391 	cp+= copy->length+sizeof(char*);
1392       }
1393       else
1394       {
1395         /* First put down the length of the blob and then copy the data */
1396 	blob_field->get_image(cp, copy->length,
1397 			      blob_field->charset());
1398         DBUG_ASSERT(cp + copy->length + copy->blob_length <= buff + buff_size);
1399         if (copy->blob_length)
1400           memcpy(cp+copy->length, copy->str, copy->blob_length);
1401 	cp+= copy->length+copy->blob_length;
1402       }
1403       break;
1404     }
1405     case CACHE_VARSTR1:
1406       /* Copy the significant part of the short varstring field */
1407       len= (uint) copy->str[0] + 1;
1408       DBUG_ASSERT(cp + len <= buff + buff_size);
1409       memcpy(cp, copy->str, len);
1410       cp+= len;
1411       break;
1412     case CACHE_VARSTR2:
1413       /* Copy the significant part of the long varstring field */
1414       len= uint2korr(copy->str) + 2;
1415       DBUG_ASSERT(cp + len <= buff + buff_size);
1416       memcpy(cp, copy->str, len);
1417       cp+= len;
1418       break;
1419     case CACHE_STRIPPED:
1420     {
1421       /*
1422         Put down the field value stripping all trailing spaces off.
1423         After this insert the length of the written sequence of bytes.
1424       */
1425       uchar *str, *end;
1426       for (str= copy->str, end= str+copy->length;
1427            end > str && end[-1] == ' ';
1428            end--) ;
1429       len=(uint) (end-str);
1430       DBUG_ASSERT(cp + len + 2 <= buff + buff_size);
1431       int2store(cp, len);
1432       memcpy(cp+2, str, len);
1433       cp+= len+2;
1434       break;
1435     }
1436     case CACHE_ROWID:
1437       if (!copy->length)
1438       {
1439         /*
1440           This may happen only for ROWID fields of materialized
1441           derived tables and views.
1442         */
1443         TABLE *table= (TABLE *) copy->str;
1444         copy->str= table->file->ref;
1445         copy->length= table->file->ref_length;
1446         if (!copy->str)
1447         {
1448           /*
1449             If table is an empty inner table of an outer join and it is
1450             a materialized derived table then table->file->ref == NULL.
1451           */
1452           cp+= copy->length;
1453           break;
1454         }
1455       }
1456       /* fall through */
1457     default:
1458       /* Copy the entire image of the field from the record buffer */
1459       DBUG_ASSERT(cp + copy->length <= buff + buff_size);
1460       if (copy->str)
1461         memcpy(cp, copy->str, copy->length);
1462       cp+= copy->length;
1463     }
1464   }
1465 
1466   /* Add the offsets of the fields that are referenced from other caches */
1467   if (referenced_fields)
1468   {
1469     uint cnt= 0;
1470     for (copy= field_descr+flag_fields; copy < copy_end ; copy++)
1471     {
1472       if (copy->referenced_field_no)
1473       {
1474         store_fld_offset(cp+size_of_fld_ofs*(copy->referenced_field_no-1),
1475                          copy->offset);
1476         cnt++;
1477       }
1478     }
1479     DBUG_ASSERT(cp + size_of_fld_ofs*cnt <= buff + buff_size);
1480     cp+= size_of_fld_ofs*cnt;
1481   }
1482 
1483   if (rec_len_ptr)
1484     store_rec_length(rec_len_ptr, (ulong) (cp-rec_len_ptr-size_of_rec_len));
1485   last_rec_pos= curr_rec_pos;
1486   end_pos= pos= cp;
1487   *is_full= last_record;
1488 
1489   last_written_is_null_compl= 0;
1490   if (!join_tab->first_unmatched && join_tab->on_precond)
1491   {
1492     join_tab->found= 0;
1493     join_tab->not_null_compl= 1;
1494     if (!join_tab->on_precond->val_int())
1495     {
1496       flags_pos[0]= MATCH_IMPOSSIBLE;
1497       last_written_is_null_compl= 1;
1498     }
1499   }
1500 
1501   return (uint) (cp-init_pos);
1502 }
1503 
1504 
1505 /*
1506   Reset the join buffer for reading/writing: default implementation
1507 
1508   SYNOPSIS
1509     reset()
1510       for_writing  if it's TRUE the function reset the buffer for writing
1511 
1512   DESCRIPTION
1513     This default implementation of the virtual function reset() resets
1514     the join buffer for reading or writing.
1515     If the buffer is reset for reading only the 'pos' value is reset
1516     to point to the very beginning of the join buffer. If the buffer is
1517     reset for writing additionally:
1518     - the counter of the records in the buffer is set to 0,
1519     - the the value of 'last_rec_pos' gets pointing at the position just
1520       before the buffer,
1521     - 'end_pos' is set to point to the beginning of the join buffer,
1522     - the size of the auxiliary buffer is reset to 0,
1523     - the flag 'last_rec_blob_data_is_in_rec_buff' is set to 0.
1524 
1525   RETURN VALUE
1526     none
1527 */
1528 void JOIN_CACHE::reset(bool for_writing)
1529 {
1530   pos= buff;
1531   curr_rec_link= 0;
1532   if (for_writing)
1533   {
1534     records= 0;
1535     last_rec_pos= buff;
1536     aux_buff_size= 0;
1537     end_pos= pos;
1538     last_rec_blob_data_is_in_rec_buff= 0;
1539   }
1540 }
1541 
1542 
1543 /*
1544   Add a record into the join buffer: the default implementation
1545 
1546   SYNOPSIS
1547     put_record()
1548 
1549   DESCRIPTION
1550     This default implementation of the virtual function put_record writes
1551     the next matching record into the join buffer.
1552     It also links the record having been written into the join buffer with
1553     the matched record in the previous cache if there is any.
1554     The implementation assumes that the function get_curr_link()
1555     will return exactly the pointer to this matched record.
1556 
1557   RETURN VALUE
1558     TRUE    if it has been decided that it should be the last record
1559             in the join buffer,
1560     FALSE   otherwise
1561 */
1562 
1563 bool JOIN_CACHE::put_record()
1564 {
1565   bool is_full;
1566   uchar *link= 0;
1567   if (prev_cache)
1568     link= prev_cache->get_curr_rec_link();
1569   write_record_data(link, &is_full);
1570   return is_full;
1571 }
1572 
1573 
1574 /*
1575   Read the next record from the join buffer: the default implementation
1576 
1577   SYNOPSIS
1578     get_record()
1579 
1580   DESCRIPTION
1581     This default implementation of the virtual function get_record
1582     reads fields of the next record from the join buffer of this cache.
1583     The function also reads all other fields associated with this record
1584     from the the join buffers of the previous caches. The fields are read
1585     into the corresponding record buffers.
1586     It is supposed that 'pos' points to the position in the buffer
1587     right after the previous record when the function is called.
1588     When the function returns the 'pos' values is updated to point
1589     to the position after the read record.
1590     The value of 'curr_rec_pos' is also updated by the function to
1591     point to the beginning of the first field of the record in the
1592     join buffer.
1593 
1594   RETURN VALUE
1595     TRUE    there are no more records to read from the join buffer
1596     FALSE   otherwise
1597 */
1598 
1599 bool JOIN_CACHE::get_record()
1600 {
1601   bool res;
1602   uchar *prev_rec_ptr= 0;
1603   if (with_length)
1604     pos+= size_of_rec_len;
1605   if (prev_cache)
1606   {
1607     pos+= prev_cache->get_size_of_rec_offset();
1608     prev_rec_ptr= prev_cache->get_rec_ref(pos);
1609   }
1610   curr_rec_pos= pos;
1611   if (!(res= read_all_record_fields() == NO_MORE_RECORDS_IN_BUFFER))
1612   {
1613     pos+= referenced_fields*size_of_fld_ofs;
1614     if (prev_cache)
1615       prev_cache->get_record_by_pos(prev_rec_ptr);
1616   }
1617   return res;
1618 }
1619 
1620 
1621 /*
1622   Read a positioned record from the join buffer: the default implementation
1623 
1624   SYNOPSIS
1625     get_record_by_pos()
1626       rec_ptr  position of the first field of the record in the join buffer
1627 
1628   DESCRIPTION
1629     This default implementation of the virtual function get_record_pos
1630     reads the fields of the record positioned at 'rec_ptr' from the join buffer.
1631     The function also reads all other fields associated with this record
1632     from the the join buffers of the previous caches. The fields are read
1633     into the corresponding record buffers.
1634 
1635   RETURN VALUE
1636     none
1637 */
1638 
1639 void JOIN_CACHE::get_record_by_pos(uchar *rec_ptr)
1640 {
1641   uchar *save_pos= pos;
1642   pos= rec_ptr;
1643   read_all_record_fields();
1644   pos= save_pos;
1645   if (prev_cache)
1646   {
1647     uchar *prev_rec_ptr= prev_cache->get_rec_ref(rec_ptr);
1648     prev_cache->get_record_by_pos(prev_rec_ptr);
1649   }
1650 }
1651 
1652 
1653 /*
1654   Get the match flag from the referenced record: the default implementation
1655 
1656   SYNOPSIS
1657     get_match_flag_by_pos()
1658       rec_ptr  position of the first field of the record in the join buffer
1659 
1660   DESCRIPTION
1661     This default implementation of the virtual function get_match_flag_by_pos
1662     get the match flag for the record pointed by the reference at the position
1663     rec_ptr. If the match flag is placed in one of the previous buffers the
1664     function first reaches the linked record fields in this buffer.
1665     The function returns the value of the first encountered match flag.
1666 
1667   RETURN VALUE
1668     match flag for the record at the position rec_ptr
1669 */
1670 
1671 enum JOIN_CACHE::Match_flag JOIN_CACHE::get_match_flag_by_pos(uchar *rec_ptr)
1672 {
1673   Match_flag match_fl= MATCH_NOT_FOUND;
1674   if (with_match_flag)
1675   {
1676     match_fl= (enum Match_flag) rec_ptr[0];
1677     return match_fl;
1678   }
1679   if (prev_cache)
1680   {
1681     uchar *prev_rec_ptr= prev_cache->get_rec_ref(rec_ptr);
1682     return prev_cache->get_match_flag_by_pos(prev_rec_ptr);
1683   }
1684   DBUG_ASSERT(0);
1685   return match_fl;
1686 }
1687 
1688 
1689 /*
1690   Get the match flag for the referenced record from specified join buffer
1691 
1692   SYNOPSIS
1693     get_match_flag_by_pos_from_join_buffer()
1694       rec_ptr  position of the first field of the record in the join buffer
1695       tab      join table with join buffer where to look for the match flag
1696 
1697   DESCRIPTION
1698     This default implementation of the get_match_flag_by_pos_from_join_buffer
1699     method gets the match flag for the record pointed by the reference at the
1700     position rec_ptr from the join buffer attached to the join table tab.
1701 
1702   RETURN VALUE
1703     match flag for the record at the position rec_ptr from the join
1704     buffer attached to the table tab.
1705 */
1706 
1707 enum JOIN_CACHE::Match_flag
1708 JOIN_CACHE::get_match_flag_by_pos_from_join_buffer(uchar *rec_ptr,
1709                                                    JOIN_TAB *tab)
1710 {
1711   DBUG_ASSERT(tab->cache && tab->cache->with_match_flag);
1712   for (JOIN_CACHE *cache= this; ; )
1713   {
1714     if (cache->join_tab == tab)
1715       return (enum Match_flag) rec_ptr[0];
1716     cache= cache->prev_cache;
1717     rec_ptr= cache->get_rec_ref(rec_ptr);
1718   }
1719 }
1720 
1721 
1722 /*
1723   Calculate the increment of the auxiliary buffer for a record write
1724 
1725   SYNOPSIS
1726     aux_buffer_incr()
1727       recno   the number of the record the increment to be calculated for
1728 
1729   DESCRIPTION
1730     This function calls the aux_buffer_incr the method of the
1731     companion member join_tab_scan to calculate the growth of the
1732     auxiliary buffer when the recno-th record is added to the
1733     join_buffer of this cache.
1734 
1735   RETURN VALUE
1736     the number of bytes in the increment
1737 */
1738 
1739 uint JOIN_CACHE::aux_buffer_incr(size_t recno)
1740 {
1741   return join_tab_scan->aux_buffer_incr(recno);
1742 }
1743 
1744 /*
1745   Read all flag and data fields of a record from the join buffer
1746 
1747   SYNOPSIS
1748     read_all_record_fields()
1749 
1750   DESCRIPTION
1751     The function reads all flag and data fields of a record from the join
1752     buffer into the corresponding record buffers.
1753     The fields are read starting from the position 'pos' which is
1754     supposed to point to the beginning of the first record field.
1755     The function increments the value of 'pos' by the length of the
1756     read data.
1757 
1758   RETURN VALUE
1759     (-1)   if there is no more records in the join buffer
1760     length of the data read from the join buffer - otherwise
1761 */
1762 
1763 uint JOIN_CACHE::read_all_record_fields()
1764 {
1765   uchar *init_pos= pos;
1766 
1767   if (pos > last_rec_pos || !records)
1768     return NO_MORE_RECORDS_IN_BUFFER;
1769 
1770   /* First match flag, read null bitmaps and null_row flag for each table */
1771   read_flag_fields();
1772 
1773   /* Now read the remaining table fields if needed */
1774   CACHE_FIELD *copy= field_descr+flag_fields;
1775   CACHE_FIELD *copy_end= field_descr+fields;
1776   bool blob_in_rec_buff= blob_data_is_in_rec_buff(init_pos);
1777   for ( ; copy < copy_end; copy++)
1778     read_record_field(copy, blob_in_rec_buff);
1779 
1780   return (uint) (pos-init_pos);
1781 }
1782 
1783 
1784 /*
1785   Read all flag fields of a record from the join buffer
1786 
1787   SYNOPSIS
1788     read_flag_fields()
1789 
1790   DESCRIPTION
1791     The function reads all flag fields of a record from the join
1792     buffer into the corresponding record buffers.
1793     The fields are read starting from the position 'pos'.
1794     The function increments the value of 'pos' by the length of the
1795     read data.
1796 
1797   RETURN VALUE
1798     length of the data read from the join buffer
1799 */
1800 
1801 uint JOIN_CACHE::read_flag_fields()
1802 {
1803   uchar *init_pos= pos;
1804   CACHE_FIELD *copy= field_descr;
1805   CACHE_FIELD *copy_end= copy+flag_fields;
1806   if (with_match_flag)
1807   {
1808     copy->str[0]= MY_TEST((Match_flag) pos[0] == MATCH_FOUND);
1809     pos+= copy->length;
1810     copy++;
1811   }
1812   for ( ; copy < copy_end; copy++)
1813   {
1814     memcpy(copy->str, pos, copy->length);
1815     pos+= copy->length;
1816   }
1817   return (uint)(pos-init_pos);
1818 }
1819 
1820 
1821 /*
1822   Read a data record field from the join buffer
1823 
1824   SYNOPSIS
1825     read_record_field()
1826       copy             the descriptor of the data field to be read
1827       blob_in_rec_buff indicates whether this is the field from the record
1828                        whose blob data are in record buffers
1829 
1830   DESCRIPTION
1831     The function reads the data field specified by the parameter copy
1832     from the join buffer into the corresponding record buffer.
1833     The field is read starting from the position 'pos'.
1834     The data of blob values is not copied from the join buffer.
1835     The function increments the value of 'pos' by the length of the
1836     read data.
1837 
1838   RETURN VALUE
1839     length of the data read from the join buffer
1840 */
1841 
1842 uint JOIN_CACHE::read_record_field(CACHE_FIELD *copy, bool blob_in_rec_buff)
1843 {
1844   uint len;
1845   /* Do not copy the field if its value is null */
1846   if (copy->field && copy->field->maybe_null() && copy->field->is_null())
1847     return 0;
1848   switch (copy->type) {
1849   case CACHE_BLOB:
1850     {
1851       Field_blob *blob_field= (Field_blob *) copy->field;
1852       /*
1853         Copy the length and the pointer to data but not the blob data
1854         itself to the record buffer
1855       */
1856       if (blob_in_rec_buff)
1857       {
1858         blob_field->set_image(pos, copy->length + sizeof(char*),
1859                               blob_field->charset());
1860         len= copy->length + sizeof(char*);
1861       }
1862       else
1863       {
1864         blob_field->set_ptr(pos, pos+copy->length);
1865         len= copy->length + blob_field->get_length();
1866       }
1867     }
1868     break;
1869   case CACHE_VARSTR1:
1870     /* Copy the significant part of the short varstring field */
1871     len= (uint) pos[0] + 1;
1872     memcpy(copy->str, pos, len);
1873     break;
1874   case CACHE_VARSTR2:
1875     /* Copy the significant part of the long varstring field */
1876     len= uint2korr(pos) + 2;
1877     memcpy(copy->str, pos, len);
1878     break;
1879   case CACHE_STRIPPED:
1880     /* Pad the value by spaces that has been stripped off */
1881     len= uint2korr(pos);
1882     memcpy(copy->str, pos+2, len);
1883     memset(copy->str+len, ' ', copy->length-len);
1884     len+= 2;
1885     break;
1886   case CACHE_ROWID:
1887     if (!copy->str)
1888     {
1889       len= copy->length;
1890       break;
1891     }
1892     /* fall through */
1893   default:
1894     /* Copy the entire image of the field from the record buffer */
1895     len= copy->length;
1896     memcpy(copy->str, pos, len);
1897   }
1898   pos+= len;
1899   return len;
1900 }
1901 
1902 
1903 /*
1904   Read a referenced field from the join buffer
1905 
1906   SYNOPSIS
1907     read_referenced_field()
1908       copy         pointer to the descriptor of the referenced field
1909       rec_ptr      pointer to the record that may contain this field
1910       len  IN/OUT  total length of the record fields
1911 
1912   DESCRIPTION
1913     The function checks whether copy points to a data field descriptor
1914     for this cache object. If it does not then the function returns
1915     FALSE. Otherwise the function reads the field of the record in
1916     the join buffer pointed by 'rec_ptr' into the corresponding record
1917     buffer and returns TRUE.
1918     If the value of *len is 0 then the function sets it to the total
1919     length of the record fields including possible trailing offset
1920     values. Otherwise *len is supposed to provide this value that
1921     has been obtained earlier.
1922 
1923   NOTE
1924     If the value of the referenced field is null then the offset
1925     for the value is set to 0. If the value of a field can be null
1926     then the value of flag_fields is always positive. So the offset
1927     for any non-null value cannot be 0 in this case.
1928 
1929   RETURN VALUE
1930     TRUE   'copy' points to a data descriptor of this join cache
1931     FALSE  otherwise
1932 */
1933 
1934 bool JOIN_CACHE::read_referenced_field(CACHE_FIELD *copy,
1935                                        uchar *rec_ptr,
1936                                        uint *len)
1937 {
1938   uchar *ptr;
1939   uint offset;
1940   if (copy < field_descr || copy >= field_descr+fields)
1941     return FALSE;
1942   if (!*len)
1943   {
1944     /* Get the total length of the record fields */
1945     uchar *len_ptr= rec_ptr;
1946     if (prev_cache)
1947       len_ptr-= prev_cache->get_size_of_rec_offset();
1948     *len= get_rec_length(len_ptr-size_of_rec_len);
1949   }
1950 
1951   ptr= rec_ptr-(prev_cache ? prev_cache->get_size_of_rec_offset() : 0);
1952   offset= get_fld_offset(ptr+ *len -
1953                          size_of_fld_ofs*
1954                          (referenced_fields+1-copy->referenced_field_no));
1955   bool is_null= FALSE;
1956   Field *field= copy->field;
1957   if (offset == 0 && flag_fields)
1958     is_null= TRUE;
1959   if (is_null)
1960   {
1961     field->set_null();
1962     if (!field->real_maybe_null())
1963       field->table->null_row= 1;
1964   }
1965   else
1966   {
1967     uchar *save_pos= pos;
1968     field->set_notnull();
1969     if (!field->real_maybe_null())
1970       field->table->null_row= 0;
1971     pos= rec_ptr+offset;
1972     read_record_field(copy, blob_data_is_in_rec_buff(rec_ptr));
1973     pos= save_pos;
1974   }
1975   return TRUE;
1976 }
1977 
1978 
1979 /*
1980   Skip record from join buffer if's already matched: default implementation
1981 
1982   SYNOPSIS
1983     skip_if_matched()
1984 
1985   DESCRIPTION
1986     This default implementation of the virtual function skip_if_matched
1987     skips the next record from the join buffer if its  match flag is set to
1988     MATCH_FOUND.
1989     If the record is skipped the value of 'pos' is set to point to the position
1990     right after the record.
1991 
1992   NOTE
1993     Currently this function is called only when generating null complemented
1994     records for outer joins (=> only when join_tab->first_unmatched != NULL).
1995 
1996   RETURN VALUE
1997     TRUE   the match flag is set to MATCH_FOUND and the record has been skipped
1998     FALSE  otherwise
1999 */
2000 
2001 bool JOIN_CACHE::skip_if_matched()
2002 {
2003   DBUG_ASSERT(with_length);
2004   uint offset= size_of_rec_len;
2005   if (prev_cache)
2006     offset+= prev_cache->get_size_of_rec_offset();
2007   /* Check whether the match flag is MATCH_FOUND */
2008   if (get_match_flag_by_pos_from_join_buffer(pos+offset,
2009                                              join_tab->first_unmatched) ==
2010       MATCH_FOUND)
2011   {
2012     pos+= size_of_rec_len + get_rec_length(pos);
2013     return TRUE;
2014   }
2015   return FALSE;
2016 }
2017 
2018 
2019 /*
2020   Skip record from join buffer if the match isn't needed: default implementation
2021 
2022   SYNOPSIS
2023     skip_if_not_needed_match()
2024 
2025   DESCRIPTION
2026     This default implementation of the virtual function skip_if_not_needed_match
2027     skips the next record from the join when generating join extensions
2028     for the records in the join buffer depending on the value of the match flag.
2029       - In the case of a semi-nest the match flag may be in two states
2030         {MATCH_NOT_FOUND, MATCH_FOUND}. The record is skipped if the flag is set
2031         to MATCH_FOUND.
2032       - In the case of a outer join nest when not_exists optimization is applied
2033         the match may be in three states {MATCH_NOT_FOUND, MATCH_IMPOSSIBLE,
2034         MATCH_FOUND. The record is skipped if the flag is set to MATCH_FOUND or
2035         to MATCH_IMPOSSIBLE.
2036 
2037     If the record is skipped the value of 'pos' is set to point to the position
2038     right after the record.
2039 
2040   NOTE
2041     Currently the function is called only when generating non-null complemented
2042     extensions for records in the join buffer.
2043 
2044   RETURN VALUE
2045     TRUE    the record has to be skipped
2046     FALSE   otherwise
2047 */
2048 
2049 bool JOIN_CACHE::skip_if_not_needed_match()
2050 {
2051   DBUG_ASSERT(with_length);
2052   enum Match_flag match_fl;
2053   uint offset= size_of_rec_len;
2054   bool skip= FALSE;
2055   if (prev_cache)
2056     offset+= prev_cache->get_size_of_rec_offset();
2057 
2058   if (!join_tab->check_only_first_match())
2059     return FALSE;
2060 
2061   match_fl= get_match_flag_by_pos(pos+offset);
2062   skip= join_tab->first_sj_inner_tab ?
2063         match_fl == MATCH_FOUND :           // the case of semi-join
2064         match_fl != MATCH_NOT_FOUND;        // the case of outer-join
2065 
2066   if (skip)
2067   {
2068     pos+= size_of_rec_len + get_rec_length(pos);
2069     return TRUE;
2070   }
2071   return FALSE;
2072 }
2073 
2074 
2075 /*
2076   Restore the fields of the last record from the join buffer
2077 
2078   SYNOPSIS
2079     restore_last_record()
2080 
2081   DESCRIPTION
2082     This function restore the values of the fields of the last record put
2083     into join buffer in record buffers. The values most probably have been
2084     overwritten by the field values from other records when they were read
2085     from the join buffer into the record buffer in order to check pushdown
2086     predicates.
2087 
2088   RETURN
2089     none
2090 */
2091 
2092 void JOIN_CACHE::restore_last_record()
2093 {
2094   if (records)
2095     get_record_by_pos(last_rec_pos);
2096 }
2097 
2098 
2099 /*
2100   Join records from the join buffer with records from the next join table
2101 
2102   SYNOPSIS
2103     join_records()
2104       skip_last    do not find matches for the last record from the buffer
2105 
2106   DESCRIPTION
2107     The functions extends all records from the join buffer by the matched
2108     records from join_tab. In the case of outer join operation it also
2109     adds null complementing extensions for the records from the join buffer
2110     that have no match.
2111     No extensions are generated for the last record from the buffer if
2112     skip_last is true.
2113 
2114   NOTES
2115     The function must make sure that if linked join buffers are used then
2116     a join buffer cannot be refilled again until all extensions in the
2117     buffers chained to this one are generated.
2118     Currently an outer join operation with several inner tables always uses
2119     at least two linked buffers with the match join flags placed in the
2120     first buffer. Any record composed of rows of the inner tables that
2121     matches a record in this buffer must refer to the position of the
2122     corresponding match flag.
2123 
2124   IMPLEMENTATION
2125     When generating extensions for outer tables of an outer join operation
2126     first we generate all extensions for those records from the join buffer
2127     that have matches, after which null complementing extension for all
2128     unmatched records from the join buffer are generated.
2129 
2130   RETURN VALUE
2131     return one of enum_nested_loop_state, except NESTED_LOOP_NO_MORE_ROWS.
2132 */
2133 
2134 enum_nested_loop_state JOIN_CACHE::join_records(bool skip_last)
2135 {
2136   JOIN_TAB *tab;
2137   enum_nested_loop_state rc= NESTED_LOOP_OK;
2138   bool outer_join_first_inner= join_tab->is_first_inner_for_outer_join();
2139   DBUG_ENTER("JOIN_CACHE::join_records");
2140 
2141   if (outer_join_first_inner && !join_tab->first_unmatched)
2142     join_tab->not_null_compl= TRUE;
2143 
2144   if (!join_tab->first_unmatched)
2145   {
2146     /* Find all records from join_tab that match records from join buffer */
2147     rc= join_matching_records(skip_last);
2148     if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
2149       goto finish;
2150     if (outer_join_first_inner)
2151     {
2152       if (next_cache && join_tab != join_tab->last_inner)
2153       {
2154         /*
2155           Ensure that all matches for outer records from join buffer are to be
2156           found. Now we ensure that all full records are found for records from
2157           join buffer. Generally this is an overkill.
2158           TODO: Ensure that only matches of the inner table records have to be
2159           found for the records from join buffer.
2160 	*/
2161         rc= next_cache->join_records(skip_last);
2162         if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
2163           goto finish;
2164       }
2165       join_tab->not_null_compl= FALSE;
2166       /*
2167         Prepare for generation of null complementing extensions.
2168         For all inner tables of the outer join operation for which
2169         regular matches have been just found the field 'first_unmatched'
2170         is set to point the the first inner table. After all null
2171         complement rows are generated for this outer join this field
2172         is set back to NULL.
2173       */
2174       for (tab= join_tab->first_inner; tab <= join_tab->last_inner; tab++)
2175         tab->first_unmatched= join_tab->first_inner;
2176     }
2177   }
2178   if (join_tab->first_unmatched)
2179   {
2180     if (is_key_access())
2181       restore_last_record();
2182 
2183     /*
2184       Generate all null complementing extensions for the records from
2185       join buffer that don't have any matching rows from the inner tables.
2186     */
2187     reset(FALSE);
2188     rc= join_null_complements(skip_last);
2189     if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
2190       goto finish;
2191   }
2192   if(next_cache)
2193   {
2194     /*
2195       When using linked caches we must ensure the records in the next caches
2196       that refer to the records in the join buffer are fully extended.
2197       Otherwise we could have references to the records that have been
2198       already erased from the join buffer and replaced for new records.
2199     */
2200     rc= next_cache->join_records(skip_last);
2201     if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
2202       goto finish;
2203   }
2204 
2205   if (skip_last)
2206   {
2207     DBUG_ASSERT(!is_key_access());
2208     /*
2209        Restore the last record from the join buffer to generate
2210        all extensions for it.
2211     */
2212     get_record();
2213   }
2214 
2215 finish:
2216   if (outer_join_first_inner &&
2217       join_tab->first_inner == join_tab->first_unmatched)
2218   {
2219     /*
2220       All null complemented rows have been already generated for all
2221       outer records from join buffer. Restore the state of the
2222       first_unmatched values to 0 to avoid another null complementing.
2223     */
2224     for (tab= join_tab->first_inner; tab <= join_tab->last_inner; tab++)
2225       tab->first_unmatched= 0;
2226   }
2227   restore_last_record();
2228   reset(TRUE);
2229   DBUG_PRINT("exit", ("rc: %d", rc));
2230   DBUG_RETURN(rc);
2231 }
2232 
2233 
2234 /*
2235   Find matches from the next table for records from the join buffer
2236 
2237   SYNOPSIS
2238     join_matching_records()
2239       skip_last    do not look for matches for the last partial join record
2240 
2241   DESCRIPTION
2242     The function retrieves rows of the join_tab table and checks whether they
2243     match partial join records from the join buffer. If a match is found
2244     the function will call the sub_select function trying to look for matches
2245     for the remaining join operations.
2246     This function currently is called only from the function join_records.
2247     If the value of skip_last is true the function writes the partial join
2248     record from the record buffer into the join buffer to save its value for
2249     the future processing in the caller function.
2250 
2251   NOTES
2252     If employed by BNL or BNLH join algorithms the function performs a full
2253     scan of join_tab for each refill of the join buffer. If BKA or BKAH
2254     algorithms are used then the function iterates only over those records
2255     from join_tab that can be accessed by keys built over records in the join
2256     buffer. To apply a proper method of iteration the function just calls
2257     virtual iterator methods (open, next, close) of the member join_tab_scan.
2258     The member can be either of the JOIN_TAB_SCAN or JOIN_TAB_SCAN_MMR type.
2259     The class JOIN_TAB_SCAN provides the iterator methods for BNL/BNLH join
2260     algorithms. The class JOIN_TAB_SCAN_MRR provides the iterator methods
2261     for BKA/BKAH join algorithms.
2262     When the function looks for records from the join buffer that would
2263     match a record from join_tab it iterates either over all records in
2264     the buffer or only over selected records. If BNL join operation is
2265     performed all records are checked for the match. If BNLH or BKAH
2266     algorithm is employed to join join_tab then the function looks only
2267     through the records with the same join key as the record from join_tab.
2268     With the BKA join algorithm only one record from the join buffer is checked
2269     for a match for any record from join_tab. To iterate over the candidates
2270     for a match the virtual function get_next_candidate_for_match is used,
2271     while the virtual function prepare_look_for_matches is called to prepare
2272     for such iteration process.
2273 
2274   NOTES
2275     The function produces all matching extensions for the records in the
2276     join buffer following the path of the employed blocked algorithm.
2277     When an outer join operation is performed all unmatched records from
2278     the join buffer must be extended by null values. The function
2279     'join_null_complements' serves this purpose.
2280 
2281   RETURN VALUE
2282     return one of enum_nested_loop_state
2283 */
2284 
2285 enum_nested_loop_state JOIN_CACHE::join_matching_records(bool skip_last)
2286 {
2287   int error;
2288   enum_nested_loop_state rc= NESTED_LOOP_OK;
2289   join_tab->table->null_row= 0;
2290   bool check_only_first_match=
2291     join_tab->check_only_first_match() &&
2292     (!join_tab->first_inner ||                            // semi-join case
2293      join_tab->first_inner == join_tab->first_unmatched); // outer join case
2294   bool outer_join_first_inner= join_tab->is_first_inner_for_outer_join();
2295   DBUG_ENTER("JOIN_CACHE::join_matching_records");
2296 
2297   /* Return at once if there are no records in the join buffer */
2298   if (!records)
2299     DBUG_RETURN(NESTED_LOOP_OK);
2300 
2301   /*
2302     When joining we read records from the join buffer back into record buffers.
2303     If matches for the last partial join record are found through a call to
2304     the sub_select function then this partial join record must be saved in the
2305     join buffer in order to be restored just before the sub_select call.
2306   */
2307   if (skip_last)
2308     put_record();
2309 
2310   if (join_tab->use_quick == 2 && join_tab->select->quick)
2311   {
2312     /* A dynamic range access was used last. Clean up after it */
2313     delete join_tab->select->quick;
2314     join_tab->select->quick= 0;
2315   }
2316 
2317   if ((rc= join_tab_execution_startup(join_tab)) < 0)
2318     goto finish2;
2319 
2320   /* Prepare to retrieve all records of the joined table */
2321   if (unlikely((error= join_tab_scan->open())))
2322   {
2323     /*
2324       TODO: if we get here, we will assert in net_send_statement(). Add test
2325       coverage and fix.
2326     */
2327     goto finish;
2328   }
2329 
2330   while (!(error= join_tab_scan->next()))
2331   {
2332     if (unlikely(join->thd->check_killed()))
2333     {
2334       /* The user has aborted the execution of the query */
2335       rc= NESTED_LOOP_KILLED;
2336       goto finish;
2337     }
2338 
2339     if (join_tab->keep_current_rowid)
2340       join_tab->table->file->position(join_tab->table->record[0]);
2341 
2342     /* Prepare to read matching candidates from the join buffer */
2343     if (prepare_look_for_matches(skip_last))
2344       continue;
2345     join_tab->jbuf_tracker->r_scans++;
2346 
2347     uchar *rec_ptr;
2348     /* Read each possible candidate from the buffer and look for matches */
2349     while ((rec_ptr= get_next_candidate_for_match()))
2350     {
2351       join_tab->jbuf_tracker->r_rows++;
2352       /*
2353         If only the first match is needed, and, it has been already found for
2354         the next record read from the join buffer, then the record is skipped.
2355         Also those records that must be null complemented are not considered
2356         as candidates for matches.
2357       */
2358       if ((!check_only_first_match && !outer_join_first_inner) ||
2359           !skip_next_candidate_for_match(rec_ptr))
2360       {
2361 	read_next_candidate_for_match(rec_ptr);
2362         rc= generate_full_extensions(rec_ptr);
2363         if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
2364 	  goto finish;
2365       }
2366     }
2367   }
2368 
2369 finish:
2370   if (error)
2371     rc= error < 0 ? NESTED_LOOP_NO_MORE_ROWS: NESTED_LOOP_ERROR;
2372 finish2:
2373   join_tab_scan->close();
2374   DBUG_RETURN(rc);
2375 }
2376 
2377 
2378 /*
2379   Set match flag for a record in join buffer if it has not been set yet
2380 
2381   SYNOPSIS
2382     set_match_flag_if_none()
2383       first_inner     the join table to which this flag is attached to
2384       rec_ptr         pointer to the record in the join buffer
2385 
2386   DESCRIPTION
2387     If the records of the table are accumulated in a join buffer the function
2388     sets the match flag for the record in the buffer that is referred to by
2389     the record from this cache positioned at 'rec_ptr'.
2390     The function also sets the match flag 'found' of the table first inner
2391     if it has not been set before.
2392 
2393   NOTES
2394     The function assumes that the match flag for any record in any cache
2395     is placed in the first byte occupied by the record fields.
2396 
2397   RETURN VALUE
2398     TRUE   the match flag is set by this call for the first time
2399     FALSE  the match flag has been set before this call
2400 */
2401 
2402 bool JOIN_CACHE::set_match_flag_if_none(JOIN_TAB *first_inner,
2403                                         uchar *rec_ptr)
2404 {
2405   if (!first_inner->cache)
2406   {
2407     /*
2408       Records of the first inner table to which the flag is attached to
2409       are not accumulated in a join buffer.
2410     */
2411     if (first_inner->found)
2412       return FALSE;
2413     else
2414     {
2415       first_inner->found= 1;
2416       return TRUE;
2417     }
2418   }
2419   JOIN_CACHE *cache= this;
2420   while (cache->join_tab != first_inner)
2421   {
2422     cache= cache->prev_cache;
2423     DBUG_ASSERT(cache);
2424     rec_ptr= cache->get_rec_ref(rec_ptr);
2425   }
2426   if ((Match_flag) rec_ptr[0] != MATCH_FOUND)
2427   {
2428     rec_ptr[0]= MATCH_FOUND;
2429     first_inner->found= 1;
2430     return TRUE;
2431   }
2432   return FALSE;
2433 }
2434 
2435 
2436 /*
2437   Generate all full extensions for a partial join record in the buffer
2438 
2439   SYNOPSIS
2440     generate_full_extensions()
2441       rec_ptr     pointer to the record from join buffer to generate extensions
2442 
2443   DESCRIPTION
2444     The function first checks whether the current record of 'join_tab' matches
2445     the partial join record from join buffer located at 'rec_ptr'. If it is the
2446     case the function calls the join_tab->next_select method to generate
2447     all full extension for this partial join match.
2448 
2449   RETURN VALUE
2450     return one of enum_nested_loop_state.
2451 */
2452 
2453 enum_nested_loop_state JOIN_CACHE::generate_full_extensions(uchar *rec_ptr)
2454 {
2455   enum_nested_loop_state rc= NESTED_LOOP_OK;
2456   DBUG_ENTER("JOIN_CACHE::generate_full_extensions");
2457 
2458   /*
2459     Check whether the extended partial join record meets
2460     the pushdown conditions.
2461   */
2462   if (check_match(rec_ptr))
2463   {
2464     int res= 0;
2465 
2466     if (!join_tab->check_weed_out_table ||
2467         !(res= join_tab->check_weed_out_table->sj_weedout_check_row(join->thd)))
2468     {
2469       set_curr_rec_link(rec_ptr);
2470       rc= (join_tab->next_select)(join, join_tab+1, 0);
2471       if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
2472       {
2473         reset(TRUE);
2474         DBUG_RETURN(rc);
2475       }
2476     }
2477     if (res == -1)
2478     {
2479       rc= NESTED_LOOP_ERROR;
2480       DBUG_RETURN(rc);
2481     }
2482   }
2483   else if (unlikely(join->thd->is_error()))
2484     rc= NESTED_LOOP_ERROR;
2485   DBUG_RETURN(rc);
2486 }
2487 
2488 
2489 /*
2490   Check matching to a partial join record from the join buffer
2491 
2492   SYNOPSIS
2493     check_match()
2494       rec_ptr     pointer to the record from join buffer to check matching to
2495 
2496   DESCRIPTION
2497     The function checks whether the current record of 'join_tab' matches
2498     the partial join record from join buffer located at 'rec_ptr'. If this is
2499     the case and 'join_tab' is the last inner table of a semi-join or an outer
2500     join the function turns on the match flag for the 'rec_ptr' record unless
2501     it has been already set.
2502 
2503   NOTES
2504     Setting the match flag on can trigger re-evaluation of pushdown conditions
2505     for the record when join_tab is the last inner table of an outer join.
2506 
2507   RETURN VALUE
2508     TRUE   there is a match
2509     FALSE  there is no match
2510            In this case the caller must also check thd->is_error() to see
2511            if there was a fatal error for the query.
2512 */
2513 
2514 inline bool JOIN_CACHE::check_match(uchar *rec_ptr)
2515 {
2516   /* Check whether pushdown conditions are satisfied */
2517   DBUG_ENTER("JOIN_CACHE:check_match");
2518 
2519   if (join_tab->select && join_tab->select->skip_record(join->thd) <= 0)
2520     DBUG_RETURN(FALSE);
2521 
2522   join_tab->jbuf_tracker->r_rows_after_where++;
2523 
2524   if (!join_tab->is_last_inner_table())
2525     DBUG_RETURN(TRUE);
2526 
2527   /*
2528      This is the last inner table of an outer join,
2529      and maybe of other embedding outer joins, or
2530      this is the last inner table of a semi-join.
2531   */
2532   JOIN_TAB *first_inner= join_tab->get_first_inner_table();
2533   do
2534   {
2535     set_match_flag_if_none(first_inner, rec_ptr);
2536     if (first_inner->check_only_first_match() &&
2537         !join_tab->first_inner)
2538       DBUG_RETURN(TRUE);
2539     /*
2540       This is the first match for the outer table row.
2541       The function set_match_flag_if_none has turned the flag
2542       first_inner->found on. The pushdown predicates for
2543       inner tables must be re-evaluated with this flag on.
2544       Note that, if first_inner is the first inner table
2545       of a semi-join, but is not an inner table of an outer join
2546       such that 'not exists' optimization can  be applied to it,
2547       the re-evaluation of the pushdown predicates is not needed.
2548     */
2549     for (JOIN_TAB *tab= first_inner; tab <= join_tab; tab++)
2550     {
2551       if (tab->select && tab->select->skip_record(join->thd) <= 0)
2552         DBUG_RETURN(FALSE);
2553     }
2554   }
2555   while ((first_inner= first_inner->first_upper) &&
2556          first_inner->last_inner == join_tab);
2557     DBUG_RETURN(TRUE);
2558 }
2559 
2560 
2561 /*
2562   Add null complements for unmatched outer records from join buffer
2563 
2564   SYNOPSIS
2565     join_null_complements()
2566       skip_last    do not add null complements for the last record
2567 
2568   DESCRIPTION
2569     This function is called only for inner tables of outer joins.
2570     The function retrieves all rows from the join buffer and adds null
2571     complements for those of them that do not have matches for outer
2572     table records.
2573     If the 'join_tab' is the last inner table of the embedding outer
2574     join and the null complemented record satisfies the outer join
2575     condition then the the corresponding match flag is turned on
2576     unless it has been set earlier. This setting may trigger
2577     re-evaluation of pushdown conditions for the record.
2578 
2579   NOTES
2580     The same implementation of the virtual method join_null_complements
2581     is used for BNL/BNLH/BKA/BKA join algorthm.
2582 
2583   RETURN VALUE
2584     return one of enum_nested_loop_state.
2585 */
2586 
2587 enum_nested_loop_state JOIN_CACHE::join_null_complements(bool skip_last)
2588 {
2589   ulonglong cnt;
2590   enum_nested_loop_state rc= NESTED_LOOP_OK;
2591   bool is_first_inner= join_tab == join_tab->first_unmatched;
2592   DBUG_ENTER("JOIN_CACHE::join_null_complements");
2593 
2594   /* Return at once if there are no records in the join buffer */
2595   if (!records)
2596     DBUG_RETURN(NESTED_LOOP_OK);
2597 
2598   cnt= records - (is_key_access() ? 0 : MY_TEST(skip_last));
2599 
2600   /* This function may be called only for inner tables of outer joins */
2601   DBUG_ASSERT(join_tab->first_inner);
2602 
2603   for ( ; cnt; cnt--)
2604   {
2605     if (unlikely(join->thd->check_killed()))
2606     {
2607       /* The user has aborted the execution of the query */
2608       rc= NESTED_LOOP_KILLED;
2609       goto finish;
2610     }
2611     /* Just skip the whole record if a match for it has been already found */
2612     if (!is_first_inner || !skip_if_matched())
2613     {
2614       get_record();
2615       /* The outer row is complemented by nulls for each inner table */
2616       restore_record(join_tab->table, s->default_values);
2617       mark_as_null_row(join_tab->table);
2618       rc= generate_full_extensions(get_curr_rec());
2619       if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
2620         goto finish;
2621     }
2622   }
2623 
2624 finish:
2625   DBUG_RETURN(rc);
2626 }
2627 
2628 
2629 /*
2630   Save data on the join algorithm employed by the join cache
2631 
2632   SYNOPSIS
2633     save_explain_data()
2634       str  string to add the comment on the employed join algorithm to
2635 
2636   DESCRIPTION
2637     This function puts info about the type of the used join buffer (flat or
2638     incremental) and on the type of the the employed join algorithm (BNL,
2639     BNLH, BKA or BKAH) to the data structure
2640 
2641   RETURN VALUE
2642    0 ok
2643    1 error
2644 */
2645 
2646 bool JOIN_CACHE::save_explain_data(EXPLAIN_BKA_TYPE *explain)
2647 {
2648   explain->incremental= MY_TEST(prev_cache);
2649 
2650   explain->join_buffer_size= get_join_buffer_size();
2651 
2652   switch (get_join_alg()) {
2653   case BNL_JOIN_ALG:
2654     explain->join_alg= "BNL";
2655     break;
2656   case BNLH_JOIN_ALG:
2657     explain->join_alg= "BNLH";
2658     break;
2659   case BKA_JOIN_ALG:
2660     explain->join_alg= "BKA";
2661     break;
2662   case BKAH_JOIN_ALG:
2663     explain->join_alg= "BKAH";
2664     break;
2665   default:
2666     DBUG_ASSERT(0);
2667   }
2668   return 0;
2669 }
2670 
2671 /**
2672   get thread handle.
2673 */
2674 
2675 THD *JOIN_CACHE::thd()
2676 {
2677   return join->thd;
2678 }
2679 
2680 
2681 static bool add_mrr_explain_info(String *str, uint mrr_mode, handler *file)
2682 {
2683   char mrr_str_buf[128]={0};
2684   int len;
2685   len= file->multi_range_read_explain_info(mrr_mode, mrr_str_buf,
2686                                            sizeof(mrr_str_buf));
2687   if (len > 0)
2688   {
2689     if (str->length())
2690     {
2691       if (str->append(STRING_WITH_LEN("; ")))
2692         return 1;
2693     }
2694     if (str->append(mrr_str_buf, len))
2695       return 1;
2696   }
2697   return 0;
2698 }
2699 
2700 
2701 bool JOIN_CACHE_BKA::save_explain_data(EXPLAIN_BKA_TYPE *explain)
2702 {
2703   if (JOIN_CACHE::save_explain_data(explain))
2704     return 1;
2705   return add_mrr_explain_info(&explain->mrr_type, mrr_mode, join_tab->table->file);
2706 }
2707 
2708 
2709 bool JOIN_CACHE_BKAH::save_explain_data(EXPLAIN_BKA_TYPE *explain)
2710 {
2711   if (JOIN_CACHE::save_explain_data(explain))
2712     return 1;
2713   return add_mrr_explain_info(&explain->mrr_type, mrr_mode, join_tab->table->file);
2714 }
2715 
2716 
2717 /*
2718   Initialize a hashed join cache
2719 
2720   SYNOPSIS
2721     init()
2722       for_explain       join buffer is initialized for explain only
2723 
2724   DESCRIPTION
2725     The function initializes the cache structure with a hash table in it.
2726     The hash table will be used to store key values for the records from
2727     the join buffer.
2728     The function allocates memory for the join buffer and for descriptors of
2729     the record fields stored in the buffer.
2730     The function also initializes a hash table for record keys within the join
2731     buffer space.
2732 
2733   NOTES VALUE
2734     The function is supposed to be called by the init methods of the classes
2735     derived from JOIN_CACHE_HASHED.
2736 
2737   RETURN VALUE
2738     0   initialization with buffer allocations has been succeeded
2739     1   otherwise
2740 */
2741 
2742 int JOIN_CACHE_HASHED::init(bool for_explain)
2743 {
2744   int rc= 0;
2745   TABLE_REF *ref= &join_tab->ref;
2746 
2747   DBUG_ENTER("JOIN_CACHE_HASHED::init");
2748 
2749   hash_table= 0;
2750   key_entries= 0;
2751 
2752   key_length= ref->key_length;
2753 
2754   if ((rc= JOIN_CACHE::init(for_explain)) || for_explain)
2755     DBUG_RETURN (rc);
2756 
2757   if (!(key_buff= (uchar*) join->thd->alloc(key_length)))
2758     DBUG_RETURN(1);
2759 
2760   /* Take into account a reference to the next record in the key chain */
2761   pack_length+= get_size_of_rec_offset();
2762   pack_length_with_blob_ptrs+= get_size_of_rec_offset();
2763 
2764   ref_key_info= join_tab->get_keyinfo_by_key_no(join_tab->ref.key);
2765   ref_used_key_parts= join_tab->ref.key_parts;
2766 
2767   hash_func= &JOIN_CACHE_HASHED::get_hash_idx_simple;
2768   hash_cmp_func= &JOIN_CACHE_HASHED::equal_keys_simple;
2769 
2770   KEY_PART_INFO *key_part= ref_key_info->key_part;
2771   KEY_PART_INFO *key_part_end= key_part+ref_used_key_parts;
2772   for ( ; key_part < key_part_end; key_part++)
2773   {
2774     if (!key_part->field->eq_cmp_as_binary())
2775     {
2776       hash_func= &JOIN_CACHE_HASHED::get_hash_idx_complex;
2777       hash_cmp_func= &JOIN_CACHE_HASHED::equal_keys_complex;
2778       break;
2779     }
2780   }
2781 
2782   init_hash_table();
2783 
2784   rec_fields_offset= get_size_of_rec_offset()+get_size_of_rec_length()+
2785                      (prev_cache ? prev_cache->get_size_of_rec_offset() : 0);
2786 
2787   data_fields_offset= 0;
2788   if (use_emb_key)
2789   {
2790     CACHE_FIELD *copy= field_descr;
2791     CACHE_FIELD *copy_end= copy+flag_fields;
2792     for ( ; copy < copy_end; copy++)
2793       data_fields_offset+= copy->length;
2794   }
2795 
2796   DBUG_RETURN(0);
2797 }
2798 
2799 
2800 /*
2801   Initialize the hash table of a hashed join cache
2802 
2803   SYNOPSIS
2804     init_hash_table()
2805 
2806   DESCRIPTION
2807     The function estimates the number of hash table entries in the hash
2808     table to be used and initializes this hash table within the join buffer
2809     space.
2810 
2811   RETURN VALUE
2812     Currently the function always returns 0;
2813 */
2814 
2815 int JOIN_CACHE_HASHED::init_hash_table()
2816 {
2817   hash_table= 0;
2818   key_entries= 0;
2819 
2820   /* Calculate the minimal possible value of size_of_key_ofs greater than 1 */
2821   uint max_size_of_key_ofs= MY_MAX(2, get_size_of_rec_offset());
2822   for (size_of_key_ofs= 2;
2823        size_of_key_ofs <= max_size_of_key_ofs;
2824        size_of_key_ofs+= 2)
2825   {
2826     key_entry_length= get_size_of_rec_offset() + // key chain header
2827                       size_of_key_ofs +          // reference to the next key
2828                       (use_emb_key ?  get_size_of_rec_offset() : key_length);
2829 
2830     size_t space_per_rec= avg_record_length +
2831                          avg_aux_buffer_incr +
2832                          key_entry_length+size_of_key_ofs;
2833     size_t n= buff_size / space_per_rec;
2834 
2835     /*
2836       TODO: Make a better estimate for this upper bound of
2837             the number of records in in the join buffer.
2838     */
2839     size_t max_n= buff_size / (pack_length-length+
2840                              key_entry_length+size_of_key_ofs);
2841 
2842     hash_entries= (uint) (n / 0.7);
2843     set_if_bigger(hash_entries, 1);
2844 
2845     if (offset_size((uint)(max_n*key_entry_length)) <=
2846         size_of_key_ofs)
2847       break;
2848   }
2849 
2850   /* Initialize the hash table */
2851   hash_table= buff + (buff_size-hash_entries*size_of_key_ofs);
2852   cleanup_hash_table();
2853   curr_key_entry= hash_table;
2854 
2855   return 0;
2856 }
2857 
2858 
2859 /*
2860   Reallocate the join buffer of a hashed join cache
2861 
2862   SYNOPSIS
2863     realloc_buffer()
2864 
2865   DESCRITION
2866     The function reallocates the join buffer of the hashed join cache.
2867     After this it initializes a hash table within the buffer space and
2868     resets the join cache for writing.
2869 
2870   NOTES
2871     The function assumes that buff_size contains the new value for the join
2872     buffer size.
2873 
2874   RETURN VALUE
2875     0   if the buffer has been successfully reallocated
2876     1   otherwise
2877 */
2878 
2879 int JOIN_CACHE_HASHED::realloc_buffer()
2880 {
2881   int rc;
2882   free();
2883   rc= MY_TEST(!(buff= (uchar*) my_malloc(buff_size, MYF(MY_THREAD_SPECIFIC))));
2884   init_hash_table();
2885   reset(TRUE);
2886   return rc;
2887 }
2888 
2889 
2890 /*
2891   Get maximum size of the additional space per record used for record keys
2892 
2893   SYNOPSYS
2894     get_max_key_addon_space_per_record()
2895 
2896   DESCRIPTION
2897     The function returns the size of the space occupied by one key entry
2898     and one hash table entry.
2899 
2900   RETURN VALUE
2901     maximum size of the additional space per record that is used to store
2902     record keys in the hash table
2903 */
2904 
2905 uint JOIN_CACHE_HASHED::get_max_key_addon_space_per_record()
2906 {
2907   ulong len;
2908   TABLE_REF *ref= &join_tab->ref;
2909   /*
2910     The total number of hash entries in the hash tables is bounded by
2911     ceiling(N/0.7) where N is the maximum number of records in the buffer.
2912     That's why the multiplier 2 is used in the formula below.
2913   */
2914   len= (use_emb_key ?  get_size_of_rec_offset() : ref->key_length) +
2915         size_of_rec_ofs +    // size of the key chain header
2916         size_of_rec_ofs +    // >= size of the reference to the next key
2917         2*size_of_rec_ofs;   // >= 2*( size of hash table entry)
2918   return len;
2919 }
2920 
2921 
2922 /*
2923   Reset the buffer of a hashed join cache for reading/writing
2924 
2925   SYNOPSIS
2926     reset()
2927       for_writing  if it's TRUE the function reset the buffer for writing
2928 
2929   DESCRIPTION
2930     This implementation of the virtual function reset() resets the join buffer
2931     of the JOIN_CACHE_HASHED class for reading or writing.
2932     Additionally to what the default implementation does this function
2933     cleans up the hash table allocated within the buffer.
2934 
2935   RETURN VALUE
2936     none
2937 */
2938 
2939 void JOIN_CACHE_HASHED::reset(bool for_writing)
2940 {
2941   this->JOIN_CACHE::reset(for_writing);
2942   if (for_writing && hash_table)
2943     cleanup_hash_table();
2944   curr_key_entry= hash_table;
2945 }
2946 
2947 
2948 /*
2949   Add a record into the buffer of a hashed join cache
2950 
2951   SYNOPSIS
2952     put_record()
2953 
2954   DESCRIPTION
2955     This implementation of the virtual function put_record writes the next
2956     matching record into the join buffer of the JOIN_CACHE_HASHED class.
2957     Additionally to what the default implementation does this function
2958     performs the following.
2959     It extracts from the record the key value used in lookups for matching
2960     records and searches for this key in the hash tables from the join cache.
2961     If it finds the key in the hash table it joins the record to the chain
2962     of records with this key. If the key is not found in the hash table the
2963     key is placed into it and a chain containing only the newly added record
2964     is attached to the key entry. The key value is either placed in the hash
2965     element added for the key or, if the use_emb_key flag is set, remains in
2966     the record from the partial join.
2967     If the match flag field of a record contains MATCH_IMPOSSIBLE the key is
2968     not created for this record.
2969 
2970   RETURN VALUE
2971     TRUE    if it has been decided that it should be the last record
2972             in the join buffer,
2973     FALSE   otherwise
2974 */
2975 
2976 bool JOIN_CACHE_HASHED::put_record()
2977 {
2978   bool is_full;
2979   uchar *key;
2980   uint key_len= key_length;
2981   uchar *key_ref_ptr;
2982   uchar *link= 0;
2983   TABLE_REF *ref= &join_tab->ref;
2984   uchar *next_ref_ptr= pos;
2985 
2986   pos+= get_size_of_rec_offset();
2987   /* Write the record into the join buffer */
2988   if (prev_cache)
2989     link= prev_cache->get_curr_rec_link();
2990   write_record_data(link, &is_full);
2991 
2992   if (last_written_is_null_compl)
2993     return is_full;
2994 
2995   if (use_emb_key)
2996     key= get_curr_emb_key();
2997   else
2998   {
2999     /* Build the key over the fields read into the record buffers */
3000     cp_buffer_from_ref(join->thd, join_tab->table, ref);
3001     key= ref->key_buff;
3002   }
3003 
3004   /* Look for the key in the hash table */
3005   if (key_search(key, key_len, &key_ref_ptr))
3006   {
3007     uchar *last_next_ref_ptr;
3008     /*
3009       The key is found in the hash table.
3010       Add the record to the circular list of the records attached to this key.
3011       Below 'rec' is the record to be added into the record chain for the found
3012       key, 'key_ref' points to a flatten representation of the st_key_entry
3013       structure that contains the key and the head of the record chain.
3014     */
3015     last_next_ref_ptr= get_next_rec_ref(key_ref_ptr+get_size_of_key_offset());
3016     /* rec->next_rec= key_entry->last_rec->next_rec */
3017     memcpy(next_ref_ptr, last_next_ref_ptr, get_size_of_rec_offset());
3018     /* key_entry->last_rec->next_rec= rec */
3019     store_next_rec_ref(last_next_ref_ptr, next_ref_ptr);
3020     /* key_entry->last_rec= rec */
3021     store_next_rec_ref(key_ref_ptr+get_size_of_key_offset(), next_ref_ptr);
3022   }
3023   else
3024   {
3025     /*
3026       The key is not found in the hash table.
3027       Put the key into the join buffer linking it with the keys for the
3028       corresponding hash entry. Create a circular list with one element
3029       referencing the record and attach the list to the key in the buffer.
3030     */
3031     uchar *cp= last_key_entry;
3032     cp-= get_size_of_rec_offset()+get_size_of_key_offset();
3033     store_next_key_ref(key_ref_ptr, cp);
3034     store_null_key_ref(cp);
3035     store_next_rec_ref(next_ref_ptr, next_ref_ptr);
3036     store_next_rec_ref(cp+get_size_of_key_offset(), next_ref_ptr);
3037     if (use_emb_key)
3038     {
3039       cp-= get_size_of_rec_offset();
3040       store_emb_key_ref(cp, key);
3041     }
3042     else
3043     {
3044       cp-= key_len;
3045       memcpy(cp, key, key_len);
3046     }
3047     last_key_entry= cp;
3048     DBUG_ASSERT(last_key_entry >= end_pos);
3049     /* Increment the counter of key_entries in the hash table */
3050     key_entries++;
3051   }
3052   return is_full;
3053 }
3054 
3055 
3056 /*
3057   Read the next record from the buffer of a hashed join cache
3058 
3059   SYNOPSIS
3060     get_record()
3061 
3062   DESCRIPTION
3063     Additionally to what the default implementation of the virtual
3064     function get_record does this implementation skips the link element
3065     used to connect the records with the same key into a chain.
3066 
3067   RETURN VALUE
3068     TRUE    there are no more records to read from the join buffer
3069     FALSE   otherwise
3070 */
3071 
3072 bool JOIN_CACHE_HASHED::get_record()
3073 {
3074   pos+= get_size_of_rec_offset();
3075   return this->JOIN_CACHE::get_record();
3076 }
3077 
3078 
3079 /*
3080   Skip record from a hashed join buffer if its match flag is set to MATCH_FOUND
3081 
3082   SYNOPSIS
3083     skip_if_matched()
3084 
3085   DESCRIPTION
3086     This implementation of the virtual function skip_if_matched does
3087     the same as the default implementation does, but it takes into account
3088     the link element used to connect the records with the same key into a chain.
3089 
3090   RETURN VALUE
3091     TRUE    the match flag is MATCH_FOUND  and the record has been skipped
3092     FALSE   otherwise
3093 */
3094 
3095 bool JOIN_CACHE_HASHED::skip_if_matched()
3096 {
3097   uchar *save_pos= pos;
3098   pos+= get_size_of_rec_offset();
3099   if (!this->JOIN_CACHE::skip_if_matched())
3100   {
3101     pos= save_pos;
3102     return FALSE;
3103   }
3104   return TRUE;
3105 }
3106 
3107 
3108 /*
3109   Skip record from a hashed join buffer if its match flag dictates to do so
3110 
3111   SYNOPSIS
3112     skip_if_uneeded_match()
3113 
3114   DESCRIPTION
3115     This implementation of the virtual function skip_if_not_needed_match does
3116     the same as the default implementation does, but it takes into account
3117     the link element used to connect the records with the same key into a chain.
3118 
3119   RETURN VALUE
3120     TRUE    the match flag dictates to skip the record
3121     FALSE   the match flag is off
3122 */
3123 
3124 bool JOIN_CACHE_HASHED::skip_if_not_needed_match()
3125 {
3126   uchar *save_pos= pos;
3127   pos+= get_size_of_rec_offset();
3128   if (!this->JOIN_CACHE::skip_if_not_needed_match())
3129   {
3130     pos= save_pos;
3131     return FALSE;
3132   }
3133   return TRUE;
3134 }
3135 
3136 
3137 /*
3138   Search for a key in the hash table of the join buffer
3139 
3140   SYNOPSIS
3141     key_search()
3142       key             pointer to the key value
3143       key_len         key value length
3144       key_ref_ptr OUT position of the reference to the next key from
3145                       the hash element for the found key , or
3146                       a position where the reference to the the hash
3147                       element for the key is to be added in the
3148                       case when the key has not been found
3149 
3150   DESCRIPTION
3151     The function looks for a key in the hash table of the join buffer.
3152     If the key is found the functionreturns the position of the reference
3153     to the next key from  to the hash element for the given key.
3154     Otherwise the function returns the position where the reference to the
3155     newly created hash element for the given key is to be added.
3156 
3157   RETURN VALUE
3158     TRUE    the key is found in the hash table
3159     FALSE   otherwise
3160 */
3161 
3162 bool JOIN_CACHE_HASHED::key_search(uchar *key, uint key_len,
3163                                    uchar **key_ref_ptr)
3164 {
3165   bool is_found= FALSE;
3166   uint idx= (this->*hash_func)(key, key_length);
3167   uchar *ref_ptr= hash_table+size_of_key_ofs*idx;
3168   while (!is_null_key_ref(ref_ptr))
3169   {
3170     uchar *next_key;
3171     ref_ptr= get_next_key_ref(ref_ptr);
3172     next_key= use_emb_key ? get_emb_key(ref_ptr-get_size_of_rec_offset()) :
3173                             ref_ptr-key_length;
3174 
3175     if ((this->*hash_cmp_func)(next_key, key, key_len))
3176     {
3177       is_found= TRUE;
3178       break;
3179     }
3180   }
3181   *key_ref_ptr= ref_ptr;
3182   return is_found;
3183 }
3184 
3185 
3186 /*
3187   Hash function that considers a key in the hash table as byte array
3188 
3189   SYNOPSIS
3190     get_hash_idx_simple()
3191       key             pointer to the key value
3192       key_len         key value length
3193 
3194   DESCRIPTION
3195     The function calculates an index of the hash entry in the hash table
3196     of the join buffer for the given key. It considers the key just as
3197     a sequence of bytes of the length key_len.
3198 
3199   RETURN VALUE
3200     the calculated index of the hash entry for the given key
3201 */
3202 
3203 inline
3204 uint JOIN_CACHE_HASHED::get_hash_idx_simple(uchar* key, uint key_len)
3205 {
3206   ulong nr= 1;
3207   ulong nr2= 4;
3208   uchar *pos= key;
3209   uchar *end= key+key_len;
3210   for (; pos < end ; pos++)
3211   {
3212     nr^= (ulong) ((((uint) nr & 63)+nr2)*((uint) *pos))+ (nr << 8);
3213     nr2+= 3;
3214   }
3215   return nr % hash_entries;
3216 }
3217 
3218 
3219 /*
3220   Hash function that takes into account collations of the components of the key
3221 
3222   SYNOPSIS
3223     get_hash_idx_complex()
3224       key             pointer to the key value
3225       key_len         key value length
3226 
3227   DESCRIPTION
3228     The function calculates an index of the hash entry in the hash table
3229     of the join buffer for the given key. It takes into account that the
3230     components of the key may be of a varchar type with different collations.
3231     The function guarantees that the same hash value for any two equal
3232     keys that may differ as byte sequences.
3233     The function takes the info about the components of the key, their
3234     types and used collations from the class member ref_key_info containing
3235     a pointer to the descriptor of the index that can be used for the join
3236     operation.
3237 
3238   RETURN VALUE
3239     the calculated index of the hash entry for the given key
3240 */
3241 
3242 inline
3243 uint JOIN_CACHE_HASHED::get_hash_idx_complex(uchar *key, uint key_len)
3244 {
3245   return
3246     (uint) (key_hashnr(ref_key_info, ref_used_key_parts, key) % hash_entries);
3247 }
3248 
3249 
3250 /*
3251   Compare two key entries in the hash table as sequence of bytes
3252 
3253   SYNOPSIS
3254     equal_keys_simple()
3255       key1            pointer to the first key entry
3256       key2            pointer to the second key entry
3257       key_len         the length of the key values
3258 
3259   DESCRIPTION
3260     The function compares two key entries in the hash table key1 and key2
3261     as two sequences bytes of the length key_len
3262 
3263   RETURN VALUE
3264     TRUE       key1 coincides with key2
3265     FALSE      otherwise
3266 */
3267 
3268 inline
3269 bool JOIN_CACHE_HASHED::equal_keys_simple(uchar *key1, uchar *key2,
3270                                           uint key_len)
3271 {
3272   return memcmp(key1, key2, key_len) == 0;
3273 }
3274 
3275 
3276 /*
3277   Compare two key entries taking into account the used collation
3278 
3279   SYNOPSIS
3280     equal_keys_complex()
3281       key1            pointer to the first key entry
3282       key2            pointer to the second key entry
3283       key_len         the length of the key values
3284 
3285   DESCRIPTION
3286     The function checks whether two key entries in the hash table
3287     key1 and key2 are equal as, possibly, compound keys of a certain
3288     structure whose components may be of a varchar type and may
3289     employ different collations.
3290     The descriptor of the key structure is taken from the class
3291     member ref_key_info.
3292 
3293   RETURN VALUE
3294     TRUE       key1 is equal tokey2
3295     FALSE      otherwise
3296 */
3297 
3298 inline
3299 bool JOIN_CACHE_HASHED::equal_keys_complex(uchar *key1, uchar *key2,
3300                                           uint key_len)
3301 {
3302   return key_buf_cmp(ref_key_info, ref_used_key_parts, key1, key2) == 0;
3303 }
3304 
3305 
3306 /*
3307   Clean up the hash table of the join buffer
3308 
3309   SYNOPSIS
3310     cleanup_hash_table()
3311       key             pointer to the key value
3312       key_len         key value length
3313 
3314   DESCRIPTION
3315     The function cleans up the hash table in the join buffer removing all
3316     hash elements from the table.
3317 
3318   RETURN VALUE
3319     none
3320 */
3321 
3322 void JOIN_CACHE_HASHED:: cleanup_hash_table()
3323 {
3324   last_key_entry= hash_table;
3325   bzero(hash_table, (buff+buff_size)-hash_table);
3326   key_entries= 0;
3327 }
3328 
3329 
3330 /*
3331   Check whether all records in a key chain have their match flags set on
3332 
3333   SYNOPSIS
3334     check_all_match_flags_for_key()
3335       key_chain_ptr
3336 
3337   DESCRIPTION
3338     This function retrieves records in the given circular chain and checks
3339     whether their match flags are set on. The parameter key_chain_ptr shall
3340     point to the position in the join buffer storing the reference to the
3341     last element of this chain.
3342 
3343   RETURN VALUE
3344     TRUE   if each retrieved record has its match flag set to MATCH_FOUND
3345     FALSE  otherwise
3346 */
3347 
3348 bool JOIN_CACHE_HASHED::check_all_match_flags_for_key(uchar *key_chain_ptr)
3349 {
3350   uchar *last_rec_ref_ptr= get_next_rec_ref(key_chain_ptr);
3351   uchar *next_rec_ref_ptr= last_rec_ref_ptr;
3352   do
3353   {
3354     next_rec_ref_ptr= get_next_rec_ref(next_rec_ref_ptr);
3355     uchar *rec_ptr= next_rec_ref_ptr+rec_fields_offset;
3356     if (get_match_flag_by_pos(rec_ptr) != MATCH_FOUND)
3357       return FALSE;
3358   }
3359   while (next_rec_ref_ptr != last_rec_ref_ptr);
3360   return TRUE;
3361 }
3362 
3363 
3364 /*
3365   Get the next key built for the records from the buffer of a hashed join cache
3366 
3367   SYNOPSIS
3368     get_next_key()
3369       key    pointer to the buffer where the key value is to be placed
3370 
3371   DESCRIPTION
3372     The function reads the next key value stored in the hash table of the
3373     join buffer. Depending on the value of the use_emb_key flag of the
3374     join cache the value is read either from the table itself or from
3375     the record field where it occurs.
3376 
3377   RETURN VALUE
3378     length of the key value - if the starting value of 'cur_key_entry' refers
3379     to the position after that referred by the the value of 'last_key_entry',
3380     0 - otherwise.
3381 */
3382 
3383 uint JOIN_CACHE_HASHED::get_next_key(uchar ** key)
3384 {
3385   if (curr_key_entry == last_key_entry)
3386     return 0;
3387 
3388   curr_key_entry-= key_entry_length;
3389 
3390   *key = use_emb_key ? get_emb_key(curr_key_entry) : curr_key_entry;
3391 
3392   DBUG_ASSERT(*key >= buff && *key < hash_table);
3393 
3394   return key_length;
3395 }
3396 
3397 
3398 /*
3399   Initiate an iteration process over records in the joined table
3400 
3401   SYNOPSIS
3402     open()
3403 
3404   DESCRIPTION
3405     The function initiates the process of iteration over records from the
3406     joined table recurrently performed by the BNL/BKLH join algorithm.
3407 
3408   RETURN VALUE
3409     0            the initiation is a success
3410     error code   otherwise
3411 */
3412 
3413 int JOIN_TAB_SCAN::open()
3414 {
3415   save_or_restore_used_tabs(join_tab, FALSE);
3416   is_first_record= TRUE;
3417   join_tab->tracker->r_scans++;
3418   return join_init_read_record(join_tab);
3419 }
3420 
3421 
3422 /*
3423   Read the next record that can match while scanning the joined table
3424 
3425   SYNOPSIS
3426     next()
3427 
3428   DESCRIPTION
3429     The function reads the next record from the joined table that can
3430     match some records in the buffer of the join cache 'cache'. To do
3431     this the function calls the function that scans table records and
3432     looks for the next one that meets the condition pushed to the
3433     joined table join_tab.
3434 
3435   NOTES
3436     The function catches the signal that kills the query.
3437 
3438   RETURN VALUE
3439     0            the next record exists and has been successfully read
3440     error code   otherwise
3441 */
3442 
3443 int JOIN_TAB_SCAN::next()
3444 {
3445   int err= 0;
3446   int skip_rc;
3447   READ_RECORD *info= &join_tab->read_record;
3448   SQL_SELECT *select= join_tab->cache_select;
3449   THD *thd= join->thd;
3450 
3451   if (is_first_record)
3452     is_first_record= FALSE;
3453   else
3454     err= info->read_record();
3455 
3456   if (!err)
3457   {
3458     join_tab->tracker->r_rows++;
3459   }
3460 
3461   while (!err && select && (skip_rc= select->skip_record(thd)) <= 0)
3462   {
3463     if (unlikely(thd->check_killed()) || skip_rc < 0)
3464       return 1;
3465     /*
3466       Move to the next record if the last retrieved record does not
3467       meet the condition pushed to the table join_tab.
3468     */
3469     err= info->read_record();
3470     if (!err)
3471     {
3472       join_tab->tracker->r_rows++;
3473     }
3474   }
3475 
3476   if (!err)
3477     join_tab->tracker->r_rows_after_where++;
3478   return err;
3479 }
3480 
3481 
3482 /*
3483   Walk back in join order from join_tab until we encounter a join tab with
3484   tab->cache!=NULL, and save/restore tab->table->status along the way.
3485 
3486   @param save TRUE   save
3487               FALSE  restore
3488 */
3489 
3490 static void save_or_restore_used_tabs(JOIN_TAB *join_tab, bool save)
3491 {
3492   JOIN_TAB *first= join_tab->bush_root_tab?
3493                      join_tab->bush_root_tab->bush_children->start :
3494                      join_tab->join->join_tab + join_tab->join->const_tables;
3495 
3496   for (JOIN_TAB *tab= join_tab-1; tab != first && !tab->cache; tab--)
3497   {
3498     if (tab->bush_children)
3499     {
3500       for (JOIN_TAB *child= tab->bush_children->start;
3501            child != tab->bush_children->end;
3502            child++)
3503       {
3504         if (save)
3505           child->table->status= child->status;
3506         else
3507         {
3508           tab->status= tab->table->status;
3509           tab->table->status= 0;
3510         }
3511       }
3512     }
3513 
3514     if (save)
3515       tab->table->status= tab->status;
3516     else
3517     {
3518       tab->status= tab->table->status;
3519       tab->table->status= 0;
3520     }
3521   }
3522 }
3523 
3524 
3525 /*
3526   Perform finalizing actions for a scan over the table records
3527 
3528   SYNOPSIS
3529     close()
3530 
3531   DESCRIPTION
3532     The function performs the necessary restoring actions after
3533     the table scan over the joined table has been finished.
3534 
3535   RETURN VALUE
3536     none
3537 */
3538 
3539 void JOIN_TAB_SCAN::close()
3540 {
3541   save_or_restore_used_tabs(join_tab, TRUE);
3542 }
3543 
3544 
3545 /*
3546   Prepare to iterate over the BNL join cache buffer to look for matches
3547 
3548   SYNOPSIS
3549     prepare_look_for_matches()
3550       skip_last   <-> ignore the last record in the buffer
3551 
3552   DESCRIPTION
3553     The function prepares the join cache for an iteration over the
3554     records in the join buffer. The iteration is performed when looking
3555     for matches for the record from the joined table join_tab that
3556     has been placed into the record buffer of the joined table.
3557     If the value of the parameter skip_last is TRUE then the last
3558     record from the join buffer is ignored.
3559     The function initializes the counter of the records that have been
3560     not iterated over yet.
3561 
3562   RETURN VALUE
3563     TRUE    there are no records in the buffer to iterate over
3564     FALSE   otherwise
3565 */
3566 
3567 bool JOIN_CACHE_BNL::prepare_look_for_matches(bool skip_last)
3568 {
3569   if (!records)
3570     return TRUE;
3571   reset(FALSE);
3572   rem_records= (uint)records - MY_TEST(skip_last);
3573   return rem_records == 0;
3574 }
3575 
3576 
3577 /*
3578   Get next record from the BNL join cache buffer when looking for matches
3579 
3580   SYNOPSIS
3581     get_next_candidate_for_match
3582 
3583   DESCRIPTION
3584     This method is used for iterations over the records from the join
3585     cache buffer when looking for matches for records from join_tab.
3586     The methods performs the necessary preparations to read the next record
3587     from the join buffer into the record buffer by the method
3588     read_next_candidate_for_match, or, to skip the next record from the join
3589     buffer by the method skip_recurrent_candidate_for_match.
3590     This implementation of the virtual method get_next_candidate_for_match
3591     just  decrements the counter of the records that are to be iterated over
3592     and returns the current value of the cursor 'pos' as the position of
3593     the record to be processed.
3594 
3595   RETURN VALUE
3596     pointer to the position right after the prefix of the current record
3597     in the join buffer if the there is another record to iterate over,
3598     0 - otherwise.
3599 */
3600 
3601 uchar *JOIN_CACHE_BNL::get_next_candidate_for_match()
3602 {
3603   if (!rem_records)
3604     return 0;
3605   rem_records--;
3606   return pos+base_prefix_length;
3607 }
3608 
3609 
3610 /*
3611   Check whether the matching record from the BNL cache is to be skipped
3612 
3613   SYNOPSIS
3614     skip_next_candidate_for_match
3615     rec_ptr  pointer to the position in the join buffer right after the prefix
3616              of the current record
3617 
3618   DESCRIPTION
3619     This implementation of the virtual function just calls the
3620     method skip_if_not_needed_match to check whether the record referenced by
3621     ref_ptr has its match flag set either to MATCH_FOUND and join_tab is the
3622     first inner table of a semi-join, or it's set to MATCH_IMPOSSIBLE and
3623     join_tab is the first inner table of an outer join.
3624     If so, the function just skips this record setting the value of the
3625     cursor 'pos' to the position right after it.
3626 
3627   RETURN VALUE
3628     TRUE   the record referenced by rec_ptr has been skipped
3629     FALSE  otherwise
3630 */
3631 
3632 bool JOIN_CACHE_BNL::skip_next_candidate_for_match(uchar *rec_ptr)
3633 {
3634   pos= rec_ptr-base_prefix_length;
3635   return skip_if_not_needed_match();
3636 }
3637 
3638 
3639 /*
3640   Read next record from the BNL join cache buffer when looking for matches
3641 
3642   SYNOPSIS
3643     read_next_candidate_for_match
3644     rec_ptr  pointer to the position in the join buffer right after the prefix
3645              the current record.
3646 
3647   DESCRIPTION
3648     This implementation of the virtual method read_next_candidate_for_match
3649     calls the method get_record to read the record referenced by rec_ptr from
3650     the join buffer into the record buffer. If this record refers to the
3651     fields in the other join buffers the call of get_record ensures that
3652     these fields are read into the corresponding record buffers as well.
3653     This function is supposed to be called after a successful call of
3654     the method get_next_candidate_for_match.
3655 
3656   RETURN VALUE
3657     none
3658 */
3659 
3660 void JOIN_CACHE_BNL::read_next_candidate_for_match(uchar *rec_ptr)
3661 {
3662   pos= rec_ptr-base_prefix_length;
3663   get_record();
3664 }
3665 
3666 
3667 /*
3668   Initialize the BNL join cache
3669 
3670   SYNOPSIS
3671     init
3672       for_explain       join buffer is initialized for explain only
3673 
3674   DESCRIPTION
3675     The function initializes the cache structure. It is supposed to be called
3676     right after a constructor for the JOIN_CACHE_BNL.
3677 
3678   NOTES
3679     The function first constructs a companion object of the type JOIN_TAB_SCAN,
3680     then it calls the init method of the parent class.
3681 
3682   RETURN VALUE
3683     0   initialization with buffer allocations has been succeeded
3684     1   otherwise
3685 */
3686 
3687 int JOIN_CACHE_BNL::init(bool for_explain)
3688 {
3689   DBUG_ENTER("JOIN_CACHE_BNL::init");
3690 
3691   if (!(join_tab_scan= new JOIN_TAB_SCAN(join, join_tab)))
3692     DBUG_RETURN(1);
3693 
3694   DBUG_RETURN(JOIN_CACHE::init(for_explain));
3695 }
3696 
3697 
3698 /*
3699   Get the chain of records from buffer matching the current candidate for join
3700 
3701   SYNOPSIS
3702     get_matching_chain_by_join_key()
3703 
3704   DESCRIPTION
3705     This function first build a join key for the record of join_tab that
3706     currently is in the join buffer for this table. Then it looks for
3707     the key entry with this key in the hash table of the join cache.
3708     If such a key entry is found the function returns the pointer to
3709     the head of the chain of records in the join_buffer that match this
3710     key.
3711 
3712   RETURN VALUE
3713     The pointer to the corresponding circular list of records if
3714     the key entry with the join key is found, 0 - otherwise.
3715 */
3716 
3717 uchar *JOIN_CACHE_BNLH::get_matching_chain_by_join_key()
3718 {
3719   uchar *key_ref_ptr;
3720   TABLE *table= join_tab->table;
3721   TABLE_REF *ref= &join_tab->ref;
3722   KEY *keyinfo= join_tab->get_keyinfo_by_key_no(ref->key);
3723   /* Build the join key value out of the record in the record buffer */
3724   key_copy(key_buff, table->record[0], keyinfo, key_length, TRUE);
3725   /* Look for this key in the join buffer */
3726   if (!key_search(key_buff, key_length, &key_ref_ptr))
3727     return 0;
3728   return key_ref_ptr+get_size_of_key_offset();
3729 }
3730 
3731 
3732 /*
3733   Prepare to iterate over the BNLH join cache buffer to look for matches
3734 
3735   SYNOPSIS
3736     prepare_look_for_matches()
3737       skip_last   <-> ignore the last record in the buffer
3738 
3739   DESCRIPTION
3740     The function prepares the join cache for an iteration over the
3741     records in the join buffer. The iteration is performed when looking
3742     for matches for the record from the joined table join_tab that
3743     has been placed into the record buffer of the joined table.
3744     If the value of the parameter skip_last is TRUE then the last
3745     record from the join buffer is ignored.
3746     The function builds the hashed key from the join fields of join_tab
3747     and uses this key to look in the hash table of the join cache for
3748     the chain of matching records in in the join buffer. If it finds
3749     such a chain it sets  the member last_rec_ref_ptr to point to the
3750     last link of the chain while setting the member next_rec_ref_po 0.
3751 
3752   RETURN VALUE
3753     TRUE    there are no matching records in the buffer to iterate over
3754     FALSE   otherwise
3755 */
3756 
3757 bool JOIN_CACHE_BNLH::prepare_look_for_matches(bool skip_last)
3758 {
3759   uchar *curr_matching_chain;
3760   last_matching_rec_ref_ptr= next_matching_rec_ref_ptr= 0;
3761   if (!(curr_matching_chain= get_matching_chain_by_join_key()))
3762     return 1;
3763   last_matching_rec_ref_ptr= get_next_rec_ref(curr_matching_chain);
3764   return 0;
3765 }
3766 
3767 
3768 /*
3769   Get next record from the BNLH join cache buffer when looking for matches
3770 
3771   SYNOPSIS
3772     get_next_candidate_for_match
3773 
3774   DESCRIPTION
3775     This method is used for iterations over the records from the join
3776     cache buffer when looking for matches for records from join_tab.
3777     The methods performs the necessary preparations to read the next record
3778     from the join buffer into the record buffer by the method
3779     read_next_candidate_for_match, or, to skip the next record from the join
3780     buffer by the method skip_next_candidate_for_match.
3781     This implementation of the virtual method moves to the next record
3782     in the chain of all records from the join buffer that are to be
3783     equi-joined with the current record from join_tab.
3784 
3785   RETURN VALUE
3786     pointer to the beginning of the record fields in the join buffer
3787     if the there is another record to iterate over, 0 - otherwise.
3788 */
3789 
3790 uchar *JOIN_CACHE_BNLH::get_next_candidate_for_match()
3791 {
3792   if (next_matching_rec_ref_ptr == last_matching_rec_ref_ptr)
3793     return 0;
3794   next_matching_rec_ref_ptr= get_next_rec_ref(next_matching_rec_ref_ptr ?
3795                                                 next_matching_rec_ref_ptr :
3796                                                 last_matching_rec_ref_ptr);
3797   return next_matching_rec_ref_ptr+rec_fields_offset;
3798 }
3799 
3800 
3801 /*
3802   Check whether the matching record from the BNLH cache is to be skipped
3803 
3804   SYNOPSIS
3805     skip_next_candidate_for_match
3806     rec_ptr  pointer to the position in the join buffer right after
3807              the previous record
3808 
3809   DESCRIPTION
3810     This implementation of the virtual function just calls the
3811     method get_match_flag_by_pos to check whether the record referenced
3812     by ref_ptr has its match flag set to MATCH_FOUND.
3813 
3814   RETURN VALUE
3815     TRUE   the record referenced by rec_ptr has its match flag set to
3816            MATCH_FOUND
3817     FALSE  otherwise
3818 */
3819 
3820 bool JOIN_CACHE_BNLH::skip_next_candidate_for_match(uchar *rec_ptr)
3821 {
3822  return  join_tab->check_only_first_match() &&
3823           (get_match_flag_by_pos(rec_ptr) == MATCH_FOUND);
3824 }
3825 
3826 
3827 /*
3828   Read next record from the BNLH join cache buffer when looking for matches
3829 
3830   SYNOPSIS
3831     read_next_candidate_for_match
3832     rec_ptr  pointer to the position in the join buffer right after
3833              the previous record
3834 
3835   DESCRIPTION
3836     This implementation of the virtual method read_next_candidate_for_match
3837     calls the method get_record_by_pos to read the record referenced by rec_ptr
3838     from the join buffer into the record buffer. If this record refers to
3839     fields in the other join buffers the call of get_record_by_po ensures that
3840     these fields are read into the corresponding record buffers as well.
3841     This function is supposed to be called after a successful call of
3842     the method get_next_candidate_for_match.
3843 
3844   RETURN VALUE
3845     none
3846 */
3847 
3848 void JOIN_CACHE_BNLH::read_next_candidate_for_match(uchar *rec_ptr)
3849 {
3850   get_record_by_pos(rec_ptr);
3851 }
3852 
3853 
3854 /*
3855   Initialize the BNLH join cache
3856 
3857   SYNOPSIS
3858     init
3859       for_explain       join buffer is initialized for explain only
3860 
3861   DESCRIPTION
3862     The function initializes the cache structure. It is supposed to be called
3863     right after a constructor for the JOIN_CACHE_BNLH.
3864 
3865   NOTES
3866     The function first constructs a companion object of the type JOIN_TAB_SCAN,
3867     then it calls the init method of the parent class.
3868 
3869   RETURN VALUE
3870     0   initialization with buffer allocations has been succeeded
3871     1   otherwise
3872 */
3873 
3874 int JOIN_CACHE_BNLH::init(bool for_explain)
3875 {
3876   DBUG_ENTER("JOIN_CACHE_BNLH::init");
3877 
3878   if (!(join_tab_scan= new JOIN_TAB_SCAN(join, join_tab)))
3879     DBUG_RETURN(1);
3880 
3881   DBUG_RETURN(JOIN_CACHE_HASHED::init(for_explain));
3882 }
3883 
3884 
3885 /*
3886   Calculate the increment of the MRR buffer for a record write
3887 
3888   SYNOPSIS
3889     aux_buffer_incr()
3890 
3891   DESCRIPTION
3892     This implementation of the virtual function aux_buffer_incr determines
3893     for how much the size of the MRR buffer should be increased when another
3894     record is added to the cache.
3895 
3896   RETURN VALUE
3897     the increment of the size of the MRR buffer for the next record
3898 */
3899 
3900 uint JOIN_TAB_SCAN_MRR::aux_buffer_incr(size_t recno)
3901 {
3902   uint incr= 0;
3903   TABLE_REF *ref= &join_tab->ref;
3904   TABLE *tab= join_tab->table;
3905   ha_rows rec_per_key=
3906     (ha_rows) tab->key_info[ref->key].actual_rec_per_key(ref->key_parts-1);
3907   set_if_bigger(rec_per_key, 1);
3908   if (recno == 1)
3909     incr=  ref->key_length + tab->file->ref_length;
3910   incr+= (uint)(tab->file->stats.mrr_length_per_rec * rec_per_key);
3911   return incr;
3912 }
3913 
3914 
3915 /*
3916   Initiate iteration over records returned by MRR for the current join buffer
3917 
3918   SYNOPSIS
3919     open()
3920 
3921   DESCRIPTION
3922     The function initiates the process of iteration over the records from
3923     join_tab returned by the MRR interface functions for records from
3924     the join buffer. Such an iteration is performed by the BKA/BKAH join
3925     algorithm for each new refill of the join buffer.
3926     The function calls the MRR handler function multi_range_read_init to
3927     initiate this process.
3928 
3929   RETURN VALUE
3930     0            the initiation is a success
3931     error code   otherwise
3932 */
3933 
3934 int JOIN_TAB_SCAN_MRR::open()
3935 {
3936   handler *file= join_tab->table->file;
3937 
3938   join_tab->table->null_row= 0;
3939 
3940 
3941   /* Dynamic range access is never used with BKA */
3942   DBUG_ASSERT(join_tab->use_quick != 2);
3943 
3944   join_tab->tracker->r_scans++;
3945   save_or_restore_used_tabs(join_tab, FALSE);
3946 
3947   init_mrr_buff();
3948 
3949   /*
3950     Prepare to iterate over keys from the join buffer and to get
3951     matching candidates obtained with MMR handler functions.
3952   */
3953   if (!file->inited)
3954     file->ha_index_init(join_tab->ref.key, 1);
3955   ranges= cache->get_number_of_ranges_for_mrr();
3956   if (!join_tab->cache_idx_cond)
3957     range_seq_funcs.skip_index_tuple= 0;
3958   return file->multi_range_read_init(&range_seq_funcs, (void*) cache,
3959                                      ranges, mrr_mode, &mrr_buff);
3960 }
3961 
3962 
3963 /*
3964   Read the next record returned by MRR for the current join buffer
3965 
3966   SYNOPSIS
3967     next()
3968 
3969   DESCRIPTION
3970     The function reads the next record from the joined table join_tab
3971     returned by the MRR handler function multi_range_read_next for
3972     the current refill of the join buffer. The record is read into
3973     the record buffer used for join_tab records in join operations.
3974 
3975   RETURN VALUE
3976     0            the next record exists and has been successfully read
3977     error code   otherwise
3978 */
3979 
3980 int JOIN_TAB_SCAN_MRR::next()
3981 {
3982   char **ptr= (char **) cache->get_curr_association_ptr();
3983 
3984   DBUG_ASSERT(sizeof(range_id_t) == sizeof(*ptr));
3985   int rc= join_tab->table->file->multi_range_read_next((range_id_t*)ptr) ? -1 : 0;
3986   if (!rc)
3987   {
3988     join_tab->tracker->r_rows++;
3989     join_tab->tracker->r_rows_after_where++;
3990     /*
3991       If a record in in an incremental cache contains no fields then the
3992       association for the last record in cache will be equal to cache->end_pos
3993     */
3994     /*
3995     psergey: this makes no sense where HA_MRR_NO_ASSOC is used.
3996     DBUG_ASSERT(cache->buff <= (uchar *) (*ptr) &&
3997                 (uchar *) (*ptr) <= cache->end_pos);
3998     */
3999   }
4000   return rc;
4001 }
4002 
4003 
4004 static
4005 void bka_range_seq_key_info(void *init_params, uint *length,
4006                           key_part_map *map)
4007 {
4008   TABLE_REF *ref= &(((JOIN_CACHE*)init_params)->join_tab->ref);
4009   *length= ref->key_length;
4010   *map= (key_part_map(1) << ref->key_parts) - 1;
4011 }
4012 
4013 
4014 /*
4015 Initialize retrieval of range sequence for BKA join algorithm
4016 
4017 SYNOPSIS
4018   bka_range_seq_init()
4019    init_params   pointer to the BKA join cache object
4020    n_ranges      the number of ranges obtained
4021    flags         combination of MRR flags
4022 
4023 DESCRIPTION
4024   The function interprets init_param as a pointer to a JOIN_CACHE_BKA
4025   object. The function prepares for an iteration over the join keys
4026   built for all records from the cache join buffer.
4027 
4028 NOTE
4029   This function are used only as a callback function.
4030 
4031 RETURN VALUE
4032   init_param value that is to be used as a parameter of bka_range_seq_next()
4033 */
4034 
4035 static
4036 range_seq_t bka_range_seq_init(void *init_param, uint n_ranges, uint flags)
4037 {
4038   DBUG_ENTER("bka_range_seq_init");
4039   JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) init_param;
4040   cache->reset(0);
4041   DBUG_RETURN((range_seq_t) init_param);
4042 }
4043 
4044 
4045 /*
4046 Get the next range/key over records from the join buffer used by a BKA cache
4047 
4048 SYNOPSIS
4049   bka_range_seq_next()
4050     seq        the value returned by  bka_range_seq_init
4051     range  OUT reference to the next range
4052 
4053 DESCRIPTION
4054   The function interprets seq as a pointer to a JOIN_CACHE_BKA
4055   object. The function returns a pointer to the range descriptor
4056   for the key built over the next record from the join buffer.
4057 
4058 NOTE
4059   This function are used only as a callback function.
4060 
4061 RETURN VALUE
4062   FALSE   ok, the range structure filled with info about the next range/key
4063   TRUE    no more ranges
4064 */
4065 
4066 static
4067 bool bka_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
4068 {
4069   DBUG_ENTER("bka_range_seq_next");
4070   JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) rseq;
4071   TABLE_REF *ref= &cache->join_tab->ref;
4072   key_range *start_key= &range->start_key;
4073   if ((start_key->length= cache->get_next_key((uchar **) &start_key->key)))
4074   {
4075     start_key->keypart_map= (1 << ref->key_parts) - 1;
4076     start_key->flag= HA_READ_KEY_EXACT;
4077     range->end_key= *start_key;
4078     range->end_key.flag= HA_READ_AFTER_KEY;
4079     range->ptr= (char *) cache->get_curr_rec();
4080     range->range_flag= EQ_RANGE;
4081     DBUG_RETURN(0);
4082   }
4083   DBUG_RETURN(1);
4084 }
4085 
4086 
4087 /*
4088 Check whether range_info orders to skip the next record from BKA buffer
4089 
4090 SYNOPSIS
4091   bka_range_seq_skip_record()
4092     seq              value returned by bka_range_seq_init()
4093     range_info       information about the next range
4094     rowid [NOT USED] rowid of the record to be checked
4095 
4096 
4097 DESCRIPTION
4098   The function interprets seq as a pointer to a JOIN_CACHE_BKA object.
4099   The function returns TRUE if the record with this range_info
4100   is to be filtered out from the stream of records returned by
4101   multi_range_read_next().
4102 
4103 NOTE
4104   This function are used only as a callback function.
4105 
4106 RETURN VALUE
4107   1    record with this range_info is to be filtered out from the stream
4108        of records returned by multi_range_read_next()
4109   0    the record is to be left in the stream
4110 */
4111 
4112 static
4113 bool bka_range_seq_skip_record(range_seq_t rseq, range_id_t range_info, uchar *rowid)
4114 {
4115   DBUG_ENTER("bka_range_seq_skip_record");
4116   JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) rseq;
4117   bool res= cache->get_match_flag_by_pos((uchar *) range_info) ==
4118     JOIN_CACHE::MATCH_FOUND;
4119   DBUG_RETURN(res);
4120 }
4121 
4122 
4123 /*
4124 Check if the record combination from BKA cache matches the index condition
4125 
4126 SYNOPSIS
4127   bka_skip_index_tuple()
4128     rseq             value returned by bka_range_seq_init()
4129     range_info       record chain for the next range/key returned by MRR
4130 
4131 DESCRIPTION
4132   This is wrapper for JOIN_CACHE_BKA::skip_index_tuple method,
4133   see comments there.
4134 
4135 NOTE
4136   This function is used as a RANGE_SEQ_IF::skip_index_tuple callback.
4137 
4138 RETURN VALUE
4139   0    The record combination satisfies the index condition
4140   1    Otherwise
4141 */
4142 
4143 static
4144 bool bka_skip_index_tuple(range_seq_t rseq, range_id_t range_info)
4145 {
4146   DBUG_ENTER("bka_skip_index_tuple");
4147   JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) rseq;
4148   THD *thd= cache->thd();
4149   bool res;
4150   status_var_increment(thd->status_var.ha_icp_attempts);
4151   if (!(res= cache->skip_index_tuple(range_info)))
4152     status_var_increment(thd->status_var.ha_icp_match);
4153   DBUG_RETURN(res);
4154 }
4155 
4156 
4157 /*
4158 Prepare to read the record from BKA cache matching the current joined record
4159 
4160 SYNOPSIS
4161   prepare_look_for_matches()
4162     skip_last <-> ignore the last record in the buffer (always unused here)
4163 
4164 DESCRIPTION
4165   The function prepares to iterate over records in the join cache buffer
4166   matching the record loaded into the record buffer for join_tab when
4167   performing join operation by BKA join algorithm. With BKA algorithms the
4168   record loaded into the record buffer for join_tab always has a direct
4169   reference to the matching records from the join buffer. When the regular
4170   BKA join algorithm is employed the record from join_tab can refer to
4171   only one such record.
4172   The function sets the counter of the remaining records from the cache
4173   buffer that would match the current join_tab record to 1.
4174 
4175 RETURN VALUE
4176   TRUE    there are no records in the buffer to iterate over
4177   FALSE   otherwise
4178 */
4179 
4180 bool JOIN_CACHE_BKA::prepare_look_for_matches(bool skip_last)
4181 {
4182   if (!records)
4183     return TRUE;
4184   rem_records= 1;
4185   return FALSE;
4186 }
4187 
4188 
4189 /*
4190 Get the record from the BKA cache matching the current joined record
4191 
4192 SYNOPSIS
4193   get_next_candidate_for_match
4194 
4195 DESCRIPTION
4196   This method is used for iterations over the records from the join
4197   cache buffer when looking for matches for records from join_tab.
4198   The method performs the necessary preparations to read the next record
4199   from the join buffer into the record buffer by the method
4200   read_next_candidate_for_match, or, to skip the next record from the join
4201   buffer by the method skip_if_not_needed_match.
4202   This implementation of the virtual method get_next_candidate_for_match
4203   just  decrements the counter of the records that are to be iterated over
4204   and returns the value of curr_association as a reference to the position
4205   of the beginning of the record fields in the buffer.
4206 
4207 RETURN VALUE
4208   pointer to the start of the record fields in the join buffer
4209   if the there is another record to iterate over, 0 - otherwise.
4210 */
4211 
4212 uchar *JOIN_CACHE_BKA::get_next_candidate_for_match()
4213 {
4214   if (!rem_records)
4215     return 0;
4216   rem_records--;
4217   return curr_association;
4218 }
4219 
4220 
4221 /*
4222 Check whether the matching record from the BKA cache is to be skipped
4223 
4224 SYNOPSIS
4225   skip_next_candidate_for_match
4226   rec_ptr  pointer to the position in the join buffer right after
4227            the previous record
4228 
4229 DESCRIPTION
4230   This implementation of the virtual function just calls the
4231   method get_match_flag_by_pos to check whether the record referenced
4232   by ref_ptr has its match flag set to MATCH_FOUND.
4233 
4234 RETURN VALUE
4235   TRUE   the record referenced by rec_ptr has its match flag set to
4236          MATCH_FOUND
4237   FALSE  otherwise
4238 */
4239 
4240 bool JOIN_CACHE_BKA::skip_next_candidate_for_match(uchar *rec_ptr)
4241 {
4242   return join_tab->check_only_first_match() &&
4243     (get_match_flag_by_pos(rec_ptr) == MATCH_FOUND);
4244 }
4245 
4246 
4247 /*
4248 Read the next record from the BKA join cache buffer when looking for matches
4249 
4250 SYNOPSIS
4251   read_next_candidate_for_match
4252   rec_ptr  pointer to the position in the join buffer right after
4253            the previous record
4254 
4255 DESCRIPTION
4256   This implementation of the virtual method read_next_candidate_for_match
4257   calls the method get_record_by_pos to read the record referenced by rec_ptr
4258   from the join buffer into the record buffer. If this record refers to
4259   fields in the other join buffers the call of get_record_by_po ensures that
4260   these fields are read into the corresponding record buffers as well.
4261   This function is supposed to be called after a successful call of
4262   the method get_next_candidate_for_match.
4263 
4264 RETURN VALUE
4265   none
4266 */
4267 
4268 void JOIN_CACHE_BKA::read_next_candidate_for_match(uchar *rec_ptr)
4269 {
4270   get_record_by_pos(rec_ptr);
4271 }
4272 
4273 
4274 /*
4275 Initialize the BKA join cache
4276 
4277 SYNOPSIS
4278   init
4279     for_explain       join buffer is initialized for explain only
4280 
4281 
4282 DESCRIPTION
4283   The function initializes the cache structure. It is supposed to be called
4284   right after a constructor for the JOIN_CACHE_BKA.
4285 
4286 NOTES
4287   The function first constructs a companion object of the type
4288   JOIN_TAB_SCAN_MRR, then it calls the init method of the parent class.
4289 
4290 RETURN VALUE
4291   0   initialization with buffer allocations has been succeeded
4292   1   otherwise
4293 */
4294 
4295 int JOIN_CACHE_BKA::init(bool for_explain)
4296 {
4297   int res;
4298   bool check_only_first_match= join_tab->check_only_first_match();
4299 
4300   RANGE_SEQ_IF rs_funcs= { bka_range_seq_key_info,
4301     bka_range_seq_init,
4302     bka_range_seq_next,
4303     check_only_first_match ?  bka_range_seq_skip_record : 0,
4304     bka_skip_index_tuple };
4305 
4306   DBUG_ENTER("JOIN_CACHE_BKA::init");
4307 
4308   JOIN_TAB_SCAN_MRR *jsm;
4309   if (!(join_tab_scan= jsm= new JOIN_TAB_SCAN_MRR(join, join_tab,
4310                                                   mrr_mode, rs_funcs)))
4311     DBUG_RETURN(1);
4312 
4313   if ((res= JOIN_CACHE::init(for_explain)))
4314     DBUG_RETURN(res);
4315 
4316   if (use_emb_key)
4317     jsm->mrr_mode |= HA_MRR_MATERIALIZED_KEYS;
4318 
4319   DBUG_RETURN(0);
4320 }
4321 
4322 
4323 /*
4324 Get the key built over the next record from BKA join buffer
4325 
4326 SYNOPSIS
4327   get_next_key()
4328     key    pointer to the buffer where the key value is to be placed
4329 
4330 DESCRIPTION
4331   The function reads key fields from the current record in the join buffer.
4332   and builds the key value out of these fields that will be used to access
4333   the 'join_tab' table. Some of key fields may belong to previous caches.
4334   They are accessed via record references to the record parts stored in the
4335   previous join buffers. The other key fields always are placed right after
4336   the flag fields of the record.
4337   If the key is embedded, which means that its value can be read directly
4338   from the join buffer, then *key is set to the beginning of the key in
4339   this buffer. Otherwise the key is built in the join_tab->ref->key_buff.
4340   The function returns the length of the key if it succeeds ro read it.
4341   If is assumed that the functions starts reading at the position of
4342   the record length which is provided for each records in a BKA cache.
4343   After the key is built the 'pos' value points to the first position after
4344   the current record.
4345   The function just skips the records with MATCH_IMPOSSIBLE in the
4346   match flag field if there is any.
4347   The function returns 0 if the initial position is after the beginning
4348   of the record fields for last record from the join buffer.
4349 
4350 RETURN VALUE
4351   length of the key value - if the starting value of 'pos' points to
4352   the position before the fields for the last record,
4353   0 - otherwise.
4354 */
4355 
4356 uint JOIN_CACHE_BKA::get_next_key(uchar ** key)
4357 {
4358   uint len;
4359   uint32 rec_len;
4360   uchar *init_pos;
4361   JOIN_CACHE *cache;
4362 
4363 start:
4364 
4365   /* Any record in a BKA cache is prepended with its length */
4366   DBUG_ASSERT(with_length);
4367 
4368   if ((pos+size_of_rec_len) > last_rec_pos || !records)
4369     return 0;
4370 
4371   /* Read the length of the record */
4372   rec_len= get_rec_length(pos);
4373   pos+= size_of_rec_len;
4374   init_pos= pos;
4375 
4376   /* Read a reference to the previous cache if any */
4377   if (prev_cache)
4378     pos+= prev_cache->get_size_of_rec_offset();
4379 
4380   curr_rec_pos= pos;
4381 
4382   /* Read all flag fields of the record */
4383   read_flag_fields();
4384 
4385   if (with_match_flag &&
4386       (Match_flag) curr_rec_pos[0] == MATCH_IMPOSSIBLE )
4387   {
4388     pos= init_pos+rec_len;
4389     goto start;
4390   }
4391 
4392   if (use_emb_key)
4393   {
4394     /* An embedded key is taken directly from the join buffer */
4395     *key= pos;
4396     len= emb_key_length;
4397   }
4398   else
4399   {
4400     /* Read key arguments from previous caches if there are any such fields */
4401     if (external_key_arg_fields)
4402     {
4403       uchar *rec_ptr= curr_rec_pos;
4404       uint key_arg_count= external_key_arg_fields;
4405       CACHE_FIELD **copy_ptr= blob_ptr-key_arg_count;
4406       for (cache= prev_cache; key_arg_count; cache= cache->prev_cache)
4407       {
4408         uint len= 0;
4409         DBUG_ASSERT(cache);
4410         rec_ptr= cache->get_rec_ref(rec_ptr);
4411         while (!cache->referenced_fields)
4412         {
4413           cache= cache->prev_cache;
4414           DBUG_ASSERT(cache);
4415           rec_ptr= cache->get_rec_ref(rec_ptr);
4416         }
4417         while (key_arg_count &&
4418                cache->read_referenced_field(*copy_ptr, rec_ptr, &len))
4419         {
4420           copy_ptr++;
4421           --key_arg_count;
4422         }
4423       }
4424     }
4425 
4426     /*
4427       Read the other key arguments from the current record. The fields for
4428       these arguments are always first in the sequence of the record's fields.
4429     */
4430     CACHE_FIELD *copy= field_descr+flag_fields;
4431     CACHE_FIELD *copy_end= copy+local_key_arg_fields;
4432     bool blob_in_rec_buff= blob_data_is_in_rec_buff(curr_rec_pos);
4433     for ( ; copy < copy_end; copy++)
4434       read_record_field(copy, blob_in_rec_buff);
4435 
4436     /* Build the key over the fields read into the record buffers */
4437     TABLE_REF *ref= &join_tab->ref;
4438     cp_buffer_from_ref(join->thd, join_tab->table, ref);
4439     *key= ref->key_buff;
4440     len= ref->key_length;
4441   }
4442 
4443   pos= init_pos+rec_len;
4444 
4445   return len;
4446 }
4447 
4448 
4449 /*
4450 Check the index condition of the joined table for a record from the BKA cache
4451 
4452 SYNOPSIS
4453   skip_index_tuple()
4454     range_info       pointer to the record returned by MRR
4455 
4456 DESCRIPTION
4457   This function is invoked from MRR implementation to check if an index
4458   tuple matches the index condition. It is used in the case where the index
4459   condition actually depends on both columns of the used index and columns
4460   from previous tables.
4461 
4462 NOTES
4463   Accessing columns of the previous tables requires special handling with
4464   BKA. The idea of BKA is to collect record combinations in a buffer and
4465   then do a batch of ref access lookups, i.e. by the time we're doing a
4466   lookup its previous-records-combination is not in prev_table->record[0]
4467   but somewhere in the join buffer.
4468   We need to get it from there back into prev_table(s)->record[0] before we
4469   can evaluate the index condition, and that's why we need this function
4470   instead of regular IndexConditionPushdown.
4471 
4472 NOTES
4473   Possible optimization:
4474   Before we unpack the record from a previous table
4475   check if this table is used in the condition.
4476   If so then unpack the record otherwise skip the unpacking.
4477   This should be done by a special virtual method
4478   get_partial_record_by_pos().
4479 
4480 RETURN VALUE
4481   1    the record combination does not satisfies the index condition
4482   0    otherwise
4483 */
4484 
4485 bool JOIN_CACHE_BKA::skip_index_tuple(range_id_t range_info)
4486 {
4487   DBUG_ENTER("JOIN_CACHE_BKA::skip_index_tuple");
4488   get_record_by_pos((uchar*)range_info);
4489   DBUG_RETURN(!join_tab->cache_idx_cond->val_int());
4490 }
4491 
4492 
4493 
4494 /*
4495 Initialize retrieval of range sequence for the BKAH join algorithm
4496 
4497 SYNOPSIS
4498   bkah_range_seq_init()
4499     init_params   pointer to the BKAH join cache object
4500     n_ranges      the number of ranges obtained
4501     flags         combination of MRR flags
4502 
4503 DESCRIPTION
4504   The function interprets init_param as a pointer to a JOIN_CACHE_BKAH
4505   object. The function prepares for an iteration over distinct join keys
4506   built over the records from the cache join buffer.
4507 
4508 NOTE
4509   This function are used only as a callback function.
4510 
4511 RETURN VALUE
4512   init_param    value that is to be used as a parameter of
4513                 bkah_range_seq_next()
4514 */
4515 
4516 static
4517 range_seq_t bkah_range_seq_init(void *init_param, uint n_ranges, uint flags)
4518 {
4519   DBUG_ENTER("bkah_range_seq_init");
4520   JOIN_CACHE_BKAH *cache= (JOIN_CACHE_BKAH *) init_param;
4521   cache->reset(0);
4522   DBUG_RETURN((range_seq_t) init_param);
4523 }
4524 
4525 
4526 /*
4527 Get the next range/key over records from the join buffer of a BKAH cache
4528 
4529 SYNOPSIS
4530   bkah_range_seq_next()
4531     seq        value returned by  bkah_range_seq_init()
4532     range  OUT reference to the next range
4533 
4534 DESCRIPTION
4535   The function interprets seq as a pointer to a JOIN_CACHE_BKAH
4536   object. The function returns a pointer to the range descriptor
4537   for the next unique key built over records from the join buffer.
4538 
4539 NOTE
4540   This function are used only as a callback function.
4541 
4542 RETURN VALUE
4543   FALSE  ok, the range structure filled with info about the next range/key
4544   TRUE   no more ranges
4545 */
4546 
4547 static
4548 bool bkah_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
4549 {
4550   DBUG_ENTER("bkah_range_seq_next");
4551   JOIN_CACHE_BKAH *cache= (JOIN_CACHE_BKAH *) rseq;
4552   TABLE_REF *ref= &cache->join_tab->ref;
4553   key_range *start_key= &range->start_key;
4554   if ((start_key->length= cache->get_next_key((uchar **) &start_key->key)))
4555   {
4556     start_key->keypart_map= (1 << ref->key_parts) - 1;
4557     start_key->flag= HA_READ_KEY_EXACT;
4558     range->end_key= *start_key;
4559     range->end_key.flag= HA_READ_AFTER_KEY;
4560     range->ptr= (char *) cache->get_curr_key_chain();
4561     range->range_flag= EQ_RANGE;
4562     DBUG_RETURN(0);
4563   }
4564   DBUG_RETURN(1);
4565 }
4566 
4567 
4568 /*
4569 Check whether range_info orders to skip the next record from BKAH join buffer
4570 
4571 SYNOPSIS
4572   bkah_range_seq_skip_record()
4573     seq              value returned by bkah_range_seq_init()
4574     range_info       information about the next range/key returned by MRR
4575     rowid [NOT USED] rowid of the record to be checked (not used)
4576 
4577 DESCRIPTION
4578   The function interprets seq as a pointer to a JOIN_CACHE_BKAH
4579   object. The function returns TRUE if the record with this range_info
4580   is to be filtered out from the stream of records returned by
4581   multi_range_read_next().
4582 
4583 NOTE
4584   This function are used only as a callback function.
4585 
4586 RETURN VALUE
4587   1    record with this range_info is to be filtered out from the stream
4588        of records returned by multi_range_read_next()
4589   0    the record is to be left in the stream
4590 */
4591 
4592 static
4593 bool bkah_range_seq_skip_record(range_seq_t rseq, range_id_t range_info, uchar *rowid)
4594 {
4595   DBUG_ENTER("bkah_range_seq_skip_record");
4596   JOIN_CACHE_BKAH *cache= (JOIN_CACHE_BKAH *) rseq;
4597   bool res= cache->check_all_match_flags_for_key((uchar *) range_info);
4598   DBUG_RETURN(res);
4599 }
4600 
4601 
4602 /*
4603 Check if the record combination from BKAH cache matches the index condition
4604 
4605 SYNOPSIS
4606   bkah_skip_index_tuple()
4607     rseq             value returned by bka_range_seq_init()
4608     range_info       record chain for the next range/key returned by MRR
4609 
4610 DESCRIPTION
4611   This is wrapper for JOIN_CACHE_BKA_UNIQUE::skip_index_tuple method,
4612   see comments there.
4613 
4614 NOTE
4615   This function is used as a RANGE_SEQ_IF::skip_index_tuple callback.
4616 
4617 RETURN VALUE
4618   0    some records from the chain satisfy the index condition
4619   1    otherwise
4620 */
4621 
4622 static
4623 bool bkah_skip_index_tuple(range_seq_t rseq, range_id_t range_info)
4624 {
4625   DBUG_ENTER("bka_unique_skip_index_tuple");
4626   JOIN_CACHE_BKAH *cache= (JOIN_CACHE_BKAH *) rseq;
4627   THD *thd= cache->thd();
4628   bool res;
4629   status_var_increment(thd->status_var.ha_icp_attempts);
4630   if (!(res= cache->skip_index_tuple(range_info)))
4631     status_var_increment(thd->status_var.ha_icp_match);
4632   DBUG_RETURN(res);
4633 }
4634 
4635 
4636 /*
4637 Prepare to read record from BKAH cache matching the current joined record
4638 
4639 SYNOPSIS
4640   prepare_look_for_matches()
4641     skip_last <-> ignore the last record in the buffer (always unused here)
4642 
4643 DESCRIPTION
4644   The function prepares to iterate over records in the join cache buffer
4645   matching the record loaded into the record buffer for join_tab when
4646   performing join operation by BKAH join algorithm. With BKAH algorithm, if
4647   association labels are used, then record loaded into the record buffer
4648   for join_tab always has a direct reference to the chain of the mathing
4649   records from the join buffer. If association labels are not used then
4650   then the chain of the matching records is obtained by the call of the
4651   get_key_chain_by_join_key function.
4652 
4653 RETURN VALUE
4654   TRUE    there are no records in the buffer to iterate over
4655   FALSE   otherwise
4656 */
4657 
4658 bool JOIN_CACHE_BKAH::prepare_look_for_matches(bool skip_last)
4659 {
4660   last_matching_rec_ref_ptr= next_matching_rec_ref_ptr= 0;
4661   if (no_association &&
4662       !(curr_matching_chain= get_matching_chain_by_join_key())) //psergey: added '!'
4663     return 1;
4664   last_matching_rec_ref_ptr= get_next_rec_ref(curr_matching_chain);
4665   return 0;
4666 }
4667 
4668 /*
4669   Initialize the BKAH join cache
4670 
4671   SYNOPSIS
4672     init
4673       for_explain       join buffer is initialized for explain only
4674 
4675   DESCRIPTION
4676     The function initializes the cache structure. It is supposed to be called
4677     right after a constructor for the JOIN_CACHE_BKAH.
4678 
4679   NOTES
4680     The function first constructs a companion object of the type
4681     JOIN_TAB_SCAN_MRR, then it calls the init method of the parent class.
4682 
4683   RETURN VALUE
4684     0   initialization with buffer allocations has been succeeded
4685     1   otherwise
4686 */
4687 
4688 int JOIN_CACHE_BKAH::init(bool for_explain)
4689 {
4690   bool check_only_first_match= join_tab->check_only_first_match();
4691 
4692   no_association= MY_TEST(mrr_mode & HA_MRR_NO_ASSOCIATION);
4693 
4694   RANGE_SEQ_IF rs_funcs= { bka_range_seq_key_info,
4695                            bkah_range_seq_init,
4696                            bkah_range_seq_next,
4697                            check_only_first_match && !no_association ?
4698                              bkah_range_seq_skip_record : 0,
4699                            bkah_skip_index_tuple };
4700 
4701   DBUG_ENTER("JOIN_CACHE_BKAH::init");
4702 
4703   if (!(join_tab_scan= new JOIN_TAB_SCAN_MRR(join, join_tab,
4704                                              mrr_mode, rs_funcs)))
4705     DBUG_RETURN(1);
4706 
4707   DBUG_RETURN(JOIN_CACHE_HASHED::init(for_explain));
4708 }
4709 
4710 
4711 /*
4712   Check the index condition of the joined table for a record from the BKA cache
4713 
4714   SYNOPSIS
4715     skip_index_tuple()
4716       range_info       record chain returned by MRR
4717 
4718   DESCRIPTION
4719     See JOIN_CACHE_BKA::skip_index_tuple().
4720     This function is the variant for use with rhe class JOIN_CACHE_BKAH.
4721     The difference from JOIN_CACHE_BKA case is that there may be multiple
4722     previous table record combinations that share the same key(MRR range).
4723     As a consequence, we need to loop through the chain of all table record
4724     combinations that match the given MRR range key range_info until we find
4725     one that satisfies the index condition.
4726 
4727   NOTE
4728     Possible optimization:
4729     Before we unpack the record from a previous table
4730     check if this table is used in the condition.
4731     If so then unpack the record otherwise skip the unpacking.
4732     This should be done by a special virtual method
4733     get_partial_record_by_pos().
4734 
4735   RETURN VALUE
4736     1    any record combination from the chain referred by range_info
4737          does not satisfy the index condition
4738     0    otherwise
4739 
4740 
4741 */
4742 
4743 bool JOIN_CACHE_BKAH::skip_index_tuple(range_id_t range_info)
4744 {
4745   uchar *last_rec_ref_ptr= get_next_rec_ref((uchar*) range_info);
4746   uchar *next_rec_ref_ptr= last_rec_ref_ptr;
4747   DBUG_ENTER("JOIN_CACHE_BKAH::skip_index_tuple");
4748   do
4749   {
4750     next_rec_ref_ptr= get_next_rec_ref(next_rec_ref_ptr);
4751     uchar *rec_ptr= next_rec_ref_ptr + rec_fields_offset;
4752     get_record_by_pos(rec_ptr);
4753     if (join_tab->cache_idx_cond->val_int())
4754       DBUG_RETURN(FALSE);
4755   } while(next_rec_ref_ptr != last_rec_ref_ptr);
4756   DBUG_RETURN(TRUE);
4757 }
4758