#ifndef SQL_BKA_ITERATOR_H_
#define SQL_BKA_ITERATOR_H_

/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
   as published by the Free Software Foundation.

   This program is also distributed with certain software (including
   but not limited to OpenSSL) that is licensed under separate terms,
   as designated in a particular file or component or in included license
   documentation.  The authors of MySQL hereby grant you an additional
   permission to link the program and your derivative works with the
   separately licensed software that they have included with MySQL.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License, version 2.0, for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */

/**
  Batch key access (BKA) is a join strategy that uses multi-range read (MRR)
  to get better read ordering on the table on the inner side. It reads
  a block of rows from the outer side, picks up the join keys (refs) from
  each row, and sends them all off in one big read request. The handler can
  then order these and read them in whatever order it would prefer. This is
  especially attractive when used with rotating media; the reads can then be
  ordered such that they do not require constant seeking (disk-sweep MRR,
  or DS-MRR).

  BKA is implemented with two iterators working in concert. The BKAIterator
  reads rows from the outer side into a buffer. When the buffer is full or we
  are out of rows, it then sets up the key ranges and hands them over to the
  MultiRangeRowIterator, which does the actual request, and reads rows from it
  until there are none left. For each inner row returned, MultiRangeRowIterator
  loads the appropriate outer row(s) from the buffer, doing the actual join.

  The reason for this split is twofold. First, it allows us to accurately time
  (for EXPLAIN ANALYZE) the actual table read. Second, and more importantly,
  we can have other iterators between the BKAIterator and MultiRangeRowIterator,
  in particular FilterIterator.
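
  As a rough sketch (simplified pseudocode, not the actual implementation;
  the real logic, including memory accounting and the outer-join bookkeeping
  described in this file, lives in BKAIterator::Read() and its helpers):

    while (the outer side has more rows) {
      read outer rows into the buffer until it is full (or we hit EOF);
      mrr_iterator->set_rows(begin, end);  // Hand over the key ranges.
      inner_input->Init();                 // Triggers the actual MRR request.
      while (inner_input->Read() == 0) {
        // The matching outer row(s) have been loaded into the table
        // buffers; return the joined row to our caller.
      }
    }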
 */

#include <stddef.h>
#include <sys/types.h>
#include <memory>
#include <string>
#include <vector>

#include "my_alloc.h"
#include "my_inttypes.h"
#include "sql/handler.h"
#include "sql/hash_join_buffer.h"
#include "sql/mem_root_array.h"
#include "sql/row_iterator.h"
#include "sql_string.h"
#include "template_utils.h"

class Item;
class MultiRangeRowIterator;
class QEP_TAB;
class THD;
struct KEY_MULTI_RANGE;
struct TABLE;
struct TABLE_REF;

/**
  The BKA join iterator, with an arbitrary iterator tree on the outer side
  and a MultiRangeRowIterator on the inner side (possibly with a filter or
  similar in-between). See file comment for more details.
 */
class BKAIterator final : public RowIterator {
 public:
  /**
    @param thd Thread handle.
    @param join The JOIN we are part of.
    @param outer_input The iterator to read the outer rows from.
    @param outer_input_tables QEP_TAB for each outer table involved.
      Used to know which fields we are to read into our buffer.
    @param inner_input The iterator to read the inner rows from.
      Must end up in a MultiRangeRowIterator.
    @param max_memory_available Number of bytes available for row buffers,
      both outer rows and MRR buffers. Note that allocation is incremental,
      so we can allocate less than this.
    @param mrr_bytes_needed_for_single_inner_row Number of bytes MRR needs
      space for in its buffer for holding a single row from the inner table.
    @param expected_inner_rows_per_outer_row Number of inner rows we
      statistically expect for each outer row. Used for dividing the buffer
      space between inner rows and MRR row buffer (if we expect many inner
      rows, we can't load as many outer rows).
    @param mrr_iterator Pointer to the MRR iterator at the bottom of
      inner_input. Used to send row ranges and buffers.
    @param join_type What kind of join we are executing.
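
    As a worked illustration (the numbers here are made up): with
    max_memory_available = 4 MB, mrr_bytes_needed_for_single_inner_row = 128
    and expected_inner_rows_per_outer_row = 4.0, each buffered outer row
    implicitly reserves about 4.0 * 128 = 512 bytes of MRR buffer space, so
    we stop buffering outer rows well before they alone would use up the 4 MB.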
   */
  BKAIterator(THD *thd, JOIN *join,
              unique_ptr_destroy_only<RowIterator> outer_input,
              qep_tab_map outer_input_tables,
              unique_ptr_destroy_only<RowIterator> inner_input,
              size_t max_memory_available,
              size_t mrr_bytes_needed_for_single_inner_row,
              float expected_inner_rows_per_outer_row,
              MultiRangeRowIterator *mrr_iterator, JoinType join_type);

  bool Init() override;

  int Read() override;

  void SetNullRowFlag(bool is_null_row) override {
    m_outer_input->SetNullRowFlag(is_null_row);
    m_inner_input->SetNullRowFlag(is_null_row);
  }

  void UnlockRow() override {
    // Since we don't know which condition caused the row to be rejected,
    // we can't know whether we could also unlock the outer row
    // (it may still be used as part of other joined rows).
    if (m_state == State::RETURNING_JOINED_ROWS) {
      m_inner_input->UnlockRow();
    }
  }

  std::vector<std::string> DebugString() const override;

  std::vector<Child> children() const override {
    return std::vector<Child>{
        {m_outer_input.get(), "Batch input rows"},
        {m_inner_input.get(), ""},
    };
  }

  void EndPSIBatchModeIfStarted() override {
    m_outer_input->EndPSIBatchModeIfStarted();
    m_inner_input->EndPSIBatchModeIfStarted();
  }

 private:
  /// Clear out the MEM_ROOT and prepare for reading rows anew.
  void BeginNewBatch();

  /// If there are more outer rows, begin the next batch. If not,
  /// move to the EOF state.
  void BatchFinished();

  /// Find the next unmatched row, and load it for output as a NULL-complemented
  /// row. (Assumes the NULL row flag has already been set on the inner table
  /// iterator.) Returns 0 if a row was found, -1 if no row was found. (Errors
  /// cannot happen.)
  int MakeNullComplementedRow();

  /// Read a batch of outer rows (BeginNewBatch() must have been called
  /// earlier). Returns -1 for no outer rows found (sets state to END_OF_ROWS),
  /// 0 for OK (sets state to RETURNING_JOINED_ROWS) or 1 for error.
  int ReadOuterRows();

  enum class State {
    /**
      We are about to start reading outer rows into our buffer.
      A single Read() call will fill it up, so there is no
      in-between “currently reading” state.
     */
    NEED_OUTER_ROWS,

    /**
      We are returning rows from the MultiRangeRowIterator.
      (For antijoins, we are looking up the rows, but don't actually
      return them.)
     */
    RETURNING_JOINED_ROWS,

    /**
      We are an outer join or antijoin, and we're returning NULL-complemented
      rows for those outer rows that never had a matching inner row. Note that
      this is done in the BKAIterator and not the MRR iterator for two reasons:
      First, it gives more sensible EXPLAIN ANALYZE numbers. Second, the
      NULL-complemented rows could be filtered inadvertently by a FilterIterator
      before they reach the BKAIterator.
     */
    RETURNING_NULL_COMPLEMENTED_ROWS,

    /**
      Both the outer and inner side are out of rows.
     */
    END_OF_ROWS
  };
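
  // A sketch of the state transitions, as implied by the documentation above
  // (the authoritative logic is in Read() and its helpers):
  //
  //   NEED_OUTER_ROWS --ReadOuterRows(), rows found--> RETURNING_JOINED_ROWS
  //   NEED_OUTER_ROWS --ReadOuterRows(), no rows-----> END_OF_ROWS
  //   RETURNING_JOINED_ROWS --batch done (inner join/semijoin)-->
  //     NEED_OUTER_ROWS
  //   RETURNING_JOINED_ROWS --batch done (outer join/antijoin)-->
  //     RETURNING_NULL_COMPLEMENTED_ROWS --unmatched rows emitted-->
  //     NEED_OUTER_ROWS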

  State m_state;

  const unique_ptr_destroy_only<RowIterator> m_outer_input;
  const unique_ptr_destroy_only<RowIterator> m_inner_input;

  /// The MEM_ROOT we are storing the outer rows on, and also allocating MRR
  /// buffer from. In total, this should not go significantly over
  /// m_max_memory_available bytes.
  MEM_ROOT m_mem_root;

  /// Buffered outer rows.
  Mem_root_array<hash_join_buffer::BufferRow> m_rows;

  /// Tables and columns needed for each outer row. Rows/columns that are not
  /// needed are filtered out in the constructor; the rest are read and stored
  /// in m_rows.
  hash_join_buffer::TableCollection m_outer_input_tables;

  /// Used for serializing the row we read from the outer table(s), before it
  /// is stored into the MEM_ROOT and put into m_rows. Should there not be room
  /// in m_rows for the row, it will stay in this variable until we start
  /// reading the next batch of outer rows.
  ///
  /// If there are no BLOB/TEXT columns in the join, we calculate an upper
  /// bound of the row size that is used to preallocate this buffer. In the
  /// case of BLOB/TEXT columns, we cannot calculate a reasonable upper bound,
  /// and the row size is calculated per row. The allocated memory is kept for
  /// the duration of the iterator, so that we (most likely) avoid
  /// reallocations.
  String m_outer_row_buffer;

  /// Whether we have a row in m_outer_row_buffer from the previous batch of
  /// rows that we haven't stored in m_rows yet.
  bool m_has_row_from_previous_batch = false;

  /// For each outer row, how many bytes we need in the MRR buffer (i.e., the
  /// number of bytes we expect to use on rows from the inner table).
  /// This is the expected number of inner rows per key, multiplied by the
  /// (fixed) size of each inner row. We use this information to stop scanning
  /// before we've used up the entire RAM allowance on outer rows, so that
  /// we have space remaining for the inner rows (in the MRR buffer), too.
  size_t m_mrr_bytes_needed_per_row;

  /// Estimated number of bytes used on m_mem_root so far.
  size_t m_bytes_used = 0;

  /// Whether we've seen EOF from the outer iterator.
  bool m_end_of_outer_rows = false;

  /// See max_memory_available in the constructor.
  const size_t m_max_memory_available;

  /// See mrr_bytes_needed_for_single_inner_row in the constructor.
  const size_t m_mrr_bytes_needed_for_single_inner_row;

  /// See mrr_iterator in the constructor.
  MultiRangeRowIterator *const m_mrr_iterator;

  /// The join type of the BKA join.
  JoinType m_join_type;

  /// If we are synthesizing NULL-complemented rows (for an outer join or
  /// antijoin), points to the next row within "m_rows" that we haven't
  /// considered yet.
  hash_join_buffer::BufferRow *m_current_pos;
};

/**
  The iterator actually doing the reads from the inner table during BKA.
  See file comment.
 */
class MultiRangeRowIterator final : public TableRowIterator {
 public:
  /**
    @param thd Thread handle.
    @param cache_idx_cond See m_cache_idx_cond.
    @param table The inner table to scan.
    @param keep_current_rowid If true, get the row ID on the inner table
      for each row that we return. (Row IDs for outer tables will be
      controlled by outer_input_tables.)
    @param ref The index condition we are looking up on.
    @param mrr_flags Flags passed on to MRR.
   */
  MultiRangeRowIterator(THD *thd, Item *cache_idx_cond, TABLE *table,
                        bool keep_current_rowid, TABLE_REF *ref, int mrr_flags);

  /**
    Tell the MRR iterator which tables are on the left side of the BKA join
    (the MRR iterator is always alone on the right side). This is needed so
    that it can unpack the rows into the right tables, with the right format.

    Must be called exactly once, before first Init(). Set by BKAIterator's
    constructor; it's not easily available at the point where we construct
    MultiRangeRowIterator.
   */
  void set_outer_input_tables(JOIN *join, qep_tab_map outer_input_tables);

  /**
    Tell the MRR iterator what kind of BKA join it is part of.

    Must be called exactly once, before first Init(). Set by BKAIterator's
    constructor; it's not easily available at the point where we construct
    MultiRangeRowIterator.
   */
  void set_join_type(JoinType join_type) { m_join_type = join_type; }

  /**
    Specify which outer rows to read inner rows for.
    Must be called before Init(), and be valid until the last Read().
   */
  void set_rows(const hash_join_buffer::BufferRow *begin,
                const hash_join_buffer::BufferRow *end) {
    m_begin = begin;
    m_end = end;
  }

  /**
    Specify an unused chunk of memory MRR can use for the returned inner rows.
    Must be called before Init(), and must be at least big enough to hold
    one inner row.
   */
  void set_mrr_buffer(uchar *ptr, size_t size) {
    m_mrr_buffer.buffer = ptr;
    m_mrr_buffer.buffer_end = ptr + size;
  }

  /**
    Specify an unused chunk of memory that we can use to mark which inner rows
    have been read (by the parent BKA iterator) or not. This is used for outer
    joins to know which rows need NULL-complemented versions, and for semijoins
    and antijoins to avoid matching the same inner row more than once.

    Must be called before Init() for semijoins, outer joins and antijoins, and
    never called otherwise. There must be room for at least one bit per row
    given in set_rows().
   */
  void set_match_flag_buffer(uchar *ptr) { m_match_flag_buffer = ptr; }

  /**
    Mark that the BKA iterator has seen the last row we returned from Read().
    (The row could have been discarded by a FilterIterator before it reached
    the BKA iterator.) Will be a no-op for inner joins; see
    set_match_flag_buffer().
   */
  void MarkLastRowAsRead() {
    if (m_match_flag_buffer != nullptr) {
      size_t row_number = std::distance(m_begin, m_last_row_returned);
      m_match_flag_buffer[row_number / 8] |= 1 << (row_number % 8);
    }
  }
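
  // Illustration of the match-flag bitmap above (hypothetical numbers): if
  // set_rows() spans 20 outer rows, the buffer needs ceil(20 / 8) = 3 bytes.
  // Outer row number 10 then maps to bit 10 % 8 = 2 of byte 10 / 8 = 1, so
  // marking it as read sets m_match_flag_buffer[1] |= 1 << 2.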

  /**
    Check whether the given row has been marked as read
    (using MarkLastRowAsRead()) or not. Used internally when doing semijoins,
    and also by the BKAIterator when synthesizing NULL-complemented rows for
    outer joins or antijoins.
   */
  bool RowHasBeenRead(const hash_join_buffer::BufferRow *row) const {
    DBUG_ASSERT(m_match_flag_buffer != nullptr);
    size_t row_number = std::distance(m_begin, row);
    return m_match_flag_buffer[row_number / 8] & (1 << (row_number % 8));
  }

  /**
    Do the actual multi-range read with the rows given by set_rows() and using
    the temporary buffer given in set_mrr_buffer().
   */
  bool Init() override;

  /**
    Read another inner row (if any) and load the appropriate outer row(s)
    into the associated table buffers.
   */
  int Read() override;

  std::vector<std::string> DebugString() const override;

 private:
  // Thunks that bridge the C-style MRR callback interface (plain function
  // pointers) to the actual member-function callbacks below.
  static range_seq_t MrrInitCallbackThunk(void *init_params, uint n_ranges,
                                          uint flags) {
    return (pointer_cast<MultiRangeRowIterator *>(init_params))
        ->MrrInitCallback(n_ranges, flags);
  }
  static uint MrrNextCallbackThunk(void *init_params, KEY_MULTI_RANGE *range) {
    return (pointer_cast<MultiRangeRowIterator *>(init_params))
        ->MrrNextCallback(range);
  }
  static bool MrrSkipIndexTupleCallbackThunk(range_seq_t seq,
                                             char *range_info) {
    return (reinterpret_cast<MultiRangeRowIterator *>(seq))
        ->MrrSkipIndexTuple(range_info);
  }
  static bool MrrSkipRecordCallbackThunk(range_seq_t seq, char *range_info,
                                         uchar *) {
    return (reinterpret_cast<MultiRangeRowIterator *>(seq))
        ->MrrSkipRecord(range_info);
  }

  // Callbacks we get from the handler during the actual read.
  range_seq_t MrrInitCallback(uint n_ranges, uint flags);
  uint MrrNextCallback(KEY_MULTI_RANGE *range);
  bool MrrSkipIndexTuple(char *range_info);
  bool MrrSkipRecord(char *range_info);

  /**
    There are certain conditions that would normally be pushed down to indexes,
    but that depend on the values of outer tables in the BKA join (i.e., they
    are join conditions), which are not set when we actually read the inner
    row.[1] Thus, we cannot push them all the way down to the handler; however,
    MRR gives us a similar mechanism that we can use. Specifically, if we set
    “skip_index_tuple” to a function pointer, we will be called back for each
    row, and can load the outer table row(s) we need to evaluate the condition.
    This allows us to reject the rows based on the index entry alone, without
    loading the row itself.

    It is unclear how much benefit this gives us over simply not pushing these
    conditions at all. The case of a join condition that is satisfiable using
    the index tuple but not simply pushable down into the ref is rare; it has
    to either be on a keypart we couldn't use (e.g., an index on A,B,C where we
    join on A and C but not B -- A then becomes part of our ref, but C needs to
    be an index condition) or a condition that needs to be rechecked, which
    happens only when mixing PAD SPACE / NO PAD in a join (e.g. looking up in a
    CHAR column, but wanting the comparison as NO PAD). Especially the latter
    case would seem unlikely to filter away a significant number of rows.

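    A hypothetical example of the first case, given an index on t2(a, b, c):

      SELECT ... FROM t1 JOIN t2 ON t2.a = t1.a AND t2.c = t1.c;

    t2.a = t1.a becomes part of the ref, but t2.c = t1.c cannot, since b is
    unbound; it can still be checked against the index tuple alone through
    skip_index_tuple, before the full row is read.
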
    [1] In the DebugString output, we call such conditions “dependent
        index conditions”, since they depend on values from other tables,
        analogous to dependent subqueries. Internally, they are called
        cache_idx_cond, presumably because BKA originated in join buffering,
        also known as join cache.
   */
  Item *const m_cache_idx_cond;

  /// See constructor.
  const bool m_keep_current_rowid;

  /// The table we are reading from.
  TABLE *const m_table;

  /// Handler for the table we are reading from.
  handler *const m_file;

  /// The index condition.
  TABLE_REF *const m_ref;

  /// Flags passed on to MRR.
  const int m_mrr_flags;

  /// Current outer rows to read inner rows for. Set by set_rows().
  const hash_join_buffer::BufferRow *m_begin;
  const hash_join_buffer::BufferRow *m_end;

  /// Which row we are at in the [m_begin, m_end) range.
  /// Used during the MRR callbacks.
  const hash_join_buffer::BufferRow *m_current_pos;

  /// What row we last returned from Read() (used for MarkLastRowAsRead()).
  const hash_join_buffer::BufferRow *m_last_row_returned;

  /// Temporary space for storing inner rows, used by MRR.
  /// Set by set_mrr_buffer().
  HANDLER_BUFFER m_mrr_buffer;

  /// See set_match_flag_buffer().
  uchar *m_match_flag_buffer = nullptr;

  /// Tables and columns needed for each outer row. Same as m_outer_input_tables
  /// in the corresponding BKAIterator.
  hash_join_buffer::TableCollection m_outer_input_tables;

  /// The join type of the BKA join we are part of. Same as m_join_type in the
  /// corresponding BKAIterator.
  JoinType m_join_type = JoinType::INNER;
};

#endif  // SQL_BKA_ITERATOR_H_