1 #ifndef SQL_HASH_JOIN_ITERATOR_H_
2 #define SQL_HASH_JOIN_ITERATOR_H_
3 
4 /* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
5 
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License, version 2.0,
8    as published by the Free Software Foundation.
9 
10    This program is also distributed with certain software (including
11    but not limited to OpenSSL) that is licensed under separate terms,
12    as designated in a particular file or component or in included license
13    documentation.  The authors of MySQL hereby grant you an additional
14    permission to link the program and your derivative works with the
15    separately licensed software that they have included with MySQL.
16 
17    This program is distributed in the hope that it will be useful,
18    but WITHOUT ANY WARRANTY; without even the implied warranty of
19    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20    GNU General Public License, version 2.0, for more details.
21 
22    You should have received a copy of the GNU General Public License
23    along with this program; if not, write to the Free Software
24    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
25 
26 #include <stdio.h>
27 #include <cstdint>
28 #include <memory>
29 #include <string>
30 #include <vector>
31 
32 #include "my_alloc.h"
33 #include "my_inttypes.h"
34 #include "sql/hash_join_buffer.h"
35 #include "sql/hash_join_chunk.h"
36 #include "sql/item_cmpfunc.h"
37 #include "sql/mem_root_array.h"
38 #include "sql/row_iterator.h"
39 #include "sql/table.h"
40 #include "sql_string.h"
41 
42 class THD;
43 class QEP_TAB;
44 
45 struct ChunkPair {
46   HashJoinChunk probe_chunk;
47   HashJoinChunk build_chunk;
48 };
49 
50 /// @file
51 ///
52 /// An iterator for joining two inputs by using hashing to match rows from
53 /// the inputs.
54 ///
55 /// The iterator starts out by doing everything in-memory. If everything fits
56 /// into memory, the joining algorithm for inner joins works like this:
57 ///
58 /// 1) Designate one input as the "build" input and one input as the "probe"
59 /// input. Ideally, the smallest input measured in total size (not number of
60 /// rows) should be designated as the build input.
61 ///
62 /// 2) Read all the rows from the build input into an in-memory hash table.
63 /// The hash key used in the hash table is calculated from the join attributes,
64 /// e.g., if we have the following query where "orders" is designated as the
65 /// build input:
66 ///
67 ///   SELECT * FROM lineitem
68 ///     INNER JOIN orders ON orders.o_orderkey = lineitem.l_orderkey;
69 ///
70 /// the hash value will be calculated from the values in the column
71 /// orders.o_orderkey. Note that the optimizer recognizes implicit join
72 /// conditions, so this also works for SQL statements like:
73 ///
74 ///   SELECT * FROM orders, lineitem
75 ///     WHERE orders.o_orderkey = lineitem.l_orderkey;
76 ///
77 /// 3) Then, we read the rows from the probe input, one by one. For each row,
78 /// a hash key is calculated for the other side of the join (the probe input)
79 /// using the join attribute (lineitem.l_orderkey in the above example) and the
80 /// same hash function as in step 2. This hash key is used to do a lookup in the
81 /// hash table, and for each match, an output row is produced. Note that the row
82 /// from the probe input is already located in the table record buffers, and the
83 /// matching row stored in the hash table is restored back to the record buffers
84 /// where it originally came from. For details around how rows are stored and
85 /// restored, see comments on hash_join_buffer::StoreFromTableBuffers.
86 ///
87 /// The size of the in-memory hash table is controlled by the system variable
88 /// join_buffer_size. If we run out of memory during step 2, we degrade into a
89 /// hybrid hash join. The data already in memory is processed using regular hash
90 /// join, and the remainder is processed using on-disk hash join. It works like
91 /// this:
92 ///
93 /// 1) The rest of the rows in the build input that did not fit into the hash
94 /// table are partitioned out into a given amount of files, represented by
95 /// HashJoinChunks. We create an equal number of chunk files for both the probe
96 /// and build input. We determine which file to put a row in by calculating a
97 /// hash from the join attribute like in step 2 above, but using a different
98 /// hash function.
99 ///
100 /// 2) Then, we read the rows from the probe input, one by one. We look for a
101 /// match in the hash table as described above, but the row is also written out
102 /// to the chunk file on disk, since it might match a row from the build input
103 /// that we've written to disk.
104 ///
105 /// 3) When the entire probe input is read, we run the "classic" hash join on
106 /// each of the corresponding chunk file probe/build pairs. Since the rows are
107 /// partitioned using the same hash function for probe and build inputs, we know
108 /// that matching rows must be located in the same pair of chunk files.
109 ///
110 /// The algorithm for semijoin is quite similar to inner joins:
111 ///
112 /// 1) Designate the inner table (i.e. the IN-side of a semijoin) as the build
113 /// input. As semijoins only needs the first matching row from the inner table,
114 /// we do not store duplicate keys in the hash table.
115 ///
116 /// 2) Output all rows from the probe input where there is at least one matching
117 /// row in the hash table. In case we have degraded into on-disk hash join, we
118 /// write the probe row out to chunk file only if we did not find a matching row
119 /// in the hash table.
120 ///
121 /// The optimizer may set up semijoins with conditions that are not pure join
122 /// conditions, but that must be attached to the hash join iterator anyways.
123 /// Consider the following query and (slightly modified) execution plan:
124 ///
125 ///   SELECT c FROM t WHERE 1 IN (SELECT t.c = col1 FROM t1);
126 ///
127 ///   -> Hash semijoin (no condition), extra conditions: (1 = (t.c = t1.col1))
128 ///       -> Table scan on t
129 ///       -> Hash
130 ///           -> Table scan on t1
131 ///
132 /// In this query, the optimizer has set up the condition (1 = (t.c = t1.col1))
133 /// as the semijoin condition. We cannot use this as a join condition, since
134 /// hash join only supports equi-join conditions. However, we cannot attach this
135 /// as a filter after the join, as that would cause wrong results. We attach
136 /// these conditions as "extra" conditions to the hash join iterator, and causes
137 /// these notable behaviors:
138 ///
139 /// a. If we have any extra conditions, we cannot reject duplicate keys in the
140 ///    hash table: the first row matching the join condition could fail the
141 ///    extra condition(s).
142 ///
143 /// b. We can only output rows if all extra conditions pass. If any of the extra
144 ///    conditions fail, we must go to the next matching row in the hash table.
145 ///
146 /// c. In case of on-disk hash join, we must write the probe row to disk _after_
147 ///    we have checked that there are no rows in the hash table that match any
148 ///    of the extra conditions.
149 ///
150 /// If we are able to execute the hash join in memory (classic hash join),
151 /// the output will be sorted the same as the left (probe) input. If we start
152 /// spilling to disk, we lose any reasonable ordering properties.
153 ///
154 /// Note that we still might end up in a case where a single chunk file from
155 /// disk won't fit into memory. This is resolved by reading as much as possible
156 /// into the hash table, and then reading the entire probe chunk file for each
157 /// time the hash table is reloaded. This might happen if we have a very skewed
158 /// data set, for instance.
159 ///
160 /// When we start spilling to disk, we allocate a maximum of "kMaxChunks"
161 /// chunk files on disk for each of the two inputs. The reason for having an
162 /// upper limit is to avoid running out of file descriptors.
163 ///
164 /// There is also a flag we can set to avoid hash join spilling to disk
165 /// regardless of the input size. If the flag is set, the join algorithm works
166 /// like this:
167 ///
168 /// 1) Read as many rows as possible from the build input into an in-memory hash
169 /// table.
170 /// 2) When the hash table is full (we have reached the limit set by the system
171 /// variable join_buffer_size), start reading from the beginning of the probe
172 /// input, probing for matches in the hash table. Output a row for each match
173 /// found.
174 /// 3) When the probe input is empty, see if there are any remaining rows in the
175 /// build input. If so, clear the in-memory hash table and go to step 1,
176 /// continuing from the build input where we stopped the last time. If not, the
177 /// join is done.
178 ///
179 /// Doing everything in memory can be beneficial in a few cases. Currently, it
180 /// is used when we have a LIMIT without sorting or grouping in the query. The
181 /// gain is that we start producing output rows a lot earlier than if we were to
182 /// spill both inputs out to disk. It could also be beneficial if the build
183 /// input _almost_ fits in memory; it would likely be better to read the probe
184 /// input twice instead of writing both inputs out to disk. However, we do not
185 /// currently do any such cost based optimization.
186 ///
187 /// There is a concept called "probe row saving" in the iterator. This is a
188 /// technique that is enabled in two different scenarios: when a hash join build
189 /// chunk does not fit entirely in memory and when hash join is not allowed to
190 /// spill to disk. Common for these two scenarios is that a probe row will be
191 /// read multiple times. For certain join types (semijoin), we must take care so
192 /// that the same probe row is not sent to the client multiple times. Probe row
193 /// saving takes care of this by doing the following:
194 ///
195 /// - If we realize that we are going to read the same probe row multiple times,
196 ///   we enable probe row saving.
197 /// - When a probe row is read, we write the row out to a probe row saving write
198 ///   file, given that it matches certain conditions (for semijoin we only save
199 ///   unmatched probe rows).
200 /// - After the probe input is consumed, we will swap the probe row saving
201 ///   _write_ file and the probe row saving _read_ file, making the write file
202 ///   available for writing again.
203 /// - When we are to read the probe input again, we read the probe rows from the
204 ///   probe row saving read file. This ensures that we i.e. do not output the
205 ///   same probe row twice for semijoin. Note that if the rows we read from the
206 ///   probe row saving read file will be read again (e.g., we have a big hash
207 ///   join build chunk that is many times bigger than the available hash table
208 ///   memory, causing us to process the chunk file in chunks), we will again
209 ///   write the rows to a new probe row saving write file. This reading from the
210 ///   read file and writing to a new write file continues until we know that we
211 ///   are seeing the probe rows for the last time.
212 ///
213 /// We use the same methods as on-disk hash join (HashJoinChunk) for reading and
214 /// writing rows to files. Note that probe row saving is never enabled for inner
215 /// joins, since we do want to output the same probe row multiple times if it
216 /// matches muliple rows from the build input. There are some differences
217 /// regarding when probe row saving is enabled, depending on the hash join type
218 /// (see enum HashJoinType):
219 ///
220 /// - IN_MEMORY: Probe row saving is never activated, since the probe input is
221 ///   read only once.
222 /// - SPILL_TO_DISK: If a build chunk file does not fit in memory (may happen
223 ///   with skewed data set), we will have to read the corresponding probe chunk
224 ///   multiple times. In this case, probe row saving is enabled as soon as we
225 ///   see that the build chunk does not fit in memory, and remains active until
226 ///   the entire build chunk is consumed. After the probe chunk is read once,
227 ///   we swap the probe row saving write file and probe row saving read file so
228 ///   that probe rows will be read from the probe row saving read file. Probe
229 ///   row saving is deactivated once we move to the next pair of chunk files.
230 /// - IN_MEMORY_WITH_HASH_TABLE_REFILL: Probe row saving is activated when we
231 ///   see that the build input is too large to fit in memory. Once the probe
232 ///   iterator has been consumed once, we swap the probe row saving write file
233 ///   and probe row saving read file so that probe rows will be read from the
234 ///   probe row saving read file. As long as the build input is not fully
235 ///   consumed, we write probe rows from the read file out to a new write file,
236 ///   swapping these files for every hash table refill. Probe row saving is
237 ///   never deactivated in this hash join type.
238 ///
239 /// Note that we always write the entire row when writing to probe row saving
240 /// file. It would be possible to only write the match flag, but this is tricky
241 /// as long as we have the hash join type IN_MEMORY_WITH_HASH_TABLE_REFILL. If
242 /// we were to write only match flags in this hash join type, we would have to
243 /// read the probe iterator multiple times. But there is no guarantee that rows
244 /// will come in the same order when reading an iterator multiple times (e.g.
245 /// NDB does not guarantee this), so it would require us to store match flags in
246 /// a lookup structure using a row ID as the key. Due to this, we will
247 /// reconsider this if the hash join type IN_MEMORY_WITH_HASH_TABLE_REFILL goes
248 /// away.
249 class HashJoinIterator final : public RowIterator {
250  public:
251   /// Construct a HashJoinIterator.
252   ///
253   /// @param thd
254   ///   the thread handle
255   /// @param build_input
256   ///   the iterator for the build input
257   /// @param build_input_tables
258   ///   a bitmap of all the tables in the build input. The tables are needed for
259   ///   two things:
260   ///   1) Accessing the columns when creating the join key during creation of
261   ///   the hash table,
262   ///   2) and accessing the column data when creating the row to be stored in
263   ///   the hash table and/or the chunk file on disk.
264   /// @param probe_input
265   ///   the iterator for the probe input
266   /// @param probe_input_tables
267   ///   the probe input tables. Needed for the same reasons as
268   ///   build_input_tables.
269   /// @param max_memory_available
270   ///   the amount of memory available, in bytes, for this hash join iterator.
271   ///   This can be user-controlled by setting the system variable
272   ///   join_buffer_size.
273   /// @param join_conditions
274   ///   a list of all the join conditions between the two inputs
275   /// @param allow_spill_to_disk
276   ///   whether the hash join can spill to disk. This is set to false in some
277   ///   cases where we have a LIMIT in the query
278   /// @param join_type
279   ///   The join type.
280   /// @param join
281   ///   The join we are a part of.
282   /// @param extra_conditions
283   ///   A list of extra conditions that the iterator will evaluate after a
284   ///   lookup in the hash table is done, but before the row is returned. The
285   ///   conditions are AND-ed together into a single Item.
286   HashJoinIterator(THD *thd, unique_ptr_destroy_only<RowIterator> build_input,
287                    qep_tab_map build_input_tables,
288                    unique_ptr_destroy_only<RowIterator> probe_input,
289                    qep_tab_map probe_input_tables, size_t max_memory_available,
290                    const std::vector<HashJoinCondition> &join_conditions,
291                    bool allow_spill_to_disk, JoinType join_type,
292                    const JOIN *join,
293                    const std::vector<Item *> &extra_conditions);
294 
295   bool Init() override;
296 
297   int Read() override;
298 
SetNullRowFlag(bool is_null_row)299   void SetNullRowFlag(bool is_null_row) override {
300     m_build_input->SetNullRowFlag(is_null_row);
301     m_probe_input->SetNullRowFlag(is_null_row);
302   }
303 
EndPSIBatchModeIfStarted()304   void EndPSIBatchModeIfStarted() override {
305     m_build_input->EndPSIBatchModeIfStarted();
306     m_probe_input->EndPSIBatchModeIfStarted();
307   }
308 
UnlockRow()309   void UnlockRow() override {
310     // Since both inputs may have been materialized to disk, we cannot unlock
311     // them.
312   }
313 
314   std::vector<std::string> DebugString() const override;
315 
children()316   std::vector<Child> children() const override {
317     return std::vector<Child>{{m_probe_input.get(), ""},
318                               {m_build_input.get(), "Hash"}};
319   }
320 
321  private:
322   /// Read all rows from the build input and store the rows into the in-memory
323   /// hash table. If the hash table goes full, the rest of the rows are written
324   /// out to chunk files on disk. See the class comment for more details.
325   ///
326   /// @retval true in case of error
327   bool BuildHashTable();
328 
329   /// Read all rows from the next chunk file into the in-memory hash table.
330   /// See the class comment for details.
331   ///
332   /// @retval true in case of error
333   bool ReadNextHashJoinChunk();
334 
335   /// Read a single row from the probe iterator input into the tables' record
336   /// buffers. If we have started spilling to disk, the row is written out to a
337   /// chunk file on disk as well.
338   ///
339   /// The end condition is that either:
340   /// a) a row is ready in the tables' record buffers, and the state will be set
341   ///    to READING_FIRST_ROW_FROM_HASH_TABLE.
342   /// b) There are no more rows to process from the probe input, so the iterator
343   ///    state will be LOADING_NEXT_CHUNK_PAIR.
344   ///
345   /// @retval true in case of error
346   bool ReadRowFromProbeIterator();
347 
348   /// Read a single row from the current probe chunk file into the tables'
349   /// record buffers. The end conditions are the same as for
350   /// ReadRowFromProbeIterator().
351   ///
352   /// @retval true in case of error
353   bool ReadRowFromProbeChunkFile();
354 
355   /// Read a single row from the probe row saving file into the tables' record
356   /// buffers.
357   ///
358   /// @retval true in case of error
359   bool ReadRowFromProbeRowSavingFile();
360 
361   // Do a lookup in the hash table for matching rows from the build input.
362   // The lookup is done by computing the join key from the probe input, and
363   // using that join key for doing a lookup in the hash table. If the join key
364   // contains one or more SQL NULLs, the row cannot match anything and will be
365   // skipped, and the iterator state will be READING_ROW_FROM_PROBE_INPUT. If
366   // not, the iterator state will be READING_FIRST_ROW_FROM_HASH_TABLE.
367   //
368   // After this function is called, ReadJoinedRow() will return false until
369   // there are no more matching rows for the computed join key.
370   void LookupProbeRowInHashTable();
371 
372   /// Take the next matching row from the hash table, and put the row into the
373   /// build tables' record buffers. The function expects that
374   /// LookupProbeRowInHashTable() has been called up-front. The user must
375   /// call ReadJoinedRow() as long as it returns false, as there may be
376   /// multiple matching rows from the hash table. It is up to the caller to set
377   /// a new state in case of EOF.
378   ///
379   /// @retval 0 if a match was found and the row is put in the build tables'
380   ///         record buffers
381   /// @retval -1 if there are no more matching rows in the hash table
382   int ReadJoinedRow();
383 
384   // Have we degraded into on-disk hash join?
on_disk_hash_join()385   bool on_disk_hash_join() const { return !m_chunk_files_on_disk.empty(); }
386 
387   /// Write the last row read from the probe input out to chunk files on disk,
388   /// if applicable.
389   ///
390   /// For inner joins, we must write all probe rows to chunk files, since we
391   /// need to match the row against rows from the build input that are written
392   /// out to chunk files. For semijoin, we can only write probe rows that do not
393   /// match any of the rows in the hash table. Writing a probe row with a
394   /// matching row in the hash table could cause the row to be returned multiple
395   /// times.
396   ///
397   /// @retval true in case of errors.
398   bool WriteProbeRowToDiskIfApplicable();
399 
400   /// @retval true if the last joined row passes all of the extra conditions.
401   bool JoinedRowPassesExtraConditions() const;
402 
403   /// If true, reject duplicate keys in the hash table.
404   ///
405   /// Semijoins/antijoins are only interested in the first matching row from the
406   /// hash table, so we can avoid storing duplicate keys in order to save some
407   /// memory. However, this cannot be applied if we have any "extra" conditions:
408   /// the first matching row in the hash table may fail the extra condition(s).
409   ///
410   /// @retval true if we can reject duplicate keys in the hash table.
RejectDuplicateKeys()411   bool RejectDuplicateKeys() const {
412     return m_extra_condition == nullptr &&
413            (m_join_type == JoinType::SEMI || m_join_type == JoinType::ANTI);
414   }
415 
416   /// Clear the row buffer and reset all iterators pointing to it. This may be
417   /// called multiple times to re-init the row buffer.
418   ///
419   /// @retval true in case of error. my_error has been called
420   bool InitRowBuffer();
421 
422   /// Prepare to read the probe iterator from the beginning, and enable batch
423   /// mode if applicable. The iterator state will remain unchanged.
424   ///
425   /// @retval true in case of error. my_error has been called.
426   bool InitProbeIterator();
427 
428   /// Mark that probe row saving is enabled, and prepare the probe row saving
429   /// file for writing.
430   /// @see m_write_to_probe_row_saving
431   ///
432   /// @retval true in case of error. my_error has been called.
433   bool InitWritingToProbeRowSavingFile();
434 
435   /// Mark that we should read from the probe row saving file. The probe row
436   /// saving file is rewinded to the beginning.
437   /// @see m_read_from_probe_row_saving
438   ///
439   /// @retval true in case of error. my_error has been called.
440   bool InitReadingFromProbeRowSavingFile();
441 
442   /// Set the iterator state to the correct READING_ROW_FROM_PROBE_*-state.
443   /// Which state we end up in depends on which hash join type we are executing
444   /// (in-memory, on-disk or in-memory with hash table refill).
445   void SetReadingProbeRowState();
446 
447   /// Read a joined row from the hash table, and see if it passes any extra
448   /// conditions. The last probe row read will also be written do disk if needed
449   /// (see WriteProbeRowToDiskIfApplicable).
450   ///
451   /// @retval -1 There are no more matching rows in the hash table.
452   /// @retval 0 A joined row is ready.
453   /// @retval 1 An error occured.
454   int ReadNextJoinedRowFromHashTable();
455 
456   enum class State {
457     // We are reading a row from the probe input, where the row comes from
458     // the iterator.
459     READING_ROW_FROM_PROBE_ITERATOR,
460     // We are reading a row from the probe input, where the row comes from a
461     // chunk file.
462     READING_ROW_FROM_PROBE_CHUNK_FILE,
463     // We are reading a row from the probe input, where the row comes from a
464     // probe row saving file.
465     READING_ROW_FROM_PROBE_ROW_SAVING_FILE,
466     // The iterator is moving to the next pair of chunk files, where the chunk
467     // file from the build input will be loaded into the hash table.
468     LOADING_NEXT_CHUNK_PAIR,
469     // We are reading the first row returned from the hash table lookup that
470     // also passes extra conditions.
471     READING_FIRST_ROW_FROM_HASH_TABLE,
472     // We are reading the remaining rows returned from the hash table lookup.
473     READING_FROM_HASH_TABLE,
474     // No more rows, both inputs are empty.
475     END_OF_ROWS
476   };
477 
478   State m_state;
479 
480   const unique_ptr_destroy_only<RowIterator> m_build_input;
481   const unique_ptr_destroy_only<RowIterator> m_probe_input;
482 
483   // An iterator for reading rows from the hash table.
484   // hash_join_buffer::HashJoinRowBuffer::Iterator m_hash_map_iterator;
485   hash_join_buffer::HashJoinRowBuffer::hash_map_iterator m_hash_map_iterator;
486   hash_join_buffer::HashJoinRowBuffer::hash_map_iterator m_hash_map_end;
487 
488   // These structures holds the tables and columns that are needed for the hash
489   // join. Rows/columns that are not needed are filtered out in the constructor.
490   // We need to know which tables that belong to each iterator, so that we can
491   // compute the join key when needed.
492   hash_join_buffer::TableCollection m_probe_input_tables;
493   hash_join_buffer::TableCollection m_build_input_tables;
494 
495   // An in-memory hash table that holds rows from the build input (directly from
496   // the build input iterator, or from a chunk file). See the class comment for
497   // details on how and when this is used.
498   hash_join_buffer::HashJoinRowBuffer m_row_buffer;
499 
500   // A list of the join conditions (all of them are equi-join conditions).
501   Prealloced_array<HashJoinCondition, 4> m_join_conditions;
502 
503   // Array to hold the list of chunk files on disk in case we degrade into
504   // on-disk hash join.
505   Mem_root_array<ChunkPair> m_chunk_files_on_disk;
506 
507   // Which HashJoinChunk, if any, we are currently reading from, in both
508   // LOADING_NEXT_CHUNK_PAIR and READING_ROW_FROM_PROBE_CHUNK_FILE.
509   // It is incremented during the state LOADING_NEXT_CHUNK_PAIR.
510   int m_current_chunk{-1};
511 
512   // The seeds that are used by xxHash64 when calculating the hash from a join
513   // key. We need one seed for the hashing done in the in-memory hash table,
514   // and one seed when calculating the hash that is used for determining which
515   // chunk file a row should be placed in (in case of on-disk hash join). If we
516   // were to use the same seed for both operations, we would get a really bad
517   // hash table when loading a chunk file to the hash table. The numbers are
518   // chosen randomly and have no special meaning.
519   static constexpr uint32_t kHashTableSeed{156211};
520   static constexpr uint32_t kChunkPartitioningHashSeed{899339};
521 
522   // Which row we currently are reading from each of the hash join chunk file.
523   ha_rows m_build_chunk_current_row = 0;
524   ha_rows m_probe_chunk_current_row = 0;
525 
526   // The maximum number of HashJoinChunks that is allocated for each of the
527   // inputs in case we spill to disk. We might very well end up with an amount
528   // less than this number, but we keep an upper limit so we don't risk running
529   // out of file descriptors. We always use a power of two number of files,
530   // which allows us to do some optimizations when calculating which chunk a row
531   // should be placed in.
532   static constexpr size_t kMaxChunks = 128;
533 
534   // A buffer that is used during two phases:
535   // 1) when constructing a join key from join conditions.
536   // 2) when moving a row between tables' record buffers and the hash table.
537   //
538   // There are two functions that needs this buffer; ConstructJoinKey() and
539   // StoreFromTableBuffers(). After calling one of these functions, the user
540   // must take responsiblity of the data if it is needed for a longer lifetime.
541   //
542   // If there are no BLOB/TEXT column in the join, we calculate an upper bound
543   // of the row size that is used to preallocate this buffer. In the case of
544   // BLOB/TEXT columns, we cannot calculate a reasonable upper bound, and the
545   // row size is calculated per row. The allocated memory is kept for the
546   // duration of the iterator, so that we (most likely) avoid reallocations.
547   String m_temporary_row_and_join_key_buffer;
548 
549   // Whether we should turn on batch mode for the probe input. Batch mode is
550   // enabled if the probe input consists of exactly one table, and
551   // QEP_TAB::pfs_batch_update() returns true for this table.
552   bool m_probe_input_batch_mode{false};
553 
554   // Whether we are allowed to spill to disk.
555   bool m_allow_spill_to_disk{true};
556 
557   // Whether the build iterator has more rows. This is used to stop the hash
558   // join iterator asking for more rows when we know for sure that the entire
559   // build input is consumed. The variable is only used if m_allow_spill_to_disk
560   // is false, as we have to see if there are more rows in the build input after
561   // the probe input is consumed.
562   bool m_build_iterator_has_more_rows{true};
563 
564   // What kind of join the iterator should execute.
565   const JoinType m_join_type;
566 
567   // If not nullptr, an extra condition that the iterator will evaluate after a
568   // lookup in the hash table is done, but before the row is returned. This is
569   // needed in case we have a semijoin condition that is not an equi-join
570   // condition (i.e. 't1.col1 < t2.col1').
571   Item *m_extra_condition{nullptr};
572 
573   // Whether we should write rows from the probe input to the probe row saving
574   // write file. See the class comment on HashJoinIterator for details around
575   // probe row saving.
576   bool m_write_to_probe_row_saving{false};
577 
578   // Whether we should read rows from the probe row saving read file. See the
579   // class comment on HashJoinIterator for details around probe row saving.
580   bool m_read_from_probe_row_saving{false};
581 
582   // The probe row saving files where unmatched probe rows are written to and
583   // read from.
584   HashJoinChunk m_probe_row_saving_write_file;
585   HashJoinChunk m_probe_row_saving_read_file;
586 
587   // Which row we currently are reading from in the probe row saving read file.
588   // Used to know whether we have reached the end of the file. How many files
589   // the probe row saving read file contains is contained in the HashJoinChunk
590   // (see m_probe_row_saving_read_file).
591   ha_rows m_probe_row_saving_read_file_current_row{0};
592 
593   // The "type" of hash join we are executing. We currently have three different
594   // types of hash join:
595   // - In memory: We do everything in memory without any refills of the hash
596   //   table. Each input is read only once, and nothing is written to disk.
597   // - Spill to disk: If the build input does not fit in memory, we write both
598   //   inputs out to a set of chunk files. Both inputs are partitioned using a
599   //   hash function over the join attribute, ensuring that matching rows can be
600   //   found in the same set of chunk files. Each pair of chunk file is then
601   //   processed as an in-memory hash join.
602   // - In memory with hash table refill: This is enabled if we are not allowed
603   //   to spill to disk, and the build input does not fit in memory. We read as
604   //   much as possible from the build input into the hash table. We then read
605   //   the entire probe input, probing for matching rows in the hash table.
606   //   When the probe input returns EOF, the hash table is refilled with the
607   //   rows that did not fit the first time. The entire probe input is read
608   //   again, and this is repeated until the entire build input is consumed.
609   enum class HashJoinType {
610     IN_MEMORY,
611     SPILL_TO_DISK,
612     IN_MEMORY_WITH_HASH_TABLE_REFILL
613   };
614   HashJoinType m_hash_join_type{HashJoinType::IN_MEMORY};
615 
616   // The match flag for the last probe row read from chunk file.
617   //
618   // This is needed if a outer join spills to disk; a probe row can match a row
619   // from the build input we haven't seen yet (it's been written out to disk
620   // because the hash table was full). So when reading a probe row from a chunk
621   // file, this variable holds the match flag. This flag must be a class member,
622   // since one probe row may match multiple rows from the hash table; the
623   // execution will go out of HashJoinIterator::Read() between each matching
624   // row, causing any local match flag to lose the match flag info from the last
625   // probe row read.
626   bool m_probe_row_match_flag{false};
627 };
628 
629 /// For each of the given tables, request that the row ID is filled in
630 /// (the equivalent of calling file->position()) if needed.
631 ///
632 /// @param tables The tables to request row IDs for.
633 void RequestRowId(const Prealloced_array<hash_join_buffer::Table, 4> &tables);
634 
635 #endif  // SQL_HASH_JOIN_ITERATOR_H_
636