#ifndef SQL_BKA_ITERATOR_H_
#define SQL_BKA_ITERATOR_H_

/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
   as published by the Free Software Foundation.

   This program is also distributed with certain software (including
   but not limited to OpenSSL) that is licensed under separate terms,
   as designated in a particular file or component or in included license
   documentation. The authors of MySQL hereby grant you an additional
   permission to link the program and your derivative works with the
   separately licensed software that they have included with MySQL.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License, version 2.0, for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */

/**
  Batch key access (BKA) is a join strategy that uses multi-range read (MRR)
  to get better read ordering on the table on the inner side. It reads
  a block of rows from the outer side, picks up the join keys (refs) from
  each row, and sends them all off in one big read request. The handler can
  then order these and read them in whatever order it would prefer. This is
  especially attractive when used with rotating media; the reads can then be
  ordered such that they do not require constant seeking (disk-sweep MRR,
  or DS-MRR).

  BKA is implemented with two iterators working in concert. The BKAIterator
  reads rows from the outer side into a buffer. When the buffer is full or we
  are out of rows, it sets up the key ranges and hands them over to the
  MultiRangeRowIterator, which does the actual request, and reads rows from it
  until there are none left. For each inner row returned, MultiRangeRowIterator
  loads the appropriate outer row(s) from the buffer, doing the actual join.

  The reason for this split is twofold. First, it allows us to accurately time
  (for EXPLAIN ANALYZE) the actual table read. Second, and more importantly,
  we can have other iterators between the BKAIterator and MultiRangeRowIterator,
  in particular FilterIterator.
*/
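
/*
  Illustrative only, not part of this header's API: the iterator tree for a
  BKA join typically looks something like this, with an optional filtering
  iterator between the two classes declared below:

    BKAIterator
      -> outer: arbitrary iterator tree (e.g., a table scan)
      -> inner: [FilterIterator or similar, optional]
                  -> MultiRangeRowIterator

  A rough construction sketch, assuming the NewIterator helper from
  sql/records.h and a caller that already has a THD, a JOIN and the two
  subtrees; all sizing numbers are invented for illustration:

    MultiRangeRowIterator *mrr = ...;  // bottom of the inner subtree
    auto bka = NewIterator<BKAIterator>(
        thd, join, std::move(outer_subtree), outer_tables,
        std::move(inner_subtree),
        256 * 1024,  // max_memory_available
        64,          // mrr_bytes_needed_for_single_inner_row
        1.5F,        // expected_inner_rows_per_outer_row
        mrr, JoinType::INNER);
*/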

#include <stddef.h>
#include <sys/types.h>
#include <memory>
#include <string>
#include <vector>

#include "my_alloc.h"
#include "my_inttypes.h"
#include "sql/handler.h"
#include "sql/hash_join_buffer.h"
#include "sql/mem_root_array.h"
#include "sql/row_iterator.h"
#include "sql_string.h"
#include "template_utils.h"

class Item;
class MultiRangeRowIterator;
class QEP_TAB;
class THD;
struct KEY_MULTI_RANGE;
struct TABLE;
struct TABLE_REF;

/**
  The BKA join iterator, with an arbitrary iterator tree on the outer side
  and a MultiRangeRowIterator on the inner side (possibly with a filter or
  similar in-between). See the file comment for more details.
*/
class BKAIterator final : public RowIterator {
 public:
  /**
    @param thd Thread handle.
    @param join The JOIN we are part of.
    @param outer_input The iterator to read the outer rows from.
    @param outer_input_tables QEP_TAB for each outer table involved.
      Used to know which fields we are to read into our buffer.
    @param inner_input The iterator to read the inner rows from.
      Must end up in a MultiRangeRowIterator.
    @param max_memory_available Number of bytes available for row buffers,
      both outer rows and MRR buffers. Note that allocation is incremental,
      so we can allocate less than this.
    @param mrr_bytes_needed_for_single_inner_row Number of bytes MRR needs
      space for in its buffer for holding a single row from the inner table.
    @param expected_inner_rows_per_outer_row Number of inner rows we
      statistically expect for each outer row. Used for dividing the buffer
      space between inner rows and MRR row buffer (if we expect many inner
      rows, we can't load as many outer rows).
    @param mrr_iterator Pointer to the MRR iterator at the bottom of
      inner_input. Used to send row ranges and buffers.
    @param join_type What kind of join we are executing.
  */
  BKAIterator(THD *thd, JOIN *join,
              unique_ptr_destroy_only<RowIterator> outer_input,
              qep_tab_map outer_input_tables,
              unique_ptr_destroy_only<RowIterator> inner_input,
              size_t max_memory_available,
              size_t mrr_bytes_needed_for_single_inner_row,
              float expected_inner_rows_per_outer_row,
              MultiRangeRowIterator *mrr_iterator, JoinType join_type);

  bool Init() override;

  int Read() override;

  void SetNullRowFlag(bool is_null_row) override {
    m_outer_input->SetNullRowFlag(is_null_row);
    m_inner_input->SetNullRowFlag(is_null_row);
  }

  void UnlockRow() override {
    // Since we don't know which condition caused the row to be rejected,
    // we can't know whether we could also unlock the outer row
    // (it may still be used as part of other joined rows).
    if (m_state == State::RETURNING_JOINED_ROWS) {
      m_inner_input->UnlockRow();
    }
  }

  std::vector<std::string> DebugString() const override;

  std::vector<Child> children() const override {
    return std::vector<Child>{
        {m_outer_input.get(), "Batch input rows"},
        {m_inner_input.get(), ""},
    };
  }

  void EndPSIBatchModeIfStarted() override {
    m_outer_input->EndPSIBatchModeIfStarted();
    m_inner_input->EndPSIBatchModeIfStarted();
  }

 private:
  /// Clear out the MEM_ROOT and prepare for reading rows anew.
  void BeginNewBatch();

  /// If there are more outer rows, begin the next batch. If not,
  /// move to the EOF state.
  void BatchFinished();

  /// Find the next unmatched row, and load it for output as a
  /// NULL-complemented row. (Assumes the NULL row flag has already been set
  /// on the inner table iterator.) Returns 0 if a row was found, -1 if no
  /// row was found. (Errors cannot happen.)
  int MakeNullComplementedRow();

  /// Read a batch of outer rows (BeginNewBatch() must have been called
  /// earlier). Returns -1 for no outer rows found (sets state to
  /// END_OF_ROWS), 0 for OK (sets state to RETURNING_JOINED_ROWS) or 1 for
  /// error.
  int ReadOuterRows();

  enum class State {
    /**
      We are about to start reading outer rows into our buffer.
      A single Read() call will fill it up, so there is no
      in-between “currently reading” state.
    */
    NEED_OUTER_ROWS,

    /**
      We are returning rows from the MultiRangeRowIterator.
      (For antijoins, we are looking up the rows, but don't actually
      return them.)
    */
    RETURNING_JOINED_ROWS,

    /**
      We are an outer join or antijoin, and we are returning NULL-complemented
      rows for those outer rows that never had a matching inner row. Note that
      this is done in the BKAIterator and not the MRR iterator, for two
      reasons: First, it gives more sensible EXPLAIN ANALYZE numbers. Second,
      the NULL-complemented rows could be inadvertently filtered by a
      FilterIterator before they reach the BKAIterator.
    */
    RETURNING_NULL_COMPLEMENTED_ROWS,

    /**
      Both the outer and inner side are out of rows.
    */
    END_OF_ROWS
  };
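
  /*
    A simplified sketch of how Read() could drive the state machine above.
    This is illustrative only (the real implementation lives in
    bka_iterator.cc) and elides error handling and the details of reading
    joined rows; it relies only on the members and return conventions
    documented in this header:

      int BKAIterator::Read() {
        for (;;) {
          switch (m_state) {
            case State::END_OF_ROWS:
              return -1;  // EOF
            case State::NEED_OUTER_ROWS: {
              BeginNewBatch();
              int err = ReadOuterRows();  // 0 -> RETURNING_JOINED_ROWS
              if (err != 0) return err;   // -1 is EOF, 1 is error
              break;
            }
            case State::RETURNING_NULL_COMPLEMENTED_ROWS:
              if (MakeNullComplementedRow() == 0) return 0;
              BatchFinished();  // next batch, or END_OF_ROWS
              break;
            case State::RETURNING_JOINED_ROWS:
              // Read from m_inner_input; at EOF, either synthesize
              // NULL-complemented rows (outer join/antijoin) or finish
              // the batch.
              ...
          }
        }
      }
  */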

  State m_state;

  const unique_ptr_destroy_only<RowIterator> m_outer_input;
  const unique_ptr_destroy_only<RowIterator> m_inner_input;

  /// The MEM_ROOT we are storing the outer rows on, and also allocating the
  /// MRR buffer from. In total, this should not go significantly over
  /// m_max_memory_available bytes.
  MEM_ROOT m_mem_root;

  /// Buffered outer rows.
  Mem_root_array<hash_join_buffer::BufferRow> m_rows;

  /// Tables and columns needed for each outer row. Rows/columns that are not
  /// needed are filtered out in the constructor; the rest are read and stored
  /// in m_rows.
  hash_join_buffer::TableCollection m_outer_input_tables;

  /// Used for serializing the row we read from the outer table(s), before it
  /// is stored into the MEM_ROOT and put into m_rows. Should there not be
  /// room in m_rows for the row, it will stay in this variable until we start
  /// reading the next batch of outer rows.
  ///
  /// If there are no BLOB/TEXT columns in the join, we calculate an upper
  /// bound of the row size that is used to preallocate this buffer. In the
  /// case of BLOB/TEXT columns, we cannot calculate a reasonable upper bound,
  /// and the row size is calculated per row. The allocated memory is kept for
  /// the duration of the iterator, so that we (most likely) avoid
  /// reallocations.
  String m_outer_row_buffer;

  /// Whether we have a row in m_outer_row_buffer from the previous batch of
  /// rows that we haven't stored in m_rows yet.
  bool m_has_row_from_previous_batch = false;

  /// For each outer row, how many bytes we need in the MRR buffer (i.e., the
  /// number of bytes we expect to use on rows from the inner table).
  /// This is the expected number of inner rows per key, multiplied by the
  /// (fixed) size of each inner row. We use this information to stop scanning
  /// before we've used up the entire RAM allowance on outer rows, so that
  /// we have space remaining for the inner rows (in the MRR buffer), too.
  size_t m_mrr_bytes_needed_per_row;

  /// Estimated number of bytes used on m_mem_root so far.
  size_t m_bytes_used = 0;

  /// Whether we've seen EOF from the outer iterator.
  bool m_end_of_outer_rows = false;

  /// See max_memory_available in the constructor.
  const size_t m_max_memory_available;

  /// See mrr_bytes_needed_for_single_inner_row in the constructor.
  const size_t m_mrr_bytes_needed_for_single_inner_row;

  /// See mrr_iterator in the constructor.
  MultiRangeRowIterator *const m_mrr_iterator;

  /// The join type of the BKA join.
  JoinType m_join_type;

  /// If we are synthesizing NULL-complemented rows (for an outer join or
  /// antijoin), points to the next row within "m_rows" that we haven't
  /// considered yet.
  hash_join_buffer::BufferRow *m_current_pos;
};
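
/*
  A worked example of the memory split documented above, with all numbers
  invented for illustration: if expected_inner_rows_per_outer_row = 1.5 and
  mrr_bytes_needed_for_single_inner_row = 64, each buffered outer row
  reserves m_mrr_bytes_needed_per_row = 1.5 * 64 = 96 bytes of the eventual
  MRR buffer. If the outer rows themselves average 160 bytes in m_mem_root,
  a max_memory_available of 1 MB allows roughly
  1048576 / (160 + 96) = 4096 outer rows per batch before we stop scanning
  and hand the batch over to the MultiRangeRowIterator.
*/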

/**
  The iterator actually doing the reads from the inner table during BKA.
  See the file comment.
*/
class MultiRangeRowIterator final : public TableRowIterator {
 public:
  /**
    @param thd Thread handle.
    @param cache_idx_cond See m_cache_idx_cond.
    @param table The inner table to scan.
    @param keep_current_rowid If true, get the row ID on the inner table
      for each row that we return. (Row IDs for outer tables will be
      controlled by outer_input_tables.)
    @param ref The index condition we are looking up on.
    @param mrr_flags Flags passed on to MRR.
  */
  MultiRangeRowIterator(THD *thd, Item *cache_idx_cond, TABLE *table,
                        bool keep_current_rowid, TABLE_REF *ref,
                        int mrr_flags);

  /**
    Tell the MRR iterator which tables are on the left side of the BKA join
    (the MRR iterator is always alone on the right side). This is needed so
    that it can unpack the rows into the right tables, with the right format.

    Must be called exactly once, before the first Init(). Set by
    BKAIterator's constructor; it's not easily available at the point where
    we construct MultiRangeRowIterator.
  */
  void set_outer_input_tables(JOIN *join, qep_tab_map outer_input_tables);

  /**
    Tell the MRR iterator what kind of BKA join it is part of.

    Must be called exactly once, before the first Init(). Set by
    BKAIterator's constructor; it's not easily available at the point where
    we construct MultiRangeRowIterator.
  */
  void set_join_type(JoinType join_type) { m_join_type = join_type; }

  /**
    Specify which outer rows to read inner rows for.
    Must be called before Init(), and be valid until the last Read().
  */
  void set_rows(const hash_join_buffer::BufferRow *begin,
                const hash_join_buffer::BufferRow *end) {
    m_begin = begin;
    m_end = end;
  }

  /**
    Specify an unused chunk of memory MRR can use for the returned inner rows.
    Must be called before Init(), and must be at least big enough to hold
    one inner row.
  */
  void set_mrr_buffer(uchar *ptr, size_t size) {
    m_mrr_buffer.buffer = ptr;
    m_mrr_buffer.buffer_end = ptr + size;
  }

  /**
    Specify an unused chunk of memory that we can use to mark which inner rows
    have been read (by the parent BKA iterator) or not. This is used for outer
    joins to know which rows need NULL-complemented versions, and for
    semijoins and antijoins to avoid matching the same inner row more than
    once.

    Must be called before Init() for semijoins, outer joins and antijoins, and
    never called otherwise. There must be room for at least one bit per row
    given in set_rows().
  */
  void set_match_flag_buffer(uchar *ptr) { m_match_flag_buffer = ptr; }
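
  /*
    Illustrative sizing of the match-flag buffer: the actual allocation is
    done by the owning BKAIterator, and MEM_ROOT::ArrayAlloc is assumed here
    purely for the sketch. One bit per outer row in [begin, end), rounded up
    to whole bytes:

      size_t num_rows = std::distance(begin, end);
      size_t num_bytes = (num_rows + 7) / 8;         // one bit per row
      uchar *buf = mem_root.ArrayAlloc<uchar>(num_bytes);
      memset(buf, 0, num_bytes);                     // nothing matched yet
      mrr_iterator->set_match_flag_buffer(buf);
      mrr_iterator->set_rows(begin, end);
  */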

  /**
    Mark that the BKA iterator has seen the last row we returned from Read().
    (It could have been discarded by a FilterIterator before it reached the
    BKA iterator.) Will be a no-op for inner joins; see
    set_match_flag_buffer().
  */
  void MarkLastRowAsRead() {
    if (m_match_flag_buffer != nullptr) {
      size_t row_number = std::distance(m_begin, m_last_row_returned);
      m_match_flag_buffer[row_number / 8] |= 1 << (row_number % 8);
    }
  }

  /**
    Check whether the given row has been marked as read
    (using MarkLastRowAsRead()) or not. Used internally when doing semijoins,
    and also by the BKAIterator when synthesizing NULL-complemented rows for
    outer joins or antijoins.
  */
  bool RowHasBeenRead(const hash_join_buffer::BufferRow *row) const {
    DBUG_ASSERT(m_match_flag_buffer != nullptr);
    size_t row_number = std::distance(m_begin, row);
    return m_match_flag_buffer[row_number / 8] & (1 << (row_number % 8));
  }

  /**
    Do the actual multi-range read with the rows given by set_rows() and using
    the temporary buffer given in set_mrr_buffer().
  */
  bool Init() override;

  /**
    Read another inner row (if any) and load the appropriate outer row(s)
    into the associated table buffers.
  */
  int Read() override;

  std::vector<std::string> DebugString() const override;

 private:
  // Thunks from function pointers to the actual callbacks.
  static range_seq_t MrrInitCallbackThunk(void *init_params, uint n_ranges,
                                          uint flags) {
    return (pointer_cast<MultiRangeRowIterator *>(init_params))
        ->MrrInitCallback(n_ranges, flags);
  }
  static uint MrrNextCallbackThunk(void *init_params, KEY_MULTI_RANGE *range) {
    return (pointer_cast<MultiRangeRowIterator *>(init_params))
        ->MrrNextCallback(range);
  }
  static bool MrrSkipIndexTupleCallbackThunk(range_seq_t seq,
                                             char *range_info) {
    return (reinterpret_cast<MultiRangeRowIterator *>(seq))
        ->MrrSkipIndexTuple(range_info);
  }
  static bool MrrSkipRecordCallbackThunk(range_seq_t seq, char *range_info,
                                         uchar *) {
    return (reinterpret_cast<MultiRangeRowIterator *>(seq))
        ->MrrSkipRecord(range_info);
  }

  // Callbacks we get from the handler during the actual read.
  range_seq_t MrrInitCallback(uint n_ranges, uint flags);
  uint MrrNextCallback(KEY_MULTI_RANGE *range);
  bool MrrSkipIndexTuple(char *range_info);
  bool MrrSkipRecord(char *range_info);
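
  /*
    A sketch of how the thunks above are typically wired into the handler's
    MRR interface. The real hookup happens in Init() in bka_iterator.cc; the
    RANGE_SEQ_IF member names and the multi_range_read_init() call below are
    assumed from sql/handler.h, and num_ranges is a stand-in for the range
    count derived from [m_begin, m_end):

      RANGE_SEQ_IF seq_funcs;
      seq_funcs.init = MrrInitCallbackThunk;
      seq_funcs.next = MrrNextCallbackThunk;
      seq_funcs.skip_record = MrrSkipRecordCallbackThunk;
      seq_funcs.skip_index_tuple = MrrSkipIndexTupleCallbackThunk;

      // “this” doubles as both init_params and the range_seq_t handle,
      // which is why the thunks can cast it back to the iterator.
      uint num_ranges = static_cast<uint>(m_end - m_begin);
      m_file->multi_range_read_init(&seq_funcs, this, num_ranges,
                                    m_mrr_flags, &m_mrr_buffer);
  */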

  /**
    There are certain conditions that would normally be pushed down to
    indexes, but that depend on the values of outer tables in the BKA join
    (i.e., they are join conditions), which are not set when we actually read
    the inner row.[1] Thus, we cannot push them all the way down to the
    handler; however, MRR gives us a similar mechanism that we can use.
    Specifically, if we set “skip_index_tuple” to a function pointer, we will
    be called back for each row, and can load the outer table row(s) we need
    to evaluate the condition. This allows us to reject the rows based on the
    index entry alone, without loading the row itself.

    It is unclear how much benefit this gives us over simply not pushing
    these conditions at all. The case of a join condition that is satisfiable
    using the index tuple but not simply pushable down into the ref is rare;
    it has to either be on a keypart we couldn't use (e.g., an index on A,B,C
    where we join on A and C but not B -- A then becomes part of our ref, but
    C needs to be an index condition) or a condition that needs to be
    rechecked, which happens only when mixing PAD SPACE / NO PAD in a join
    (e.g. looking up in a CHAR column, but wanting the comparison as NO PAD).
    Especially the latter case would seem unlikely to filter away a
    significant number of rows.

    [1] In the DebugString output, we call such conditions “dependent
    index conditions”, since they depend on values from other tables,
    analogous to dependent subqueries. Internally, they are called
    cache_idx_cond, presumably because BKA originated in join buffering,
    also known as join cache.
  */
  Item *const m_cache_idx_cond;

  /// See the constructor.
  const bool m_keep_current_rowid;

  /// The table we are reading from.
  TABLE *const m_table;

  /// Handler for the table we are reading from.
  handler *const m_file;

  /// The index condition.
  TABLE_REF *const m_ref;

  /// Flags passed on to MRR.
  const int m_mrr_flags;

  /// Current outer rows to read inner rows for. Set by set_rows().
  const hash_join_buffer::BufferRow *m_begin;
  const hash_join_buffer::BufferRow *m_end;

  /// Which row we are at in the [m_begin, m_end) range.
  /// Used during the MRR callbacks.
  const hash_join_buffer::BufferRow *m_current_pos;

  /// Which row we last returned from Read() (used for MarkLastRowAsRead()).
  const hash_join_buffer::BufferRow *m_last_row_returned;

  /// Temporary space for storing inner rows, used by MRR.
  /// Set by set_mrr_buffer().
  HANDLER_BUFFER m_mrr_buffer;

  /// See set_match_flag_buffer().
  uchar *m_match_flag_buffer = nullptr;

  /// Tables and columns needed for each outer row. Same as
  /// m_outer_input_tables in the corresponding BKAIterator.
  hash_join_buffer::TableCollection m_outer_input_tables;

  /// The join type of the BKA join we are part of. Same as m_join_type in the
  /// corresponding BKAIterator.
  JoinType m_join_type = JoinType::INNER;
};

#endif  // SQL_BKA_ITERATOR_H_