1 /**
2  * @file   reader.h
3  *
4  * @section LICENSE
5  *
6  * The MIT License
7  *
8  * @copyright Copyright (c) 2017-2021 TileDB, Inc.
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  *
28  * @section DESCRIPTION
29  *
30  * This file defines class Reader.
31  */
32 
33 #ifndef TILEDB_READER_H
34 #define TILEDB_READER_H
35 
36 #include <atomic>
37 
38 #include "tiledb/common/logger_public.h"
39 #include "tiledb/common/status.h"
40 #include "tiledb/sm/array_schema/dimension.h"
41 #include "tiledb/sm/misc/types.h"
42 #include "tiledb/sm/query/iquery_strategy.h"
43 #include "tiledb/sm/query/query_buffer.h"
44 #include "tiledb/sm/query/query_condition.h"
45 #include "tiledb/sm/query/reader_base.h"
46 #include "tiledb/sm/query/result_cell_slab.h"
47 #include "tiledb/sm/query/result_coords.h"
48 #include "tiledb/sm/subarray/subarray_partitioner.h"
49 
50 using namespace tiledb::common;
51 
52 namespace tiledb {
53 namespace sm {
54 
55 class Array;
56 class StorageManager;
57 class Tile;
58 
59 /** Processes read queries. */
60 class Reader : public ReaderBase, public IQueryStrategy {
61  public:
62   /* ********************************* */
63   /*     CONSTRUCTORS & DESTRUCTORS    */
64   /* ********************************* */
65 
66   /** Constructor. */
67   Reader(
68       stats::Stats* stats,
69       tdb_shared_ptr<Logger> logger,
70       StorageManager* storage_manager,
71       Array* array,
72       Config& config,
73       std::unordered_map<std::string, QueryBuffer>& buffers,
74       Subarray& subarray,
75       Layout layout,
76       QueryCondition& condition);
77 
78   /** Destructor. */
79   ~Reader() = default;
80 
81   DISABLE_COPY_AND_COPY_ASSIGN(Reader);
82   DISABLE_MOVE_AND_MOVE_ASSIGN(Reader);
83 
84   /* ********************************* */
85   /*                 API               */
86   /* ********************************* */
87 
88   /** Finalizes the reader. */
89   Status finalize();
90 
91   /**
92    * Returns `true` if the query was incomplete, i.e., if all subarray
93    * partitions in the read state have not been processed or there
94    * was some buffer overflow.
95    */
96   bool incomplete() const;
97 
98   /** Initializes the reader. */
99   Status init();
100 
101   /** Initialize the memory budget variables. */
102   Status initialize_memory_budget();
103 
104   /** Returns the current read state. */
105   const ReadState* read_state() const;
106 
107   /** Returns the current read state. */
108   ReadState* read_state();
109 
110   /** Performs a read query using its set members. */
111   Status dowork();
112 
113   /** Resets the reader object. */
114   void reset();
115 
116   /**
117    * Computes the result cell slabs for the input subarray, given the
118    * input result coordinates (retrieved from the sparse fragments).
119    * The function also computes and stores the results space tiles
120    * in `result_space_tiles`. This needs to be preserved throughout
121    * the cell copying operations, since this structure stores all
122    * the relevant result tiles for the dense fragments.
123    *
124    * @tparam T The domain datatype.
125    * @param subarray The input subarray.
126    * @param result_space_tiles The result space tiles computed by the
127    *     function, which store the result tiles from the dense fragments.
128    * @param result_coords The result coordinates produced by the sparse
129    *     fragments.
130    * @param result_tiles This will store pointers to the result tiles
131    *     of both dense and sparse fragments.
132    * @param result_cell_slabs The returned result cell slabs.
133    * @return Status
134    */
135   template <class T>
136   Status compute_result_cell_slabs(
137       const Subarray& subarray,
138       std::map<const T*, ResultSpaceTile<T>>* result_space_tiles,
139       std::vector<ResultCoords>* result_coords,
140       std::vector<ResultTile*>* result_tiles,
141       std::vector<ResultCellSlab>* result_cell_slabs) const;
142 
143   /**
144    * Computes the result cell slabs for the input subarray, given the
145    * input result coordinates (retrieved from the sparse fragments).
146    * The function also computes and stores the results space tiles
147    * in `result_space_tiles`. This needs to be preserved throughout
148    * the cell copying operations, since this structure stores all
149    * the relevant result tiles for the dense fragments. Applicable
150    * only to row-/col-major subarray layouts.
151    *
152    * @tparam T The domain datatype.
153    * @param subarray The input subarray.
154    * @param result_coords The result coordinates produced by the sparse
155    *     fragments.
156    * @param result_coords_pos The position in `result_coords` to be
157    *     passed to the result cell slab iterator in the function.
158    *     This practically keeps track of the sparse coordinate results
159    *     already processed in successive calls of this function.
160    *     The function updates this value with the current position
161    *     returned by the iterator at the end of its process.
162    * @param result_space_tiles The result space tiles computed by the
163    *     function, which store the result tiles from the dense fragments.
164    * @param result_tiles This will store pointers to the result tiles
165    *     of both dense and sparse fragments.
166    * @param frag_tile_set Stores the unique pairs (frag_idx, tile_idx)
167    *     for all result tiles.
168    * @param result_cell_slabs The returned result cell slabs.
169    * @return Status
170    */
171   template <class T>
172   Status compute_result_cell_slabs_row_col(
173       const Subarray& subarray,
174       std::map<const T*, ResultSpaceTile<T>>* result_space_tiles,
175       std::vector<ResultCoords>* result_coords,
176       uint64_t* result_coords_pos,
177       std::vector<ResultTile*>* result_tiles,
178       std::set<std::pair<unsigned, uint64_t>>* frag_tile_set,
179       std::vector<ResultCellSlab>* result_cell_slabs) const;
180 
181   /**
182    * Computes the result cell slabs for the input subarray, given the
183    * input result coordinates (retrieved from the sparse fragments).
184    * The function also computes and stores the results space tiles
185    * in `result_space_tiles`. This needs to be preserved throughout
186    * the cell copying operations, since this structure stores all
187    * the relevant result tiles for the dense fragments. Applicable
188    * only to global order subarray layouts.
189    *
190    * @tparam T The domain datatype.
191    * @param subarray The input subarray.
192    * @param result_coords The result coordinates produced by the sparse
193    *     fragments.
194    * @param result_space_tiles The result space tiles computed by the
195    *     function, which store the result tiles from the dense fragments.
196    * @param result_tiles This will store pointers to the result tiles
197    *     of both dense and sparse fragments.
198    * @param result_cell_slabs The returned result cell slabs.
199    * @return Status
200    */
201   template <class T>
202   Status compute_result_cell_slabs_global(
203       const Subarray& subarray,
204       std::map<const T*, ResultSpaceTile<T>>* result_space_tiles,
205       std::vector<ResultCoords>* result_coords,
206       std::vector<ResultTile*>* result_tiles,
207       std::vector<ResultCellSlab>* result_cell_slabs) const;
208 
209  private:
210   /* ********************************* */
211   /*         PRIVATE ATTRIBUTES        */
212   /* ********************************* */
213 
214   /** UID of the logger instance */
215   inline static std::atomic<uint64_t> logger_id_ = 0;
216 
217   /** Read state. */
218   ReadState read_state_;
219 
220   /* ********************************* */
221   /*           PRIVATE METHODS         */
222   /* ********************************* */
223 
224   /**
225    * Compute the maximal cell slabs of contiguous sparse coordinates.
226    *
227    * @param coords The coordinates to compute the slabs from.
228    * @param result_cell_slabs The result cell slabs to compute.
229    * @return Status
230    */
231   Status compute_result_cell_slabs(
232       const std::vector<ResultCoords>& result_coords,
233       std::vector<ResultCellSlab>* result_cell_slabs) const;
234 
235   /**
236    * Retrieves the coordinates that overlap the input N-dimensional range
237    * from the input result tile.
238    *
239    * @param subarray The subarray to operate on.
240    * @param frag_idx The id of the fragment that the result tile belongs to.
241    * @param tile The result tile.
242    * @param range_idx The range id.
243    * @param result_coords The overlapping coordinates to retrieve.
244    * @return Status
245    */
246   Status compute_range_result_coords(
247       Subarray* subarray,
248       unsigned frag_idx,
249       ResultTile* tile,
250       uint64_t range_idx,
251       std::vector<ResultCoords>* result_coords);
252 
253   /**
254    * Computes the result coordinates for each range of the query
255    * subarray.
256    *
257    * @param subarray The subarray to operate on.
258    * @param single_fragment For each range, it indicates whether all
259    *     result coordinates come from a single fragment.
260    * @param result_tile_map This is an auxialiary map that helps finding the
261    *     result tiles of each range.
262    * @param result_tiles The result tiles to read the coordinates from.
263    * @param range_result_coords The result coordinates to be retrieved.
264    *     It contains a vector for each range of the subarray.
265    * @return Status
266    */
267   Status compute_range_result_coords(
268       Subarray* subarray,
269       const std::vector<bool>& single_fragment,
270       const std::map<std::pair<unsigned, uint64_t>, size_t>& result_tile_map,
271       std::vector<ResultTile>* result_tiles,
272       std::vector<std::vector<ResultCoords>>* range_result_coords);
273 
274   /**
275    * Computes the result coordinates of a given range of the query
276    * subarray.
277    *
278    * @param subarray The subarray to operate on.
279    * @param range_idx The range to focus on.
280    * @param result_tile_map This is an auxialiary map that helps finding the
281    *     result_tiles overlapping with each range.
282    * @param result_tiles The result tiles to read the coordinates from.
283    * @param range_result_coords The result coordinates to be retrieved.
284    *     It contains a vector for each range of the subarray.
285    * @return Status
286    */
287   Status compute_range_result_coords(
288       Subarray* subarray,
289       uint64_t range_idx,
290       const std::map<std::pair<unsigned, uint64_t>, size_t>& result_tile_map,
291       std::vector<ResultTile>* result_tiles,
292       std::vector<ResultCoords>* range_result_coords);
293 
294   /**
295    * Computes the result coordinates of a given range of the query
296    * subarray.
297    *
298    * @param subarray The subarray to operate on.
299    * @param range_idx The range to focus on.
300    * @param fragment_idx The fragment to focus on.
301    * @param result_tile_map This is an auxialiary map that helps finding the
302    *     result_tiles overlapping with each range.
303    * @param result_tiles The result tiles to read the coordinates from.
304    * @param range_result_coords The result coordinates to be retrieved.
305    *     It contains a vector for each range of the subarray.
306    * @return Status
307    */
308   Status compute_range_result_coords(
309       Subarray* subarray,
310       uint64_t range_idx,
311       uint32_t fragment_idx,
312       const std::map<std::pair<unsigned, uint64_t>, size_t>& result_tile_map,
313       std::vector<ResultTile>* result_tiles,
314       std::vector<ResultCoords>* range_result_coords);
315 
316   /**
317    * Computes the final subarray result coordinates, which will be
318    * deduplicated and sorted on the specified subarray layout.
319    *
320    * @param range_result_coords The result coordinates for each subarray range.
321    * @param result_coords The final (subarray) result coordinates to be
322    *     retrieved.
323    * @return Status
324    *
325    * @note the function will try to gradually clean up ``range_result_coords``
326    *     as it is done processing its elements to quickly reclaim memory.
327    */
328   Status compute_subarray_coords(
329       std::vector<std::vector<ResultCoords>>* range_result_coords,
330       std::vector<ResultCoords>* result_coords);
331 
332   /**
333    * Computes info about the sparse result tiles, such as which fragment they
334    * belong to, the tile index and the type of overlap. The tile vector
335    * contains unique info about the tiles. The function also computes
336    * a map from fragment index and tile id to a result tile to keep
337    * track of the unique result  tile info for subarray ranges that overlap
338    * with common tiles.
339    *
340    * @param result_tiles The result tiles to be computed.
341    * @param result_tile_map The result tile map to be computed.
342    * @param single_fragment Each element corresponds to a range of the
343    *     subarray and is set to ``true`` if all the overlapping
344    *     tiles come from a single fragment for that range.
345    * @return Status
346    */
347   Status compute_sparse_result_tiles(
348       std::vector<ResultTile>* result_tiles,
349       std::map<std::pair<unsigned, uint64_t>, size_t>* result_tile_map,
350       std::vector<bool>* single_fragment);
351 
352   /**
353    * Computes the result coordinates from the sparse fragments.
354    *
355    * @param result_tiles This will store the unique result tiles.
356    * @param result_coords This will store the result coordinates.
357    */
358   Status compute_result_coords(
359       std::vector<ResultTile>* result_tiles,
360       std::vector<ResultCoords>* result_coords);
361 
362   /**
363    * Deduplicates the input result coordinates, breaking ties giving preference
364    * to the largest fragment index (i.e., it prefers more recent fragments).
365    *
366    * @param result_coords The result coordinates to dedup.
367    * @return Status
368    */
369   Status dedup_result_coords(std::vector<ResultCoords>* result_coords) const;
370 
371   /** Performs a read on a dense array. */
372   Status dense_read();
373 
374   /**
375    * Performs a read on a dense array.
376    *
377    * @tparam The domain type.
378    * @return Status
379    */
380   template <class T>
381   Status dense_read();
382 
383   /**
384    * Gets all the result coordinates of the input tile into `result_coords`.
385    *
386    * @param result_tile The result tile to read the coordinates from.
387    * @param result_coords The result coordinates to copy into.
388    * @return Status
389    */
390   Status get_all_result_coords(
391       ResultTile* tile, std::vector<ResultCoords>* result_coords) const;
392 
393   /**
394    * Returns `true` if a coordinate buffer for a separate dimension
395    * has been set.
396    */
397   bool has_separate_coords() const;
398 
399   /** Initializes the read state. */
400   Status init_read_state();
401 
402   /**
403    * Sorts the input result coordinates according to the subarray layout.
404    *
405    * @param iter_begin The start position of the coordinates to sort.
406    * @param iter_end The end position of the coordinates to sort.
407    * @param coords_num The number of coordinates to be sorted.
408    * @param layout The layout to sort into.
409    * @return Status
410    */
411   Status sort_result_coords(
412       std::vector<ResultCoords>::iterator iter_begin,
413       std::vector<ResultCoords>::iterator iter_end,
414       size_t coords_num,
415       Layout layout) const;
416 
417   /** Performs a read on a sparse array. */
418   Status sparse_read();
419 
420   /**
421    * Adds an extra offset in the end of the offsets buffer indicating the
422    * returned data size if an attribute is var-sized.
423    */
424   Status add_extra_offset();
425 
426   /**
427    * Returns true if the input tile's MBR of the input fragment is fully
428    * covered by the non-empty domain of a more recent fragment.
429    */
430   bool sparse_tile_overwritten(unsigned frag_idx, uint64_t tile_idx) const;
431 
432   /**
433    * Erases the coordinate tiles (zipped or separate) from the input result
434    * tiles.
435    */
436   void erase_coord_tiles(std::vector<ResultTile>* result_tiles) const;
437 
438   /** Gets statistics about the result cells. */
439   void get_result_cell_stats(
440       const std::vector<ResultCellSlab>& result_cell_slabs) const;
441 
442   /** Gets statistics about the result tiles. */
443   void get_result_tile_stats(
444       const std::vector<ResultTile*>& result_tiles) const;
445 
446   /**
447    * Calculates the hilbert values of the result coordinates between
448    * `iter_begin` and `iter_begin + hilbert_values.size()`.
449    * The hilbert values are stored
450    * in `hilbert_values`, where the first pair value is the hilbert value
451    * and the second is the position of the result coords after the
452    * input iterator.
453    */
454   Status calculate_hilbert_values(
455       std::vector<ResultCoords>::iterator iter_begin,
456       std::vector<std::pair<uint64_t, uint64_t>>* hilbert_values) const;
457 
458   /**
459    * It reorganizes the result coords given the iterator offsets in
460    * `hilbert_values` (second values in the pair). This essentially
461    * sorts the result coordinates starting at `iter_begin` based
462    * on the already sorted hilbert values.
463    *
464    * The algorithm is in-place, operates with O(1) memory and
465    * in O(coords_num) time, but modifies the offsets/positions in
466    * `hilbert_values`.
467    */
468   Status reorganize_result_coords(
469       std::vector<ResultCoords>::iterator iter_begin,
470       std::vector<std::pair<uint64_t, uint64_t>>* hilbert_values) const;
471 
472   /**
473    * Returns true if the result coordinates between the two iterators
474    * belong to the same fragment.
475    */
476   bool belong_to_single_fragment(
477       std::vector<ResultCoords>::iterator iter_begin,
478       std::vector<ResultCoords>::iterator iter_end) const;
479 
480   /** Perform necessary checks before exiting a read loop */
481   Status complete_read_loop();
482 };
483 
484 }  // namespace sm
485 }  // namespace tiledb
486 
487 #endif  // TILEDB_READER_H
488