1 /**
2  * @file   storage_manager.h
3  *
4  * @section LICENSE
5  *
6  * The MIT License
7  *
8  * @copyright Copyright (c) 2017-2021 TileDB, Inc.
9  * @copyright Copyright (c) 2016 MIT and Intel Corporation
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a copy
12  * of this software and associated documentation files (the "Software"), to deal
13  * in the Software without restriction, including without limitation the rights
14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15  * copies of the Software, and to permit persons to whom the Software is
16  * furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included in
19  * all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
27  * THE SOFTWARE.
28  *
29  * @section DESCRIPTION
30  *
31  * This file defines class StorageManager.
32  */
33 
34 #ifndef TILEDB_STORAGE_MANAGER_H
35 #define TILEDB_STORAGE_MANAGER_H
36 
37 #include <atomic>
38 #include <condition_variable>
39 #include <functional>
40 #include <list>
41 #include <map>
42 #include <mutex>
43 #include <queue>
44 #include <string>
45 #include <thread>
46 #include <unordered_map>
47 
48 #include "tiledb/common/heap_memory.h"
49 #include "tiledb/common/logger_public.h"
50 #include "tiledb/common/status.h"
51 #include "tiledb/common/thread_pool.h"
52 #include "tiledb/sm/config/config.h"
53 #include "tiledb/sm/enums/walk_order.h"
54 #include "tiledb/sm/filesystem/filelock.h"
55 #include "tiledb/sm/fragment/single_fragment_info.h"
56 #include "tiledb/sm/misc/cancelable_tasks.h"
57 #include "tiledb/sm/misc/uri.h"
58 #include "tiledb/sm/stats/global_stats.h"
59 
60 using namespace tiledb::common;
61 
62 namespace tiledb {
63 namespace sm {
64 
65 class Array;
66 class ArraySchema;
67 class ArraySchemaEvolution;
68 class Buffer;
69 class BufferLRUCache;
70 class Consolidator;
71 class EncryptionKey;
72 class FragmentMetadata;
73 class FragmentInfo;
74 class Metadata;
75 class OpenArray;
76 class OpenArrayMemoryTracker;
77 class Query;
78 class RestClient;
79 class VFS;
80 
81 enum class EncryptionType : uint8_t;
82 enum class ObjectType : uint8_t;
83 
84 /** The storage manager that manages pretty much everything in TileDB. */
85 class StorageManager {
86  public:
87   /* ********************************* */
88   /*          TYPE DEFINITIONS         */
89   /* ********************************* */
90 
91   /** Enables iteration over TileDB objects in a path. */
92   class ObjectIter {
93    public:
94     /**
95      * There is a one-to-one correspondence between `expanded_` and `objs_`.
96      * An `expanded_` value is `true` if the corresponding `objs_` path
97      * has been expanded to the paths it contains in a postorder traversal.
98      * This is not used in a preorder traversal.
99      */
100     std::list<bool> expanded_;
101     /** The next URI in string format. */
102     std::string next_;
103     /** The next objects to be visited. */
104     std::list<URI> objs_;
105     /** The traversal order of the iterator. */
106     WalkOrder order_;
107     /** `True` if the iterator will recursively visit the directory tree. */
108     bool recursive_;
109   };
110 
111   /* ********************************* */
112   /*     CONSTRUCTORS & DESTRUCTORS    */
113   /* ********************************* */
114 
115   /** Constructor. */
116   StorageManager(
117       ThreadPool* compute_tp,
118       ThreadPool* io_tp,
119       stats::Stats* parent_stats,
120       tdb_shared_ptr<Logger> logger);
121 
122   /** Destructor. */
123   ~StorageManager();
124 
125   DISABLE_COPY_AND_COPY_ASSIGN(StorageManager);
126   DISABLE_MOVE_AND_MOVE_ASSIGN(StorageManager);
127 
128   /* ********************************* */
129   /*                API                */
130   /* ********************************* */
131 
132   /**
133    * Closes an array opened for reads.
134    *
135    * @param array_uri The array URI
136    * @return Status
137    */
138   Status array_close_for_reads(const URI& array_uri);
139 
140   /**
141    * Closes an array opened for writes.
142    *
143    * @param array_uri The array URI.
144    * @param encryption_key The array encryption key.
145    * @param array_metadata The array metadata, which the function
146    *     flushes to persistent storage.
147    * @return Status
148    */
149   Status array_close_for_writes(
150       const URI& array_uri,
151       const EncryptionKey& encryption_key,
152       Metadata* array_metadata);
153 
154   /**
155    * Opens an array for reads at a timestamp. All the metadata of the
156    * fragments created before or at the input timestamp are retrieved.
157    *
158    * If a timestamp_start is provided, this API will open the array between
159    * `timestamp_start` and `timestamp_end`.
160    *
161    * @param array_uri The array URI.
162    * @param enc_key The encryption key to use.
163    * @param timestamp_start The (optional) starting timestamp to open the array
164    * between, starting at this timestamp and ending at `timestamp_end`.
165    * @param timestamp_end The timestamp at which the array will be opened.
166    *     In TileDB, timestamps are in ms elapsed since
167    *     1970-01-01 00:00:00 +0000 (UTC).
168    * @return tuple of Status, latest ArraySchema, map of all array schemas and
169    * vector of FragmentMetadata
170    *        Status Ok on success, else error
171    *        ArraySchema The array schema to be retrieved after the
172    *           array is opened.
173    *        ArraySchemaMap Map of all array schemas found keyed by name
174    *        fragment_metadata The fragment metadata to be retrieved
175    *           after the array is opened.
176    */
177   std::tuple<
178       Status,
179       std::optional<ArraySchema*>,
180       std::optional<
181           std::unordered_map<std::string, tdb_shared_ptr<ArraySchema>>>,
182       std::optional<std::vector<tdb_shared_ptr<FragmentMetadata>>>>
183   array_open_for_reads(
184       const URI& array_uri,
185       const EncryptionKey& enc_key,
186       uint64_t timestamp_start,
187       uint64_t timestamp_end);
188 
189   /**
190    * Opens an array for reads without fragments.
191    *
192    * @param array_uri The array URI.
193    * @param enc_key The encryption key to use.
194    * @return tuple of Status, latest ArraySchema and map of all array schemas
195    *        Status Ok on success, else error
196    *        ArraySchema The array schema to be retrieved after the
197    *          array is opened.
198    *        ArraySchemaMap Map of all array schemas found keyed by name
199    */
200   std::tuple<
201       Status,
202       std::optional<ArraySchema*>,
203       std::optional<
204           std::unordered_map<std::string, tdb_shared_ptr<ArraySchema>>>>
205   array_open_for_reads_without_fragments(
206       const URI& array_uri, const EncryptionKey& enc_key);
207 
208   /** Opens an array for writes.
209    *
210    * @param array_uri The array URI.
211    * @param enc_key The encryption key.
212    * @return tuple of Status, latest ArraySchema and map of all array schemas
213    *        Status Ok on success, else error
214    *        ArraySchema The array schema to be retrieved after the
215    *          array is opened.
216    *        ArraySchemaMap Map of all array schemas found keyed by name
217    */
218   std::tuple<
219       Status,
220       std::optional<ArraySchema*>,
221       std::optional<
222           std::unordered_map<std::string, tdb_shared_ptr<ArraySchema>>>>
223   array_open_for_writes(const URI& array_uri, const EncryptionKey& enc_key);
224 
225   /**
226    * Load fragments for an already open array.
227    *
228    * @param array_uri The array URI.
229    * @param enc_key The encryption key to use.
230    * @param fragment_metadata The fragment metadata to be retrieved
231    *          after the array is opened.
232    * @param fragment_info The list of fragment info.
233    * @return Status
234    */
235   Status array_load_fragments(
236       const URI& array_uri,
237       const EncryptionKey& enc_key,
238       std::vector<tdb_shared_ptr<FragmentMetadata>>* fragment_metadata,
239       const std::vector<TimestampedURI>& fragment_info);
240 
241   /**
242    * Reopens an already open array at a potentially new timestamp,
243    * retrieving the fragment metadata of any new fragments written
244    * in the array.
245    *
246    * @param array_uri The array URI.
247    * @param enc_key The encryption key to use.
248    * @param timestamp_start The optional first timestamp between which the
249    *     array will be opened.
250    * @param timestamp_end The timestamp at which the array will be opened.
251    *     In TileDB, timestamps are in ms elapsed since
252    *     1970-01-01 00:00:00 +0000 (UTC).
253    * @return tuple of Status, latest ArraySchema, map of all array schemas and
254    * vector of FragmentMetadata
255    *        Status Ok on success, else error
256    *        ArraySchema The array schema to be retrieved after the
257    *          array is opened.
258    *        ArraySchemaMap Map of all array schemas found keyed by name
259    *        FragmentMetadata The fragment metadata to be retrieved
260    *          after the array is opened.
261    */
262   std::tuple<
263       Status,
264       std::optional<ArraySchema*>,
265       std::optional<
266           std::unordered_map<std::string, tdb_shared_ptr<ArraySchema>>>,
267       std::optional<std::vector<tdb_shared_ptr<FragmentMetadata>>>>
268   array_reopen(
269       const URI& array_uri,
270       const EncryptionKey& enc_key,
271       uint64_t timestamp_start,
272       uint64_t timestamp_end);
273 
274   /**
275    * Consolidates the fragments of an array into a single one.
276    *
277    * @param array_name The name of the array to be consolidated.
278    * @param encryption_type The encryption type of the array
279    * @param encryption_key If the array is encrypted, the private encryption
280    *    key. For unencrypted arrays, pass `nullptr`.
281    * @param key_length The length in bytes of the encryption key.
282    * @param config Configuration parameters for the consolidation
283    *     (`nullptr` means default, which will use the config associated with
284    *      this instance).
285    * @return Status
286    */
287   Status array_consolidate(
288       const char* array_name,
289       EncryptionType encryption_type,
290       const void* encryption_key,
291       uint32_t key_length,
292       const Config* config);
293 
294   /**
295    * Cleans up the array, such as its consolidated fragments and array
296    * metadata. Note that this will coarsen the granularity of time traveling
297    * (see docs for more information).
298    *
299    * @param array_name The name of the array to be vacuumed.
300    * @param config Configuration parameters for vacuuming.
301    * @return Status
302    */
303   Status array_vacuum(const char* array_name, const Config* config);
304 
305   /**
306    * Cleans up fragments that took part in consolidation. Note that this
307    * will coarsen the granularity of time traveling (see docs for more
308    * information).
309    *
310    * @param array_name The name of the array to be vacuumed.
311    * @param timestamp_start The timestamp to start vacuuming at.
312    * @param timestamp_end The timestamp to end vacuuming at.
313    * @return Status
314    */
315   Status array_vacuum_fragments(
316       const char* array_name, uint64_t timestamp_start, uint64_t timestamp_end);
317 
318   /**
319    * Cleans up consolidated fragment metadata (all except the last one).
320    *
321    * @param array_name The name of the array to be vacuumed.
322    * @return Status
323    */
324   Status array_vacuum_fragment_meta(const char* array_name);
325 
326   /**
327    * Cleans up consolidated array metadata.
328    *
329    * @param array_name The name of the array to be vacuumed.
330    * @param timestamp_start The timestamp to start vacuuming at.
331    * @param timestamp_end The timestamp to end vacuuming at.
332    * @return Status
333    */
334   Status array_vacuum_array_meta(
335       const char* array_name, uint64_t timestamp_start, uint64_t timestamp_end);
336 
337   /**
338    * Consolidates the metadata of an array into a single file.
339    *
340    * @param array_name The name of the array whose metadata will be
341    *     consolidated.
342    * @param encryption_type The encryption type of the array
343    * @param encryption_key If the array is encrypted, the private encryption
344    *    key. For unencrypted arrays, pass `nullptr`.
345    * @param key_length The length in bytes of the encryption key.
346    * @param config Configuration parameters for the consolidation
347    *     (`nullptr` means default, which will use the config associated with
348    *      this instance).
349    * @return Status
350    */
351   Status array_metadata_consolidate(
352       const char* array_name,
353       EncryptionType encryption_type,
354       const void* encryption_key,
355       uint32_t key_length,
356       const Config* config);
357 
358   /**
359    * Creates a TileDB array storing its schema.
360    *
361    * @param array_uri The URI of the array to be created.
362    * @param array_schema The array schema.
363    * @param encryption_key The encryption key to use.
364    * @return Status
365    */
366   Status array_create(
367       const URI& array_uri,
368       ArraySchema* array_schema,
369       const EncryptionKey& encryption_key);
370 
371   /**
372    * Evolve a TileDB array schema and store its new schema.
373    *
374    * @param array_uri The URI of the array to be evolved.
375    * @param array_schema The schema evolution.
376    * @param encryption_key The encryption key to use.
377    * @return Status
378    */
379   Status array_evolve_schema(
380       const URI& array_uri,
381       ArraySchemaEvolution* array_schema,
382       const EncryptionKey& encryption_key);
383 
384   /**
385    * Upgrade a TileDB array to latest format version.
386    *
387    * @param array_uri The URI of the array to be upgraded.
388    * @param config Configuration parameters for the upgrade
389    *     (`nullptr` means default, which will use the config associated with
390    *      this instance).
391    * @return Status
392    */
393   Status array_upgrade_version(const URI& array_uri, const Config* config);
394 
395   /**
396    * Gets the memory tracker for an open array.
397    *
398    * @param array_uri The array URI.
399    * @param top_level Specifies if the call is the top level call in recursion.
400    * @return The memory tracker.
401    */
402   OpenArrayMemoryTracker* array_memory_tracker(
403       const URI& array_uri, bool top_level = true);
404 
405   /**
406    * Retrieves the non-empty domain from an array. This is the union of the
407    * non-empty domains of the array fragments.
408    *
409    * @param array An open array object (must be already open).
410    * @param domain The domain to be retrieved.
411    * @param is_empty `true` if the non-empty domain is empty (the array
412    *     is empty).
413    * @return Status
414    */
415   Status array_get_non_empty_domain(
416       Array* array, NDRange* domain, bool* is_empty);
417 
418   /**
419    * Retrieves the non-empty domain from an array. This is the union of the
420    * non-empty domains of the array fragments.
421    *
422    * @param array An open array object (must be already open).
423    * @param domain The domain to be retrieved.
424    * @param is_empty `true` if the non-empty domain is empty (the array
425    *     is empty).
426    * @return Status
427    */
428   Status array_get_non_empty_domain(Array* array, void* domain, bool* is_empty);
429 
430   /**
431    * Retrieves the non-empty domain from an array on the given dimension.
432    * This is the union of the non-empty domains of the array fragments.
433    *
434    * @param array An open array object (must be already open).
435    * @param idx The dimension index.
436    * @param domain The domain to be retrieved.
437    * @param is_empty `true` if the non-empty domain is empty (the array
438    *     is empty).
439    * @return Status
440    */
441   Status array_get_non_empty_domain_from_index(
442       Array* array, unsigned idx, void* domain, bool* is_empty);
443 
444   /**
445    * Retrieves the non-empty domain from an array on the given dimension.
446    * This is the union of the non-empty domains of the array fragments.
447    *
448    * @param array An open array object (must be already open).
449    * @param name The dimension name.
450    * @param domain The domain to be retrieved.
451    * @param is_empty `true` if the non-empty domain is empty (the array
452    *     is empty).
453    * @return Status
454    */
455   Status array_get_non_empty_domain_from_name(
456       Array* array, const char* name, void* domain, bool* is_empty);
457 
458   /**
459    * Retrieves the non-empty domain size from an array on the given dimension.
460    * This is the union of the non-empty domains of the array fragments.
461    * Applicable only to var-sized dimensions.
462    *
463    * @param array An open array object (must be already open).
464    * @param idx The dimension index.
465    * @param start_size The size in bytes of the range start.
466    * @param end_size The size in bytes of the range end.
467    * @param is_empty `true` if the non-empty domain is empty (the array
468    *     is empty).
469    * @return Status
470    */
471   Status array_get_non_empty_domain_var_size_from_index(
472       Array* array,
473       unsigned idx,
474       uint64_t* start_size,
475       uint64_t* end_size,
476       bool* is_empty);
477 
478   /**
479    * Retrieves the non-empty domain size from an array on the given dimension.
480    * This is the union of the non-empty domains of the array fragments.
481    * Applicable only to var-sized dimensions.
482    *
483    * @param array An open array object (must be already open).
484    * @param name The dimension name.
485    * @param start_size The size in bytes of the range start.
486    * @param end_size The size in bytes of the range end.
487    * @param is_empty `true` if the non-empty domain is empty (the array
488    *     is empty).
489    * @return Status
490    */
491   Status array_get_non_empty_domain_var_size_from_name(
492       Array* array,
493       const char* name,
494       uint64_t* start_size,
495       uint64_t* end_size,
496       bool* is_empty);
497 
498   /**
499    * Retrieves the non-empty domain from an array on the given dimension.
500    * This is the union of the non-empty domains of the array fragments.
501    * Applicable only to var-sized dimensions.
502    *
503    * @param array An open array object (must be already open).
504    * @param idx The dimension index.
505    * @param start The domain range start to set.
506    * @param end The domain range end to set.
507    * @param is_empty `true` if the non-empty domain is empty (the array
508    *     is empty).
509    * @return Status
510    */
511   Status array_get_non_empty_domain_var_from_index(
512       Array* array, unsigned idx, void* start, void* end, bool* is_empty);
513 
514   /**
515    * Retrieves the non-empty domain from an array on the given dimension.
516    * This is the union of the non-empty domains of the array fragments.
517    * Applicable only to var-sized dimensions.
518    *
519    * @param array An open array object (must be already open).
520    * @param name The dimension name.
521    * @param start The domain range start to set.
522    * @param end The domain range end to set.
523    * @param is_empty `true` if the non-empty domain is empty (the array
524    *     is empty).
525    * @return Status
526    */
527   Status array_get_non_empty_domain_var_from_name(
528       Array* array, const char* name, void* start, void* end, bool* is_empty);
529 
530   /**
531    * Retrieves the encryption type from an array.
532    *
533    * @param array_uri URI of the array
534    * @param encryption_type Set to the encryption type of the array.
535    * @return Status
536    */
537   Status array_get_encryption(
538       const std::string& array_uri, EncryptionType* encryption_type);
539 
540   /**
541    * Exclusively locks an array preventing it from being opened in
542    * read mode. This function will wait on the array to
543    * be closed if it is already open (always in read mode). After an array
544    * is xlocked, any attempt to open an array in read mode will have to wait
545    * until the array is unlocked with `array_xunlock`.
546    *
547    * An array is exclusively locked only for a short time upon consolidation,
548    * during removing the directories of the old fragments that got consolidated.
549    *
550    * @note Arrays that are opened in write mode need not be xlocked. The
551    *     reason is that the `OpenArray` objects created when opening
552    *     in write mode do not store any fragment metadata and, hence,
553    *     are not affected by a potentially concurrent consolidator deleting
554    *     fragment directories.
555    */
556   Status array_xlock(const URI& array_uri);
557 
558   /** Releases an exclusive lock for the input array. */
559   Status array_xunlock(const URI& array_uri);
560 
561   /**
562    * Pushes an async query to the queue.
563    *
564    * @param query The async query.
565    * @return Status
566    */
567   Status async_push_query(Query* query);
568 
569   /** Cancels all background tasks. */
570   Status cancel_all_tasks();
571 
572   /** Returns true while all tasks are being cancelled. */
573   bool cancellation_in_progress();
574 
575   /** Returns the configuration parameters. */
576   const Config& config() const;
577 
578   /** Creates a directory with the input URI. */
579   Status create_dir(const URI& uri);
580 
581   /** Creates an empty file with the input URI. */
582   Status touch(const URI& uri);
583 
584   /**
585    * Gets the fragment information for a given array at a particular
586    * timestamp.
587    *
588    * @param array The array.
589    * @param timestamp_start The function will consider fragments created
590    *     at or after this timestamp.
591    * @param timestamp_end The function will consider fragments created
592    *     at or before this timestamp.
593    * @param fragment_info The fragment information to be retrieved.
594    *     The fragments are sorted in chronological creation order.
595    * @param get_to_vacuum Whether or not to receive information about
596    *     fragments to vacuum.
597    * @return Status
598    */
599   Status get_fragment_info(
600       const Array& array,
601       uint64_t timestamp_start,
602       uint64_t timestamp_end,
603       FragmentInfo* fragment_info,
604       bool get_to_vacuum = false);
605 
606   /**
607    * Gets the fragment info for a single fragment URI.
608    *
609    * @param array The array.
610    * @param fragment_uri The fragment URI.
611    * @param fragment_info The fragment info to retrieve.
612    * @return Status
613    */
614   Status get_fragment_info(
615       const Array& array,
616       const URI& fragment_uri,
617       SingleFragmentInfo* fragment_info);
618 
619   /**
620    * Retrieves all the fragment URIs of an array, along with the latest
621    * consolidated fragment metadata URI `meta_uri`.
622    */
623   Status get_fragment_uris(
624       const URI& array_uri,
625       std::vector<URI>* fragment_uris,
626       URI* meta_uri) const;
627 
628   /** Returns the current map of any set tags. */
629   const std::unordered_map<std::string, std::string>& tags() const;
630 
631   /**
632    * Creates a TileDB group.
633    *
634    * @param group The URI of the group to be created.
635    * @return Status
636    */
637   Status group_create(const std::string& group);
638 
639   /**
640    * Initializes the storage manager.
641    *
642    * @param config The configuration parameters.
643    * @return Status
644    */
645   Status init(const Config* config);
646 
647   /** Returns the thread pool for compute-bound tasks. */
648   ThreadPool* compute_tp();
649 
650   /** Returns the thread pool for io-bound tasks. */
651   ThreadPool* io_tp();
652 
653   /**
654    * If the storage manager was configured with a REST server, return the
655    * client instance. Else, return nullptr.
656    */
657   RestClient* rest_client() const;
658 
659   /**
660    * Checks if the input URI represents an array.
661    *
662    * @param uri The URI to be checked.
663    * @param is_array Set to `true` if the URI is an array and `false` otherwise.
664    * @return Status
665    */
666   Status is_array(const URI& uri, bool* is_array) const;
667 
668   /**
669    * Checks if the input URI represents a directory.
670    *
671    * @param uri The URI to be checked.
672    * @param is_dir Set to `true` if the URI is a directory and `false`
673    *     otherwise.
674    * @return Status
675    */
676   Status is_dir(const URI& uri, bool* is_dir) const;
677 
678   /**
679    * Checks if the input URI represents a fragment. The function takes into
680    * account the fragment version, which is retrieved directly from the URI.
681    * For versions >= 5, the function checks whether the URI is included
682    * in `ok_uris`. For versions < 5, `ok_uris` is empty so the function
683    * checks for the existence of the fragment metadata file in the fragment
684    * URI directory. Therefore, the function is more expensive for earlier
685    * fragment versions.
686    *
687    * @param uri The URI to be checked.
688    * @param ok_uris For checking URI existence of versions >= 5.
689    * @param is_fragment Set to `1` if the URI is a fragment and `0`
690    *     otherwise.
691    * @return Status
692    */
693   Status is_fragment(
694       const URI& uri, const std::set<URI>& ok_uris, int* is_fragment) const;
695 
696   /**
697    * Checks if the input URI represents a group.
698    *
699    * @param uri The URI to be checked.
700    * @param is_group Set to `true` if the URI is a group and `false`
701    *     otherwise.
702    * @return Status
703    */
704   Status is_group(const URI& uri, bool* is_group) const;
705 
706   /**
707    * Checks if the input URI represents a file.
708    *
709    * @param uri The URI to be checked.
710    * @param is_file Set to `true` if the URI is a file and `false`
711    *     otherwise.
712    * @return Status
713    */
714   Status is_file(const URI& uri, bool* is_file) const;
715 
716   /**
717    * Checks whether a URI is a vacuum file, based on the file suffix.
718    * @param uri The URI to be checked.
719    * @return `true` if the URI is a vacuum file, `false` otherwise.
720    */
721   bool is_vacuum_file(const URI& uri) const;
722 
723   /**
724    * Retrieves all array schema URIs of an array under its __schema directory.
725    *
726    * @param array_uri The URI path of the array.
727    * @param schema_uris The vector of array schema URIs sorted from earliest
728    *     to the latest.
729    * @return Status
730    */
731   Status get_array_schema_uris(
732       const URI& array_uri, std::vector<URI>* schema_uris) const;
733 
734   /**
735    * Get latest array schema for an array uri.
736    *
737    * @param array_uri The URI path of the array.
738    * @param schema_uri The latest array schema URI.
739    * @return Status
740    */
741   Status get_latest_array_schema_uri(
742       const URI& array_uri, URI* schema_uri) const;
743 
744   /**
745    * Loads the array schema stored at `array_schema_uri` from persistent
746    * storage into memory.
747    *
748    * @param array_schema_uri The URI path of the array schema.
749    * @param encryption_key The encryption key to use.
750    * @param array_schema The array schema to be retrieved.
751    * @return Status
752    */
753   Status load_array_schema_from_uri(
754       const URI& array_schema_uri,
755       const EncryptionKey& encryption_key,
756       ArraySchema** array_schema);
757 
758   /**
759    * Loads the schema of an array from persistent storage into memory.
760    *
761    * @param array_uri The URI path of the array.
762    * @param encryption_key The encryption key to use.
763    * @param array_schema The array schema to be retrieved.
764    * @return Status
765    */
766   Status load_array_schema(
767       const URI& array_uri,
768       const EncryptionKey& encryption_key,
769       ArraySchema** array_schema);
770 
771   /**
772    * Loads all schemas of an array from persistent storage into memory.
773    *
774    * @param array_uri The URI path of the array.
775    * @param encryption_key The encryption key to use.
776    * @return tuple of Status and optional unordered map. If Status is an error
777    * the unordered_map will be nullopt
778    *        Status Ok on success, else error
779    *        ArraySchemaMap Map of all array schemas found keyed by name
780    */
781   std::tuple<
782       Status,
783       std::optional<
784           std::unordered_map<std::string, tdb_shared_ptr<ArraySchema>>>>
785   load_all_array_schemas(
786       const URI& array_uri, const EncryptionKey& encryption_key);
787 
788   /**
789    * Loads the array metadata from persistent storage that were created
790    * at or before `timestamp_end` and at or after `timestamp_start`.
791    */
792   Status load_array_metadata(
793       const URI& array_uri,
794       const EncryptionKey& encryption_key,
795       uint64_t timestamp_start,
796       uint64_t timestamp_end,
797       Metadata* metadata);
798 
799   /** Removes a TileDB object (group, array). */
800   Status object_remove(const char* path) const;
801 
802   /**
803    * Renames a TileDB object (group, array). If
804    * `new_path` exists, `new_path` will be overwritten.
805    */
806   Status object_move(const char* old_path, const char* new_path) const;
807 
808   /**
809    * Creates a new object iterator for the input path. The iteration
810    * in this case will be recursive in the entire directory tree rooted
811    * at `path`.
812    *
813    * @param obj_iter The object iterator to be created (memory is allocated for
814    *     it by the function).
815    * @param path The path the iterator will target at.
816    * @param order The traversal order of the iterator.
817    * @return Status
818    */
819   Status object_iter_begin(
820       ObjectIter** obj_iter, const char* path, WalkOrder order);
821 
822   /**
823    * Creates a new object iterator for the input path. The iteration will
824    * not be recursive, and only the children of `path` will be visited.
825    *
826    * @param obj_iter The object iterator to be created (memory is allocated for
827    *     it by the function).
828    * @param path The path the iterator will target at.
829    * @return Status
830    */
831   Status object_iter_begin(ObjectIter** obj_iter, const char* path);
832 
833   /** Frees the object iterator. */
834   void object_iter_free(ObjectIter* obj_iter);
835 
836   /**
837    * Retrieves the next object path and type.
838    *
839    * @param obj_iter The object iterator.
840    * @param path The object path that is retrieved.
841    * @param type The object type that is retrieved.
842    * @param has_next True if an object path was retrieved and false otherwise.
843    * @return Status
844    */
845   Status object_iter_next(
846       ObjectIter* obj_iter,
847       const char** path,
848       ObjectType* type,
849       bool* has_next);
850 
851   /**
852    * Retrieves the next object in the post-order traversal.
853    *
854    * @param obj_iter The object iterator.
855    * @param path The object path that is retrieved.
856    * @param type The object type that is retrieved.
857    * @param has_next True if an object path was retrieved and false otherwise.
858    * @return Status
859    */
860   Status object_iter_next_postorder(
861       ObjectIter* obj_iter,
862       const char** path,
863       ObjectType* type,
864       bool* has_next);
865 
866   /**
867    * Retrieves the next object in the post-order traversal.
868    *
869    * @param obj_iter The object iterator.
870    * @param path The object path that is retrieved.
871    * @param type The object type that is retrieved.
872    * @param has_next True if an object path was retrieved and false otherwise.
873    * @return Status
874    */
875   Status object_iter_next_preorder(
876       ObjectIter* obj_iter,
877       const char** path,
878       ObjectType* type,
879       bool* has_next);
880 
881   /**
882    * Returns the tiledb object type
883    *
884    * @param uri Path to TileDB object resource
885    * @param type The ObjectType to be retrieved.
886    * @return Status
887    */
888   Status object_type(const URI& uri, ObjectType* type) const;
889 
890   /** Submits a query for (sync) execution. */
891   Status query_submit(Query* query);
892 
893   /**
894    * Submits a query for async execution.
895    *
896    * @param query The query to submit.
897    * @return Status
898    */
899   Status query_submit_async(Query* query);
900 
901   /**
902    * Reads from the cache into the input buffer. `uri` and `offset` collectively
903    * form the key of the cached object to be read. Essentially, this is used
904    * to read potentially cached tiles. `uri` is the URI of the attribute the
905    * tile belongs to, and `offset` is the offset in the attribute file where
906    * the tile is located. Observe that the `uri`, `offset` pair is unique.
907    *
908    * @param uri The URI of the cached object.
909    * @param offset The offset of the cached object.
910    * @param buffer The buffer to write into. The function reallocates memory
911    *     for the buffer, sets its size to *nbytes* and resets its offset.
912    * @param nbytes Number of bytes to be read.
913    * @param in_cache This is set to `true` if the object is in the cache,
914    *     and `false` otherwise.
915    * @return Status.
916    */
917   Status read_from_cache(
918       const URI& uri,
919       uint64_t offset,
920       Buffer* buffer,
921       uint64_t nbytes,
922       bool* in_cache) const;
923 
924   /**
925    * Reads from a file into the input buffer.
926    *
927    * @param uri The URI file to read from.
928    * @param offset The offset in the file the read will start from.
929    * @param buffer The buffer to write into. The function reallocates memory
930    *     for the buffer, sets its size to *nbytes* and resets its offset.
931    * @param nbytes The number of bytes to read.
932    * @return Status.
933    */
934   Status read(
935       const URI& uri, uint64_t offset, Buffer* buffer, uint64_t nbytes) const;
936 
937   /**
938    * Reads from a file into the raw input buffer.
939    *
940    * @param uri The URI file to read from.
941    * @param offset The offset in the file the read will start from.
942    * @param buffer The buffer to write into.
943    * @param nbytes The number of bytes to read.
944    * @return Status.
945    */
946   Status read(
947       const URI& uri, uint64_t offset, void* buffer, uint64_t nbytes) const;
948 
949   /**
950    * Sets a string/string KV "tag" on the storage manager instance.
951    *
952    * This is currently only meant for internal TileDB Inc. usage.
953    *
954    * @param key Tag key
955    * @param value Tag value
956    * @return Status
957    */
958   Status set_tag(const std::string& key, const std::string& value);
959 
960   /**
961    * Stores an array schema into persistent storage.
962    *
963    * @param array_schema The array metadata to be stored.
964    * @param encryption_key The encryption key to use.
965    * @return Status
966    */
967   Status store_array_schema(
968       ArraySchema* array_schema, const EncryptionKey& encryption_key);
969 
970   /**
971    * Stores the array metadata into persistent storage.
972    *
973    * @param array_uri The array URI.
974    * @param encryption_key The encryption key to use.
975    * @param array_metadata The array metadata.
976    * @return Status
977    */
978   Status store_array_metadata(
979       const URI& array_uri,
980       const EncryptionKey& encryption_key,
981       Metadata* array_metadata);
982 
983   /** Closes a file, flushing its contents to persistent storage. */
984   Status close_file(const URI& uri);
985 
986   /** Syncs a file or directory, flushing its contents to persistent storage. */
987   Status sync(const URI& uri);
988 
989   /** Returns the virtual filesystem object. */
990   VFS* vfs() const;
991 
992   /**
993    * Writes the contents of a buffer into the cache. `uri` and `offset`
994    * collectively form the key of the object to be cached. Essentially, this is
995    * used to cach tiles. `uri` is the URI of the attribute the
996    * tile belongs to, and `offset` is the offset in the attribute file where
997    * the tile is located. Observe that the `uri`, `offset` pair is unique.
998    *
999    * @param uri The URI of the cached object.
1000    * @param offset The offset of the cached object.
1001    * @param buffer The buffer whose contents will be cached.
1002    * @return Status.
1003    */
1004   Status write_to_cache(const URI& uri, uint64_t offset, Buffer* buffer) const;
1005 
1006   /**
1007    * Writes the contents of a buffer into a URI file.
1008    *
1009    * @param uri The file to write into.
1010    * @param buffer The buffer to write.
1011    * @return Status.
1012    */
1013   Status write(const URI& uri, Buffer* buffer) const;
1014 
1015   /**
1016    * Writes the input data into a URI file.
1017    *
1018    * @param uri The file to write into.
1019    * @param data The data to write.
1020    * @param size The data size in bytes.
1021    * @return Status.
1022    */
1023   Status write(const URI& uri, void* data, uint64_t size) const;
1024 
1025   /** Returns `stats_`. */
1026   stats::Stats* stats();
1027 
1028   /** Returns the internal logger object. */
1029   tdb_shared_ptr<Logger> logger() const;
1030 
1031  private:
1032   /* ********************************* */
1033   /*        PRIVATE DATATYPES          */
1034   /* ********************************* */
1035 
1036   /**
1037    * Helper RAII struct that increments 'queries_in_progress' in the constructor
1038    * and decrements in the destructor, on the given StorageManager instance.
1039    *
1040    * This ensures that the counter is decremented even in the case of
1041    * exceptions.
1042    */
1043   struct QueryInProgress {
1044     /** The StorageManager instance. */
1045     StorageManager* sm;
1046 
1047     /** Constructor. Calls increment_in_progress() on given StorageManager. */
QueryInProgressQueryInProgress1048     QueryInProgress(StorageManager* sm)
1049         : sm(sm) {
1050       sm->increment_in_progress();
1051     }
1052 
1053     /** Destructor. Calls decrement_in_progress() on given StorageManager. */
~QueryInProgressQueryInProgress1054     ~QueryInProgress() {
1055       sm->decrement_in_progress();
1056     }
1057   };
1058 
1059   /* ********************************* */
1060   /*        PRIVATE ATTRIBUTES         */
1061   /* ********************************* */
1062 
  /** The class stats (returned by `stats()`). */
  stats::Stats* stats_;

  /** The class logger. */
  tdb_shared_ptr<Logger> logger_;

  /** Set to true while tasks are being cancelled. */
  bool cancellation_in_progress_;

  /** Mutex protecting `cancellation_in_progress_`. */
  std::mutex cancellation_in_progress_mtx_;

  /**
   * The condition variable for exclusively locking arrays. This is used
   * to wait for an array to be closed, before being exclusively locked
   * by `array_xlock`.
   */
  std::condition_variable xlock_cv_;

  /** Mutex for providing thread-safety upon creating TileDB objects. */
  std::mutex object_create_mtx_;

  /** Stores the TileDB configuration parameters. */
  Config config_;

  /**
   * Stores exclusive filelocks for arrays.
   * NOTE(review): the string key is presumably the array URI -- confirm
   * against the implementation.
   */
  std::unordered_map<std::string, filelock_t> xfilelocks_;

  /** Mutex for managing OpenArray objects for reads. */
  std::mutex open_array_for_reads_mtx_;

  /** Mutex for managing OpenArray objects for writes. */
  std::mutex open_array_for_writes_mtx_;

  /** Mutex for managing exclusive locks. */
  std::mutex xlock_mtx_;

  /**
   * Stores the currently open arrays for reads.
   * NOTE(review): the string key is presumably the array URI -- confirm
   * against the implementation.
   */
  std::map<std::string, OpenArray*> open_arrays_for_reads_;

  /**
   * Stores the currently open arrays for writes.
   * NOTE(review): the string key is presumably the array URI -- confirm
   * against the implementation.
   */
  std::map<std::string, OpenArray*> open_arrays_for_writes_;

  /** Count of the number of queries currently in progress. */
  uint64_t queries_in_progress_;

  /** Guards the `queries_in_progress_` counter. */
  std::mutex queries_in_progress_mtx_;

  /**
   * Condition variable associated with the `queries_in_progress_` counter.
   * NOTE(review): presumably what `wait_for_zero_in_progress` blocks on
   * until the counter drops to zero -- confirm against the implementation.
   */
  std::condition_variable queries_in_progress_cv_;

  /** The thread pool for compute-bound tasks. */
  ThreadPool* const compute_tp_;

  /** The thread pool for io-bound tasks. */
  ThreadPool* const io_tp_;

  /**
   * Tracks all scheduled tasks that can be safely cancelled before
   * execution.
   */
  CancelableTasks cancelable_tasks_;

  /** Tags for the context object (key/value string pairs). */
  std::unordered_map<std::string, std::string> tags_;

  /** A tile cache. */
  tdb_unique_ptr<BufferLRUCache> tile_cache_;

  /**
   * Virtual filesystem handler. It directs queries to the appropriate
   * filesystem backend. Note that this is stateful.
   */
  VFS* vfs_;

  /** The rest client (may be null if none was configured). */
  tdb_unique_ptr<RestClient> rest_client_;
1139 
1140   /* ********************************* */
1141   /*         PRIVATE METHODS           */
1142   /* ********************************* */
1143 
1144   /**
1145    * This is an auxiliary function to the other `array_open*` functions.
1146    * It opens the array, retrieves an `OpenArray` instance, acquires
1147    * its mutex and increases its counter. The array schema of the array
1148    * is loaded, but not any fragment metadata at this point.
1149    *
1150    * @param array_uri The array URI.
1151    * @param encryption_key The encryption key.
1152    * @param open_array The `OpenArray` instance retrieved after opening
1153    *      the array. Note that its mutex will be locked and its counter
1154    *      will have been incremented when the function returns.
1155    * @return Status
1156    */
1157   Status array_open_without_fragments(
1158       const URI& array_uri,
1159       const EncryptionKey& encryption_key,
1160       OpenArray** open_array);
1161 
1162   /** Decrement the count of in-progress queries. */
1163   void decrement_in_progress();
1164 
1165   /** Retrieves all the array metadata URI's of an array. */
1166   Status get_array_metadata_uris(
1167       const URI& array_uri, std::vector<URI>* array_metadata_uris) const;
1168 
1169   /** Increment the count of in-progress queries. */
1170   void increment_in_progress();
1171 
1172   /**
1173    * Loads the array schema into an open array.
1174    *
1175    * @param array_uri The array URI.
1176    * @param open_array The open array object.
1177    * @param encryption_key The encryption key to use.
1178    * @return Status
1179    */
1180   Status load_array_schema(
1181       const URI& array_uri,
1182       OpenArray* open_array,
1183       const EncryptionKey& encryption_key);
1184 
1185   /**
1186    * Loads the array metadata whose URIs are specified in the input.
1187    * The array metadata are cached in binary form in `open_array`.
1188    * Only the metadata that have not yet been loaded will be fetched
1189    * into the open array object. Eventually, the function will
1190    * deserialize the appropriate array metadata into `metadata`.
1191    *
1192    * @param open_array The open array object.
1193    * @param encryption_key The encryption key to use.
1194    * @param array_metadata_to_load The timestamped URIs of the array
1195    *     metadata to load.
1196    * @param metadata The array metadata to be retrieved.
1197    * @return Status
1198    */
1199   Status load_array_metadata(
1200       OpenArray* open_array,
1201       const EncryptionKey& encryption_key,
1202       const std::vector<TimestampedURI>& array_metadata_to_load,
1203       Metadata* metadata);
1204 
1205   /**
1206    * Loads the fragment metadata of an open array given a vector of
1207    * fragment URIs `fragments_to_load`. If the fragment metadata
1208    * are not already loaded into the array, the function loads them.
1209    * The function stores the fragment metadata of each fragment
1210    * in `fragments_to_load` into vector `fragment_metadata`, such
1211    * that there is a one-to-one correspondence between the two vectors.
1212    *
1213    * @param open_array The open array object.
1214    * @param encryption_key The encryption key to use.
1215    * @param fragments_to_load The fragments whose metadata to load.
1216    * @param meta_buff A buffer that contains the consolidated fragment
1217    *     metadata.
1218    * @param offsets A map from a fragment name to an offset in `meta_buff`
1219    *     where the basic metadata can be found. If the offset cannot be
1220    *     found, then the metadata of that fragment will be loaded from
1221    *     storage instead.
1222    * @param fragment_metadata The fragment metadata retrieved in a
1223    *     vector.
1224    * @return Status
1225    */
1226   Status load_fragment_metadata(
1227       OpenArray* open_array,
1228       const EncryptionKey& encryption_key,
1229       const std::vector<TimestampedURI>& fragments_to_load,
1230       Buffer* meta_buff,
1231       const std::unordered_map<std::string, uint64_t>& offsets,
1232       std::vector<tdb_shared_ptr<FragmentMetadata>>* fragment_metadata);
1233 
1234   /**
1235    * Loads the latest consolidated fragment metadata from storage.
1236    *
1237    * @param uri The URI of the consolidated fragment metadata.
1238    * @param enc_key The encryption key that may be needed to access the file.
1239    * @param f_buff The buffer to hold the consolidated fragment metadata.
1240    * @param offsets A map from the fragment name to the offset in `f_buff` where
1241    *     the basic fragment metadata starts.
1242    * @return Status
1243    */
1244   Status load_consolidated_fragment_meta(
1245       const URI& uri,
1246       const EncryptionKey& enc_key,
1247       Buffer* f_buff,
1248       std::unordered_map<std::string, uint64_t>* offsets);
1249 
1250   /**
1251    * Retrieves the URI of the latest consolidated fragment metadata,
1252    * among the URIs in `uris`.
1253    */
1254   Status get_consolidated_fragment_meta_uri(
1255       const std::vector<URI>& uris, URI* meta_uri) const;
1256 
1257   /**
1258    * Applicable to fragment and array metadata URIs.
1259    *
1260    * Gets the sorted URIs in ascending first timestamp order,
1261    * breaking ties with lexicographic
1262    * sorting of UUID. Only the URIs with timestamp between `timestamp_start`
1263    * and `timestamp_end` (inclusive) are considered. The sorted URIs are
1264    * stored in the last input, including their timestamps.
1265    */
1266   Status get_sorted_uris(
1267       const std::vector<URI>& uris,
1268       std::vector<TimestampedURI>* sorted_uris,
1269       uint64_t timestamp_start,
1270       uint64_t timestamp_end) const;
1271 
1272   /**
1273    * It computes the URIs `to_vacuum` from the input `uris`, considering
1274    * only the URIs whose first timestamp is greater than or equal to
1275    * `timestamp_start` and second timestamp is smaller than or equal to
1276    * `timestamp_end`. The function also retrieves the `vac_uris` (files with
1277    * `.vac` suffix) that were used to compute `to_vacuum`.
1278    */
1279   Status get_uris_to_vacuum(
1280       const std::vector<URI>& uris,
1281       uint64_t timestamp_start,
1282       uint64_t timestamp_end,
1283       std::vector<URI>* to_vacuum,
1284       std::vector<URI>* vac_uris,
1285       bool allow_partial = true) const;
1286 
1287   /** Block until there are zero in-progress queries. */
1288   void wait_for_zero_in_progress();
1289 
1290   /** Initializes a REST client, if one was configured. */
1291   Status init_rest_client();
1292 
1293   /** Sets default tag values on this StorageManager. */
1294   Status set_default_tags();
1295 };
1296 
1297 }  // namespace sm
1298 }  // namespace tiledb
1299 
1300 #endif  // TILEDB_STORAGE_MANAGER_H
1301