1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9 //
10 // Thread-safe (provides internal synchronization)
11 
12 #pragma once
13 #include <string>
14 #include <vector>
15 #include <stdint.h>
16 
17 #include "db/dbformat.h"
18 #include "db/range_del_aggregator.h"
19 #include "options/cf_options.h"
20 #include "port/port.h"
21 #include "rocksdb/cache.h"
22 #include "rocksdb/env.h"
23 #include "rocksdb/options.h"
24 #include "rocksdb/table.h"
25 #include "table/table_reader.h"
26 #include "trace_replay/block_cache_tracer.h"
27 
28 namespace ROCKSDB_NAMESPACE {
29 
// Forward declarations (alphabetical). Only the names are introduced here;
// the full definitions live in other headers.
class Arena;
class Env;
struct FileDescriptor;
class GetContext;
class HistogramImpl;
35 
36 // Manages caching for TableReader objects for a column family. The actual
37 // cache is allocated separately and passed to the constructor. TableCache
38 // wraps around the underlying SST file readers by providing Get(),
39 // MultiGet() and NewIterator() methods that hide the instantiation,
40 // caching and access to the TableReader. The main purpose of this is
41 // performance - by caching the TableReader, it avoids unnecessary file opens
42 // and object allocation and instantiation. One exception is compaction, where
43 // a new TableReader may be instantiated - see NewIterator() comments
44 //
45 // Another service provided by TableCache is managing the row cache - if the
46 // DB is configured with a row cache, and the lookup key is present in the row
47 // cache, lookup is very fast. The row cache is obtained from
48 // ioptions.row_cache
49 class TableCache {
50  public:
51   TableCache(const ImmutableCFOptions& ioptions,
52              const FileOptions& storage_options, Cache* cache,
53              BlockCacheTracer* const block_cache_tracer);
54   ~TableCache();
55 
56   // Return an iterator for the specified file number (the corresponding
57   // file length must be exactly "file_size" bytes).  If "table_reader_ptr"
58   // is non-nullptr, also sets "*table_reader_ptr" to point to the Table object
59   // underlying the returned iterator, or nullptr if no Table object underlies
60   // the returned iterator.  The returned "*table_reader_ptr" object is owned
61   // by the cache and should not be deleted, and is valid for as long as the
62   // returned iterator is live.
63   // @param range_del_agg If non-nullptr, adds range deletions to the
64   //    aggregator. If an error occurs, returns it in a NewErrorInternalIterator
65   // @param for_compaction If true, a new TableReader may be allocated (but
66   //                       not cached), depending on the CF options
67   // @param skip_filters Disables loading/accessing the filter block
68   // @param level The level this table is at, -1 for "not set / don't know"
69   InternalIterator* NewIterator(
70       const ReadOptions& options, const FileOptions& toptions,
71       const InternalKeyComparator& internal_comparator,
72       const FileMetaData& file_meta, RangeDelAggregator* range_del_agg,
73       const SliceTransform* prefix_extractor, TableReader** table_reader_ptr,
74       HistogramImpl* file_read_hist, TableReaderCaller caller, Arena* arena,
75       bool skip_filters, int level, const InternalKey* smallest_compaction_key,
76       const InternalKey* largest_compaction_key);
77 
78   // If a seek to internal key "k" in specified file finds an entry,
79   // call get_context->SaveValue() repeatedly until
80   // it returns false. As a side effect, it will insert the TableReader
81   // into the cache and potentially evict another entry
82   // @param get_context Context for get operation. The result of the lookup
83   //                    can be retrieved by calling get_context->State()
84   // @param file_read_hist If non-nullptr, the file reader statistics are
85   //                       recorded
86   // @param skip_filters Disables loading/accessing the filter block
87   // @param level The level this table is at, -1 for "not set / don't know"
88   Status Get(const ReadOptions& options,
89              const InternalKeyComparator& internal_comparator,
90              const FileMetaData& file_meta, const Slice& k,
91              GetContext* get_context,
92              const SliceTransform* prefix_extractor = nullptr,
93              HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
94              int level = -1);
95 
96   // Return the range delete tombstone iterator of the file specified by
97   // `file_meta`.
98   Status GetRangeTombstoneIterator(
99       const ReadOptions& options,
100       const InternalKeyComparator& internal_comparator,
101       const FileMetaData& file_meta,
102       std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter);
103 
104   // If a seek to internal key "k" in specified file finds an entry,
105   // call get_context->SaveValue() repeatedly until
106   // it returns false. As a side effect, it will insert the TableReader
107   // into the cache and potentially evict another entry
108   // @param mget_range Pointer to the structure describing a batch of keys to
109   //                   be looked up in this table file. The result is stored
110   //                   in the embedded GetContext
111   // @param skip_filters Disables loading/accessing the filter block
112   // @param level The level this table is at, -1 for "not set / don't know"
113   Status MultiGet(const ReadOptions& options,
114                   const InternalKeyComparator& internal_comparator,
115                   const FileMetaData& file_meta,
116                   const MultiGetContext::Range* mget_range,
117                   const SliceTransform* prefix_extractor = nullptr,
118                   HistogramImpl* file_read_hist = nullptr,
119                   bool skip_filters = false, int level = -1);
120 
121   // Evict any entry for the specified file number
122   static void Evict(Cache* cache, uint64_t file_number);
123 
124   // Clean table handle and erase it from the table cache
125   // Used in DB close, or the file is not live anymore.
126   void EraseHandle(const FileDescriptor& fd, Cache::Handle* handle);
127 
128   // Find table reader
129   // @param skip_filters Disables loading/accessing the filter block
130   // @param level == -1 means not specified
131   Status FindTable(const FileOptions& toptions,
132                    const InternalKeyComparator& internal_comparator,
133                    const FileDescriptor& file_fd, Cache::Handle**,
134                    const SliceTransform* prefix_extractor = nullptr,
135                    const bool no_io = false, bool record_read_stats = true,
136                    HistogramImpl* file_read_hist = nullptr,
137                    bool skip_filters = false, int level = -1,
138                    bool prefetch_index_and_filter_in_cache = true);
139 
140   // Get TableReader from a cache handle.
141   TableReader* GetTableReaderFromHandle(Cache::Handle* handle);
142 
143   // Get the table properties of a given table.
144   // @no_io: indicates if we should load table to the cache if it is not present
145   //         in table cache yet.
146   // @returns: `properties` will be reset on success. Please note that we will
147   //            return Status::Incomplete() if table is not present in cache and
148   //            we set `no_io` to be true.
149   Status GetTableProperties(const FileOptions& toptions,
150                             const InternalKeyComparator& internal_comparator,
151                             const FileDescriptor& file_meta,
152                             std::shared_ptr<const TableProperties>* properties,
153                             const SliceTransform* prefix_extractor = nullptr,
154                             bool no_io = false);
155 
156   // Return total memory usage of the table reader of the file.
157   // 0 if table reader of the file is not loaded.
158   size_t GetMemoryUsageByTableReader(
159       const FileOptions& toptions,
160       const InternalKeyComparator& internal_comparator,
161       const FileDescriptor& fd,
162       const SliceTransform* prefix_extractor = nullptr);
163 
164   // Returns approximated offset of a key in a file represented by fd.
165   uint64_t ApproximateOffsetOf(
166       const Slice& key, const FileDescriptor& fd, TableReaderCaller caller,
167       const InternalKeyComparator& internal_comparator,
168       const SliceTransform* prefix_extractor = nullptr);
169 
170   // Returns approximated data size between start and end keys in a file
171   // represented by fd (the start key must not be greater than the end key).
172   uint64_t ApproximateSize(const Slice& start, const Slice& end,
173                            const FileDescriptor& fd, TableReaderCaller caller,
174                            const InternalKeyComparator& internal_comparator,
175                            const SliceTransform* prefix_extractor = nullptr);
176 
177   // Release the handle from a cache
178   void ReleaseHandle(Cache::Handle* handle);
179 
get_cache()180   Cache* get_cache() const { return cache_; }
181 
182   // Capacity of the backing Cache that indicates inifinite TableCache capacity.
183   // For example when max_open_files is -1 we set the backing Cache to this.
184   static const int kInfiniteCapacity = 0x400000;
185 
186   // The tables opened with this TableCache will be immortal, i.e., their
187   // lifetime is as long as that of the DB.
SetTablesAreImmortal()188   void SetTablesAreImmortal() {
189     if (cache_->GetCapacity() >= kInfiniteCapacity) {
190       immortal_tables_ = true;
191     }
192   }
193 
194  private:
195   // Build a table reader
196   Status GetTableReader(const FileOptions& file_options,
197                         const InternalKeyComparator& internal_comparator,
198                         const FileDescriptor& fd, bool sequential_mode,
199                         bool record_read_stats, HistogramImpl* file_read_hist,
200                         std::unique_ptr<TableReader>* table_reader,
201                         const SliceTransform* prefix_extractor = nullptr,
202                         bool skip_filters = false, int level = -1,
203                         bool prefetch_index_and_filter_in_cache = true);
204 
205   // Create a key prefix for looking up the row cache. The prefix is of the
206   // format row_cache_id + fd_number + seq_no. Later, the user key can be
207   // appended to form the full key
208   void CreateRowCacheKeyPrefix(const ReadOptions& options,
209                                const FileDescriptor& fd,
210                                const Slice& internal_key,
211                                GetContext* get_context, IterKey& row_cache_key);
212 
213   // Helper function to lookup the row cache for a key. It appends the
214   // user key to row_cache_key at offset prefix_size
215   bool GetFromRowCache(const Slice& user_key, IterKey& row_cache_key,
216                        size_t prefix_size, GetContext* get_context);
217 
218   const ImmutableCFOptions& ioptions_;
219   const FileOptions& file_options_;
220   Cache* const cache_;
221   std::string row_cache_id_;
222   bool immortal_tables_;
223   BlockCacheTracer* const block_cache_tracer_;
224 };
225 
226 }  // namespace ROCKSDB_NAMESPACE
227