//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
5 
6 #pragma once
7 
#include <atomic>
#include <cstdint>
#include <fstream>
#include <memory>
#include <string>
#include <utility>

#include "monitoring/instrumented_mutex.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/trace_reader_writer.h"
#include "table/table_reader_caller.h"
#include "trace_replay/trace_replay.h"
17 
18 namespace ROCKSDB_NAMESPACE {
19 
// Time-unit constants shared by the block cache tracing code. Only the
// declarations are visible in this header; the definitions (and therefore the
// actual values) live in a source file.
extern const uint64_t kMicrosInSecond;
extern const uint64_t kSecondInMinute;
extern const uint64_t kSecondInHour;

// Forward declaration so that BlockCacheTraceHelper can take trace records by
// reference before the full definition of the struct.
struct BlockCacheTraceRecord;
25 
26 class BlockCacheTraceHelper {
27  public:
28   static bool IsGetOrMultiGetOnDataBlock(TraceType block_type,
29                                          TableReaderCaller caller);
30   static bool IsGetOrMultiGet(TableReaderCaller caller);
31   static bool IsUserAccess(TableReaderCaller caller);
32   // Row key is a concatenation of the access's fd_number and the referenced
33   // user key.
34   static std::string ComputeRowKey(const BlockCacheTraceRecord& access);
35   // The first four bytes of the referenced key in a Get request is the table
36   // id.
37   static uint64_t GetTableId(const BlockCacheTraceRecord& access);
38   // The sequence number of a get request is the last part of the referenced
39   // key.
40   static uint64_t GetSequenceNumber(const BlockCacheTraceRecord& access);
41   // Block offset in a file is the last varint64 in the block key.
42   static uint64_t GetBlockOffsetInFile(const BlockCacheTraceRecord& access);
43 
44   static const std::string kUnknownColumnFamilyName;
45   static const uint64_t kReservedGetId;
46 };
47 
48 // Lookup context for tracing block cache accesses.
49 // We trace block accesses at five places:
50 // 1. BlockBasedTable::GetFilter
51 // 2. BlockBasedTable::GetUncompressedDict.
52 // 3. BlockBasedTable::MaybeReadAndLoadToCache. (To trace access on data, index,
53 // and range deletion block.)
54 // 4. BlockBasedTable::Get. (To trace the referenced key and whether the
55 // referenced key exists in a fetched data block.)
56 // 5. BlockBasedTable::MultiGet. (To trace the referenced key and whether the
57 // referenced key exists in a fetched data block.)
58 // The context is created at:
59 // 1. BlockBasedTable::Get. (kUserGet)
60 // 2. BlockBasedTable::MultiGet. (kUserMGet)
61 // 3. BlockBasedTable::NewIterator. (either kUserIterator, kCompaction, or
62 // external SST ingestion calls this function.)
63 // 4. BlockBasedTable::Open. (kPrefetch)
64 // 5. Index/Filter::CacheDependencies. (kPrefetch)
65 // 6. BlockBasedTable::ApproximateOffsetOf. (kCompaction or
66 // kUserApproximateSize).
67 struct BlockCacheLookupContext {
BlockCacheLookupContextBlockCacheLookupContext68   BlockCacheLookupContext(const TableReaderCaller& _caller) : caller(_caller) {}
BlockCacheLookupContextBlockCacheLookupContext69   BlockCacheLookupContext(const TableReaderCaller& _caller, uint64_t _get_id,
70                           bool _get_from_user_specified_snapshot)
71       : caller(_caller),
72         get_id(_get_id),
73         get_from_user_specified_snapshot(_get_from_user_specified_snapshot) {}
74   const TableReaderCaller caller;
75   // These are populated when we perform lookup/insert on block cache. The block
76   // cache tracer uses these inforation when logging the block access at
77   // BlockBasedTable::GET and BlockBasedTable::MultiGet.
78   bool is_cache_hit = false;
79   bool no_insert = false;
80   TraceType block_type = TraceType::kTraceMax;
81   uint64_t block_size = 0;
82   std::string block_key;
83   uint64_t num_keys_in_block = 0;
84   // The unique id associated with Get and MultiGet. This enables us to track
85   // how many blocks a Get/MultiGet request accesses. We can also measure the
86   // impact of row cache vs block cache.
87   uint64_t get_id = 0;
88   std::string referenced_key;
89   bool get_from_user_specified_snapshot = false;
90 
FillLookupContextBlockCacheLookupContext91   void FillLookupContext(bool _is_cache_hit, bool _no_insert,
92                          TraceType _block_type, uint64_t _block_size,
93                          const std::string& _block_key,
94                          uint64_t _num_keys_in_block) {
95     is_cache_hit = _is_cache_hit;
96     no_insert = _no_insert;
97     block_type = _block_type;
98     block_size = _block_size;
99     block_key = _block_key;
100     num_keys_in_block = _num_keys_in_block;
101   }
102 };
103 
// Boolean with a fixed one-byte (char) underlying type. BlockCacheTraceRecord
// stores its flags with this type instead of `bool`.
enum Boolean : char { kTrue = 1, kFalse = 0 };
105 
106 struct BlockCacheTraceRecord {
107   // Required fields for all accesses.
108   uint64_t access_timestamp = 0;
109   std::string block_key;
110   TraceType block_type = TraceType::kTraceMax;
111   uint64_t block_size = 0;
112   uint64_t cf_id = 0;
113   std::string cf_name;
114   uint32_t level = 0;
115   uint64_t sst_fd_number = 0;
116   TableReaderCaller caller = TableReaderCaller::kMaxBlockCacheLookupCaller;
117   Boolean is_cache_hit = Boolean::kFalse;
118   Boolean no_insert = Boolean::kFalse;
119   // Required field for Get and MultiGet
120   uint64_t get_id = BlockCacheTraceHelper::kReservedGetId;
121   Boolean get_from_user_specified_snapshot = Boolean::kFalse;
122   std::string referenced_key;
123   // Required fields for data block and user Get/Multi-Get only.
124   uint64_t referenced_data_size = 0;
125   uint64_t num_keys_in_block = 0;
126   Boolean referenced_key_exist_in_block = Boolean::kFalse;
127 
BlockCacheTraceRecordBlockCacheTraceRecord128   BlockCacheTraceRecord() {}
129 
130   BlockCacheTraceRecord(
131       uint64_t _access_timestamp, std::string _block_key, TraceType _block_type,
132       uint64_t _block_size, uint64_t _cf_id, std::string _cf_name,
133       uint32_t _level, uint64_t _sst_fd_number, TableReaderCaller _caller,
134       bool _is_cache_hit, bool _no_insert,
135       uint64_t _get_id = BlockCacheTraceHelper::kReservedGetId,
136       bool _get_from_user_specified_snapshot = false,
137       std::string _referenced_key = "", uint64_t _referenced_data_size = 0,
138       uint64_t _num_keys_in_block = 0,
139       bool _referenced_key_exist_in_block = false)
access_timestampBlockCacheTraceRecord140       : access_timestamp(_access_timestamp),
141         block_key(_block_key),
142         block_type(_block_type),
143         block_size(_block_size),
144         cf_id(_cf_id),
145         cf_name(_cf_name),
146         level(_level),
147         sst_fd_number(_sst_fd_number),
148         caller(_caller),
149         is_cache_hit(_is_cache_hit ? Boolean::kTrue : Boolean::kFalse),
150         no_insert(_no_insert ? Boolean::kTrue : Boolean::kFalse),
151         get_id(_get_id),
152         get_from_user_specified_snapshot(_get_from_user_specified_snapshot
153                                              ? Boolean::kTrue
154                                              : Boolean::kFalse),
155         referenced_key(_referenced_key),
156         referenced_data_size(_referenced_data_size),
157         num_keys_in_block(_num_keys_in_block),
158         referenced_key_exist_in_block(
159             _referenced_key_exist_in_block ? Boolean::kTrue : Boolean::kFalse) {
160   }
161 };
162 
// Metadata stored at the beginning of a block cache trace. Filled in by the
// trace readers' ReadHeader(). All fields default to zero so that a header
// whose read failed never exposes indeterminate values.
struct BlockCacheTraceHeader {
  // Start time of the trace. NOTE(review): unit not visible in this header;
  // presumably microseconds -- confirm against the writer.
  uint64_t start_time = 0;
  // RocksDB version recorded in the trace.
  uint32_t rocksdb_major_version = 0;
  uint32_t rocksdb_minor_version = 0;
};
168 
169 // BlockCacheTraceWriter captures all RocksDB block cache accesses using a
170 // user-provided TraceWriter. Every RocksDB operation is written as a single
171 // trace. Each trace will have a timestamp and type, followed by the trace
172 // payload.
173 class BlockCacheTraceWriter {
174  public:
175   BlockCacheTraceWriter(Env* env, const TraceOptions& trace_options,
176                         std::unique_ptr<TraceWriter>&& trace_writer);
177   ~BlockCacheTraceWriter() = default;
178   // No copy and move.
179   BlockCacheTraceWriter(const BlockCacheTraceWriter&) = delete;
180   BlockCacheTraceWriter& operator=(const BlockCacheTraceWriter&) = delete;
181   BlockCacheTraceWriter(BlockCacheTraceWriter&&) = delete;
182   BlockCacheTraceWriter& operator=(BlockCacheTraceWriter&&) = delete;
183 
184   // Pass Slice references to avoid copy.
185   Status WriteBlockAccess(const BlockCacheTraceRecord& record,
186                           const Slice& block_key, const Slice& cf_name,
187                           const Slice& referenced_key);
188 
189   // Write a trace header at the beginning, typically on initiating a trace,
190   // with some metadata like a magic number and RocksDB version.
191   Status WriteHeader();
192 
193  private:
194   Env* env_;
195   TraceOptions trace_options_;
196   std::unique_ptr<TraceWriter> trace_writer_;
197 };
198 
199 // Write a trace record in human readable format, see
200 // https://github.com/facebook/rocksdb/wiki/Block-cache-analysis-and-simulation-tools#trace-format
201 // for details.
202 class BlockCacheHumanReadableTraceWriter {
203  public:
204   ~BlockCacheHumanReadableTraceWriter();
205 
206   Status NewWritableFile(const std::string& human_readable_trace_file_path,
207                          ROCKSDB_NAMESPACE::Env* env);
208 
209   Status WriteHumanReadableTraceRecord(const BlockCacheTraceRecord& access,
210                                        uint64_t block_id, uint64_t get_key_id);
211 
212  private:
213   char trace_record_buffer_[1024 * 1024];
214   std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile>
215       human_readable_trace_file_writer_;
216 };
217 
218 // BlockCacheTraceReader helps read the trace file generated by
219 // BlockCacheTraceWriter using a user provided TraceReader.
220 class BlockCacheTraceReader {
221  public:
222   BlockCacheTraceReader(std::unique_ptr<TraceReader>&& reader);
223   ~BlockCacheTraceReader() = default;
224   // No copy and move.
225   BlockCacheTraceReader(const BlockCacheTraceReader&) = delete;
226   BlockCacheTraceReader& operator=(const BlockCacheTraceReader&) = delete;
227   BlockCacheTraceReader(BlockCacheTraceReader&&) = delete;
228   BlockCacheTraceReader& operator=(BlockCacheTraceReader&&) = delete;
229 
230   Status ReadHeader(BlockCacheTraceHeader* header);
231 
232   Status ReadAccess(BlockCacheTraceRecord* record);
233 
234  private:
235   std::unique_ptr<TraceReader> trace_reader_;
236 };
237 
238 // Read a trace record in human readable format, see
239 // https://github.com/facebook/rocksdb/wiki/Block-cache-analysis-and-simulation-tools#trace-format
240 // for detailed.
241 class BlockCacheHumanReadableTraceReader : public BlockCacheTraceReader {
242  public:
243   BlockCacheHumanReadableTraceReader(const std::string& trace_file_path);
244 
245   ~BlockCacheHumanReadableTraceReader();
246 
247   Status ReadHeader(BlockCacheTraceHeader* header);
248 
249   Status ReadAccess(BlockCacheTraceRecord* record);
250 
251  private:
252   std::ifstream human_readable_trace_reader_;
253 };
254 
255 // A block cache tracer. It downsamples the accesses according to
256 // trace_options and uses BlockCacheTraceWriter to write the access record to
257 // the trace file.
258 class BlockCacheTracer {
259  public:
260   BlockCacheTracer();
261   ~BlockCacheTracer();
262   // No copy and move.
263   BlockCacheTracer(const BlockCacheTracer&) = delete;
264   BlockCacheTracer& operator=(const BlockCacheTracer&) = delete;
265   BlockCacheTracer(BlockCacheTracer&&) = delete;
266   BlockCacheTracer& operator=(BlockCacheTracer&&) = delete;
267 
268   // Start writing block cache accesses to the trace_writer.
269   Status StartTrace(Env* env, const TraceOptions& trace_options,
270                     std::unique_ptr<TraceWriter>&& trace_writer);
271 
272   // Stop writing block cache accesses to the trace_writer.
273   void EndTrace();
274 
is_tracing_enabled()275   bool is_tracing_enabled() const {
276     return writer_.load(std::memory_order_relaxed);
277   }
278 
279   Status WriteBlockAccess(const BlockCacheTraceRecord& record,
280                           const Slice& block_key, const Slice& cf_name,
281                           const Slice& referenced_key);
282 
283   // GetId cycles from 1 to port::kMaxUint64.
284   uint64_t NextGetId();
285 
286  private:
287   TraceOptions trace_options_;
288   // A mutex protects the writer_.
289   InstrumentedMutex trace_writer_mutex_;
290   std::atomic<BlockCacheTraceWriter*> writer_;
291   std::atomic<uint64_t> get_id_counter_;
292 };
293 
294 }  // namespace ROCKSDB_NAMESPACE
295