1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 2 // This source code is licensed under both the GPLv2 (found in the 3 // COPYING file in the root directory) and Apache 2.0 License 4 // (found in the LICENSE.Apache file in the root directory). 5 6 #pragma once 7 8 #include <atomic> 9 #include <fstream> 10 11 #include "monitoring/instrumented_mutex.h" 12 #include "rocksdb/env.h" 13 #include "rocksdb/options.h" 14 #include "rocksdb/trace_reader_writer.h" 15 #include "table/table_reader_caller.h" 16 #include "trace_replay/trace_replay.h" 17 18 namespace ROCKSDB_NAMESPACE { 19 20 extern const uint64_t kMicrosInSecond; 21 extern const uint64_t kSecondInMinute; 22 extern const uint64_t kSecondInHour; 23 24 struct BlockCacheTraceRecord; 25 26 class BlockCacheTraceHelper { 27 public: 28 static bool IsGetOrMultiGetOnDataBlock(TraceType block_type, 29 TableReaderCaller caller); 30 static bool IsGetOrMultiGet(TableReaderCaller caller); 31 static bool IsUserAccess(TableReaderCaller caller); 32 // Row key is a concatenation of the access's fd_number and the referenced 33 // user key. 34 static std::string ComputeRowKey(const BlockCacheTraceRecord& access); 35 // The first four bytes of the referenced key in a Get request is the table 36 // id. 37 static uint64_t GetTableId(const BlockCacheTraceRecord& access); 38 // The sequence number of a get request is the last part of the referenced 39 // key. 40 static uint64_t GetSequenceNumber(const BlockCacheTraceRecord& access); 41 // Block offset in a file is the last varint64 in the block key. 42 static uint64_t GetBlockOffsetInFile(const BlockCacheTraceRecord& access); 43 44 static const std::string kUnknownColumnFamilyName; 45 static const uint64_t kReservedGetId; 46 }; 47 48 // Lookup context for tracing block cache accesses. 49 // We trace block accesses at five places: 50 // 1. BlockBasedTable::GetFilter 51 // 2. BlockBasedTable::GetUncompressedDict. 52 // 3. BlockBasedTable::MaybeReadAndLoadToCache. (To trace access on data, index, 53 // and range deletion block.) 54 // 4. BlockBasedTable::Get. (To trace the referenced key and whether the 55 // referenced key exists in a fetched data block.) 56 // 5. BlockBasedTable::MultiGet. (To trace the referenced key and whether the 57 // referenced key exists in a fetched data block.) 58 // The context is created at: 59 // 1. BlockBasedTable::Get. (kUserGet) 60 // 2. BlockBasedTable::MultiGet. (kUserMGet) 61 // 3. BlockBasedTable::NewIterator. (either kUserIterator, kCompaction, or 62 // external SST ingestion calls this function.) 63 // 4. BlockBasedTable::Open. (kPrefetch) 64 // 5. Index/Filter::CacheDependencies. (kPrefetch) 65 // 6. BlockBasedTable::ApproximateOffsetOf. (kCompaction or 66 // kUserApproximateSize). 67 struct BlockCacheLookupContext { BlockCacheLookupContextBlockCacheLookupContext68 BlockCacheLookupContext(const TableReaderCaller& _caller) : caller(_caller) {} BlockCacheLookupContextBlockCacheLookupContext69 BlockCacheLookupContext(const TableReaderCaller& _caller, uint64_t _get_id, 70 bool _get_from_user_specified_snapshot) 71 : caller(_caller), 72 get_id(_get_id), 73 get_from_user_specified_snapshot(_get_from_user_specified_snapshot) {} 74 const TableReaderCaller caller; 75 // These are populated when we perform lookup/insert on block cache. The block 76 // cache tracer uses these inforation when logging the block access at 77 // BlockBasedTable::GET and BlockBasedTable::MultiGet. 78 bool is_cache_hit = false; 79 bool no_insert = false; 80 TraceType block_type = TraceType::kTraceMax; 81 uint64_t block_size = 0; 82 std::string block_key; 83 uint64_t num_keys_in_block = 0; 84 // The unique id associated with Get and MultiGet. This enables us to track 85 // how many blocks a Get/MultiGet request accesses. We can also measure the 86 // impact of row cache vs block cache. 87 uint64_t get_id = 0; 88 std::string referenced_key; 89 bool get_from_user_specified_snapshot = false; 90 FillLookupContextBlockCacheLookupContext91 void FillLookupContext(bool _is_cache_hit, bool _no_insert, 92 TraceType _block_type, uint64_t _block_size, 93 const std::string& _block_key, 94 uint64_t _num_keys_in_block) { 95 is_cache_hit = _is_cache_hit; 96 no_insert = _no_insert; 97 block_type = _block_type; 98 block_size = _block_size; 99 block_key = _block_key; 100 num_keys_in_block = _num_keys_in_block; 101 } 102 }; 103 104 enum Boolean : char { kTrue = 1, kFalse = 0 }; 105 106 struct BlockCacheTraceRecord { 107 // Required fields for all accesses. 108 uint64_t access_timestamp = 0; 109 std::string block_key; 110 TraceType block_type = TraceType::kTraceMax; 111 uint64_t block_size = 0; 112 uint64_t cf_id = 0; 113 std::string cf_name; 114 uint32_t level = 0; 115 uint64_t sst_fd_number = 0; 116 TableReaderCaller caller = TableReaderCaller::kMaxBlockCacheLookupCaller; 117 Boolean is_cache_hit = Boolean::kFalse; 118 Boolean no_insert = Boolean::kFalse; 119 // Required field for Get and MultiGet 120 uint64_t get_id = BlockCacheTraceHelper::kReservedGetId; 121 Boolean get_from_user_specified_snapshot = Boolean::kFalse; 122 std::string referenced_key; 123 // Required fields for data block and user Get/Multi-Get only. 124 uint64_t referenced_data_size = 0; 125 uint64_t num_keys_in_block = 0; 126 Boolean referenced_key_exist_in_block = Boolean::kFalse; 127 BlockCacheTraceRecordBlockCacheTraceRecord128 BlockCacheTraceRecord() {} 129 130 BlockCacheTraceRecord( 131 uint64_t _access_timestamp, std::string _block_key, TraceType _block_type, 132 uint64_t _block_size, uint64_t _cf_id, std::string _cf_name, 133 uint32_t _level, uint64_t _sst_fd_number, TableReaderCaller _caller, 134 bool _is_cache_hit, bool _no_insert, 135 uint64_t _get_id = BlockCacheTraceHelper::kReservedGetId, 136 bool _get_from_user_specified_snapshot = false, 137 std::string _referenced_key = "", uint64_t _referenced_data_size = 0, 138 uint64_t _num_keys_in_block = 0, 139 bool _referenced_key_exist_in_block = false) access_timestampBlockCacheTraceRecord140 : access_timestamp(_access_timestamp), 141 block_key(_block_key), 142 block_type(_block_type), 143 block_size(_block_size), 144 cf_id(_cf_id), 145 cf_name(_cf_name), 146 level(_level), 147 sst_fd_number(_sst_fd_number), 148 caller(_caller), 149 is_cache_hit(_is_cache_hit ? Boolean::kTrue : Boolean::kFalse), 150 no_insert(_no_insert ? Boolean::kTrue : Boolean::kFalse), 151 get_id(_get_id), 152 get_from_user_specified_snapshot(_get_from_user_specified_snapshot 153 ? Boolean::kTrue 154 : Boolean::kFalse), 155 referenced_key(_referenced_key), 156 referenced_data_size(_referenced_data_size), 157 num_keys_in_block(_num_keys_in_block), 158 referenced_key_exist_in_block( 159 _referenced_key_exist_in_block ? Boolean::kTrue : Boolean::kFalse) { 160 } 161 }; 162 163 struct BlockCacheTraceHeader { 164 uint64_t start_time; 165 uint32_t rocksdb_major_version; 166 uint32_t rocksdb_minor_version; 167 }; 168 169 // BlockCacheTraceWriter captures all RocksDB block cache accesses using a 170 // user-provided TraceWriter. Every RocksDB operation is written as a single 171 // trace. Each trace will have a timestamp and type, followed by the trace 172 // payload. 173 class BlockCacheTraceWriter { 174 public: 175 BlockCacheTraceWriter(Env* env, const TraceOptions& trace_options, 176 std::unique_ptr<TraceWriter>&& trace_writer); 177 ~BlockCacheTraceWriter() = default; 178 // No copy and move. 179 BlockCacheTraceWriter(const BlockCacheTraceWriter&) = delete; 180 BlockCacheTraceWriter& operator=(const BlockCacheTraceWriter&) = delete; 181 BlockCacheTraceWriter(BlockCacheTraceWriter&&) = delete; 182 BlockCacheTraceWriter& operator=(BlockCacheTraceWriter&&) = delete; 183 184 // Pass Slice references to avoid copy. 185 Status WriteBlockAccess(const BlockCacheTraceRecord& record, 186 const Slice& block_key, const Slice& cf_name, 187 const Slice& referenced_key); 188 189 // Write a trace header at the beginning, typically on initiating a trace, 190 // with some metadata like a magic number and RocksDB version. 191 Status WriteHeader(); 192 193 private: 194 Env* env_; 195 TraceOptions trace_options_; 196 std::unique_ptr<TraceWriter> trace_writer_; 197 }; 198 199 // Write a trace record in human readable format, see 200 // https://github.com/facebook/rocksdb/wiki/Block-cache-analysis-and-simulation-tools#trace-format 201 // for details. 202 class BlockCacheHumanReadableTraceWriter { 203 public: 204 ~BlockCacheHumanReadableTraceWriter(); 205 206 Status NewWritableFile(const std::string& human_readable_trace_file_path, 207 ROCKSDB_NAMESPACE::Env* env); 208 209 Status WriteHumanReadableTraceRecord(const BlockCacheTraceRecord& access, 210 uint64_t block_id, uint64_t get_key_id); 211 212 private: 213 char trace_record_buffer_[1024 * 1024]; 214 std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> 215 human_readable_trace_file_writer_; 216 }; 217 218 // BlockCacheTraceReader helps read the trace file generated by 219 // BlockCacheTraceWriter using a user provided TraceReader. 220 class BlockCacheTraceReader { 221 public: 222 BlockCacheTraceReader(std::unique_ptr<TraceReader>&& reader); 223 ~BlockCacheTraceReader() = default; 224 // No copy and move. 225 BlockCacheTraceReader(const BlockCacheTraceReader&) = delete; 226 BlockCacheTraceReader& operator=(const BlockCacheTraceReader&) = delete; 227 BlockCacheTraceReader(BlockCacheTraceReader&&) = delete; 228 BlockCacheTraceReader& operator=(BlockCacheTraceReader&&) = delete; 229 230 Status ReadHeader(BlockCacheTraceHeader* header); 231 232 Status ReadAccess(BlockCacheTraceRecord* record); 233 234 private: 235 std::unique_ptr<TraceReader> trace_reader_; 236 }; 237 238 // Read a trace record in human readable format, see 239 // https://github.com/facebook/rocksdb/wiki/Block-cache-analysis-and-simulation-tools#trace-format 240 // for detailed. 241 class BlockCacheHumanReadableTraceReader : public BlockCacheTraceReader { 242 public: 243 BlockCacheHumanReadableTraceReader(const std::string& trace_file_path); 244 245 ~BlockCacheHumanReadableTraceReader(); 246 247 Status ReadHeader(BlockCacheTraceHeader* header); 248 249 Status ReadAccess(BlockCacheTraceRecord* record); 250 251 private: 252 std::ifstream human_readable_trace_reader_; 253 }; 254 255 // A block cache tracer. It downsamples the accesses according to 256 // trace_options and uses BlockCacheTraceWriter to write the access record to 257 // the trace file. 258 class BlockCacheTracer { 259 public: 260 BlockCacheTracer(); 261 ~BlockCacheTracer(); 262 // No copy and move. 263 BlockCacheTracer(const BlockCacheTracer&) = delete; 264 BlockCacheTracer& operator=(const BlockCacheTracer&) = delete; 265 BlockCacheTracer(BlockCacheTracer&&) = delete; 266 BlockCacheTracer& operator=(BlockCacheTracer&&) = delete; 267 268 // Start writing block cache accesses to the trace_writer. 269 Status StartTrace(Env* env, const TraceOptions& trace_options, 270 std::unique_ptr<TraceWriter>&& trace_writer); 271 272 // Stop writing block cache accesses to the trace_writer. 273 void EndTrace(); 274 is_tracing_enabled()275 bool is_tracing_enabled() const { 276 return writer_.load(std::memory_order_relaxed); 277 } 278 279 Status WriteBlockAccess(const BlockCacheTraceRecord& record, 280 const Slice& block_key, const Slice& cf_name, 281 const Slice& referenced_key); 282 283 // GetId cycles from 1 to port::kMaxUint64. 284 uint64_t NextGetId(); 285 286 private: 287 TraceOptions trace_options_; 288 // A mutex protects the writer_. 289 InstrumentedMutex trace_writer_mutex_; 290 std::atomic<BlockCacheTraceWriter*> writer_; 291 std::atomic<uint64_t> get_id_counter_; 292 }; 293 294 } // namespace ROCKSDB_NAMESPACE 295