1 // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
3 // Use of this source code is governed by a BSD-style license that can be
4 // found in the LICENSE file. See the AUTHORS file for names of contributors.
5
6 #ifndef ROCKSDB_LITE
7
8 #include "table/plain/plain_table_reader.h"
9
10 #include <string>
11 #include <vector>
12
13 #include "db/dbformat.h"
14
15 #include "rocksdb/cache.h"
16 #include "rocksdb/comparator.h"
17 #include "rocksdb/env.h"
18 #include "rocksdb/filter_policy.h"
19 #include "rocksdb/options.h"
20 #include "rocksdb/statistics.h"
21
22 #include "table/block_based/block.h"
23 #include "table/block_based/filter_block.h"
24 #include "table/format.h"
25 #include "table/get_context.h"
26 #include "table/internal_iterator.h"
27 #include "table/meta_blocks.h"
28 #include "table/plain/plain_table_bloom.h"
29 #include "table/plain/plain_table_factory.h"
30 #include "table/plain/plain_table_key_coding.h"
31 #include "table/two_level_iterator.h"
32
33 #include "memory/arena.h"
34 #include "monitoring/histogram.h"
35 #include "monitoring/perf_context_imp.h"
36 #include "util/coding.h"
37 #include "util/dynamic_bloom.h"
38 #include "util/hash.h"
39 #include "util/stop_watch.h"
40 #include "util/string_util.h"
41
42 namespace rocksdb {
43
44 namespace {
45
46 // Safely getting a uint32_t element from a char array, where, starting from
47 // `base`, every 4 bytes are considered as an fixed 32 bit integer.
GetFixed32Element(const char * base,size_t offset)48 inline uint32_t GetFixed32Element(const char* base, size_t offset) {
49 return DecodeFixed32(base + offset * sizeof(uint32_t));
50 }
51 } // namespace
52
53 // Iterator to iterate IndexedTable
54 class PlainTableIterator : public InternalIterator {
55 public:
56 explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek);
57 // No copying allowed
58 PlainTableIterator(const PlainTableIterator&) = delete;
59 void operator=(const Iterator&) = delete;
60
61 ~PlainTableIterator() override;
62
63 bool Valid() const override;
64
65 void SeekToFirst() override;
66
67 void SeekToLast() override;
68
69 void Seek(const Slice& target) override;
70
71 void SeekForPrev(const Slice& target) override;
72
73 void Next() override;
74
75 void Prev() override;
76
77 Slice key() const override;
78
79 Slice value() const override;
80
81 Status status() const override;
82
83 private:
84 PlainTableReader* table_;
85 PlainTableKeyDecoder decoder_;
86 bool use_prefix_seek_;
87 uint32_t offset_;
88 uint32_t next_offset_;
89 Slice key_;
90 Slice value_;
91 Status status_;
92 };
93
// Declared here and defined in another translation unit. It is the magic
// number handed to ReadTableProperties() and ReadMetaBlock() in this file.
extern const uint64_t kPlainTableMagicNumber;
// Builds a reader around `file` without touching the file contents; the data
// mapping and the index are set up afterwards by Open().
//
// `table_properties` must not be null: it is dereferenced for `fixed_key_len`
// and `data_size`, both of which are narrowed to 32 bits. The pointer itself is
// not retained here.
PlainTableReader::PlainTableReader(
    const ImmutableCFOptions& ioptions,
    std::unique_ptr<RandomAccessFileReader>&& file,
    const EnvOptions& storage_options, const InternalKeyComparator& icomparator,
    EncodingType encoding_type, uint64_t file_size,
    const TableProperties* table_properties,
    const SliceTransform* prefix_extractor)
    : internal_comparator_(icomparator),
      encoding_type_(encoding_type),
      full_scan_mode_(false),
      user_key_len_(static_cast<uint32_t>(table_properties->fixed_key_len)),
      prefix_extractor_(prefix_extractor),
      enable_bloom_(false),
      // NOTE(review): 6 is presumably the bloom filter's probe count - confirm
      // against the bloom class constructor.
      bloom_(6),
      file_info_(std::move(file), storage_options,
                 static_cast<uint32_t>(table_properties->data_size)),
      ioptions_(ioptions),
      file_size_(file_size),
      // Left null on purpose: Open() stores the properties only after
      // PopulateIndex() has had a chance to add to them.
      table_properties_(nullptr) {}
114
~PlainTableReader()115 PlainTableReader::~PlainTableReader() {
116 }
117
Open(const ImmutableCFOptions & ioptions,const EnvOptions & env_options,const InternalKeyComparator & internal_comparator,std::unique_ptr<RandomAccessFileReader> && file,uint64_t file_size,std::unique_ptr<TableReader> * table_reader,const int bloom_bits_per_key,double hash_table_ratio,size_t index_sparseness,size_t huge_page_tlb_size,bool full_scan_mode,const bool immortal_table,const SliceTransform * prefix_extractor)118 Status PlainTableReader::Open(
119 const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
120 const InternalKeyComparator& internal_comparator,
121 std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
122 std::unique_ptr<TableReader>* table_reader, const int bloom_bits_per_key,
123 double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size,
124 bool full_scan_mode, const bool immortal_table,
125 const SliceTransform* prefix_extractor) {
126 if (file_size > PlainTableIndex::kMaxFileSize) {
127 return Status::NotSupported("File is too large for PlainTableReader!");
128 }
129
130 TableProperties* props_ptr = nullptr;
131 auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
132 ioptions, &props_ptr,
133 true /* compression_type_missing */);
134 std::shared_ptr<TableProperties> props(props_ptr);
135 if (!s.ok()) {
136 return s;
137 }
138
139 assert(hash_table_ratio >= 0.0);
140 auto& user_props = props->user_collected_properties;
141 auto prefix_extractor_in_file = props->prefix_extractor_name;
142
143 if (!full_scan_mode &&
144 !prefix_extractor_in_file.empty() /* old version sst file*/
145 && prefix_extractor_in_file != "nullptr") {
146 if (!prefix_extractor) {
147 return Status::InvalidArgument(
148 "Prefix extractor is missing when opening a PlainTable built "
149 "using a prefix extractor");
150 } else if (prefix_extractor_in_file.compare(prefix_extractor->Name()) !=
151 0) {
152 return Status::InvalidArgument(
153 "Prefix extractor given doesn't match the one used to build "
154 "PlainTable");
155 }
156 }
157
158 EncodingType encoding_type = kPlain;
159 auto encoding_type_prop =
160 user_props.find(PlainTablePropertyNames::kEncodingType);
161 if (encoding_type_prop != user_props.end()) {
162 encoding_type = static_cast<EncodingType>(
163 DecodeFixed32(encoding_type_prop->second.c_str()));
164 }
165
166 std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader(
167 ioptions, std::move(file), env_options, internal_comparator,
168 encoding_type, file_size, props.get(), prefix_extractor));
169
170 s = new_reader->MmapDataIfNeeded();
171 if (!s.ok()) {
172 return s;
173 }
174
175 if (!full_scan_mode) {
176 s = new_reader->PopulateIndex(props.get(), bloom_bits_per_key,
177 hash_table_ratio, index_sparseness,
178 huge_page_tlb_size);
179 if (!s.ok()) {
180 return s;
181 }
182 } else {
183 // Flag to indicate it is a full scan mode so that none of the indexes
184 // can be used.
185 new_reader->full_scan_mode_ = true;
186 }
187 // PopulateIndex can add to the props, so don't store them until now
188 new_reader->table_properties_ = props;
189
190 if (immortal_table && new_reader->file_info_.is_mmap_mode) {
191 new_reader->dummy_cleanable_.reset(new Cleanable());
192 }
193
194 *table_reader = std::move(new_reader);
195 return s;
196 }
197
SetupForCompaction()198 void PlainTableReader::SetupForCompaction() {
199 }
200
NewIterator(const ReadOptions & options,const SliceTransform *,Arena * arena,bool,TableReaderCaller,size_t)201 InternalIterator* PlainTableReader::NewIterator(
202 const ReadOptions& options, const SliceTransform* /* prefix_extractor */,
203 Arena* arena, bool /*skip_filters*/, TableReaderCaller /*caller*/,
204 size_t /*compaction_readahead_size*/) {
205 // Not necessarily used here, but make sure this has been initialized
206 assert(table_properties_);
207
208 bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek;
209 if (arena == nullptr) {
210 return new PlainTableIterator(this, use_prefix_seek);
211 } else {
212 auto mem = arena->AllocateAligned(sizeof(PlainTableIterator));
213 return new (mem) PlainTableIterator(this, use_prefix_seek);
214 }
215 }
216
PopulateIndexRecordList(PlainTableIndexBuilder * index_builder,std::vector<uint32_t> * prefix_hashes)217 Status PlainTableReader::PopulateIndexRecordList(
218 PlainTableIndexBuilder* index_builder,
219 std::vector<uint32_t>* prefix_hashes) {
220 Slice prev_key_prefix_slice;
221 std::string prev_key_prefix_buf;
222 uint32_t pos = data_start_offset_;
223
224 bool is_first_record = true;
225 Slice key_prefix_slice;
226 PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_,
227 prefix_extractor_);
228 while (pos < file_info_.data_end_offset) {
229 uint32_t key_offset = pos;
230 ParsedInternalKey key;
231 Slice value_slice;
232 bool seekable = false;
233 Status s = Next(&decoder, &pos, &key, nullptr, &value_slice, &seekable);
234 if (!s.ok()) {
235 return s;
236 }
237
238 key_prefix_slice = GetPrefix(key);
239 if (enable_bloom_) {
240 bloom_.AddHash(GetSliceHash(key.user_key));
241 } else {
242 if (is_first_record || prev_key_prefix_slice != key_prefix_slice) {
243 if (!is_first_record) {
244 prefix_hashes->push_back(GetSliceHash(prev_key_prefix_slice));
245 }
246 if (file_info_.is_mmap_mode) {
247 prev_key_prefix_slice = key_prefix_slice;
248 } else {
249 prev_key_prefix_buf = key_prefix_slice.ToString();
250 prev_key_prefix_slice = prev_key_prefix_buf;
251 }
252 }
253 }
254
255 index_builder->AddKeyPrefix(GetPrefix(key), key_offset);
256
257 if (!seekable && is_first_record) {
258 return Status::Corruption("Key for a prefix is not seekable");
259 }
260
261 is_first_record = false;
262 }
263
264 prefix_hashes->push_back(GetSliceHash(key_prefix_slice));
265 auto s = index_.InitFromRawData(index_builder->Finish());
266 return s;
267 }
268
AllocateBloom(int bloom_bits_per_key,int num_keys,size_t huge_page_tlb_size)269 void PlainTableReader::AllocateBloom(int bloom_bits_per_key, int num_keys,
270 size_t huge_page_tlb_size) {
271 uint32_t bloom_total_bits = num_keys * bloom_bits_per_key;
272 if (bloom_total_bits > 0) {
273 enable_bloom_ = true;
274 bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality,
275 huge_page_tlb_size, ioptions_.info_log);
276 }
277 }
278
FillBloom(const std::vector<uint32_t> & prefix_hashes)279 void PlainTableReader::FillBloom(const std::vector<uint32_t>& prefix_hashes) {
280 assert(bloom_.IsInitialized());
281 for (const auto prefix_hash : prefix_hashes) {
282 bloom_.AddHash(prefix_hash);
283 }
284 }
285
MmapDataIfNeeded()286 Status PlainTableReader::MmapDataIfNeeded() {
287 if (file_info_.is_mmap_mode) {
288 // Get mmapped memory.
289 return file_info_.file->Read(0, static_cast<size_t>(file_size_), &file_info_.file_data, nullptr);
290 }
291 return Status::OK();
292 }
293
// Sets up `index_` and the bloom filter for this table.
//
// If the file carries an index meta block, the index (and the bloom block, if
// one is present and non-empty) is loaded straight from the file. Otherwise the
// data section is scanned and both structures are built in memory.
//
// `props` must not be null; two user-collected properties describing the index
// size are written into it before returning.
//
// Returns NotSupported when hash-based lookup is requested
// (`hash_table_ratio != 0`) without a prefix extractor, or any error from
// building/loading the index.
Status PlainTableReader::PopulateIndex(TableProperties* props,
                                       int bloom_bits_per_key,
                                       double hash_table_ratio,
                                       size_t index_sparseness,
                                       size_t huge_page_tlb_size) {
  assert(props != nullptr);

  BlockContents index_block_contents;
  Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */,
                           file_size_, kPlainTableMagicNumber, ioptions_,
                           PlainTableIndexBuilder::kPlainTableIndexBlock,
                           BlockType::kIndex, &index_block_contents,
                           true /* compression_type_missing */);

  // A failed read is not an error here: it simply means the index has to be
  // built by scanning the data.
  bool index_in_file = s.ok();

  BlockContents bloom_block_contents;
  bool bloom_in_file = false;
  // We only need to read the bloom block if index block is in file.
  if (index_in_file) {
    s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */,
                      file_size_, kPlainTableMagicNumber, ioptions_,
                      BloomBlockBuilder::kBloomBlock, BlockType::kFilter,
                      &bloom_block_contents,
                      true /* compression_type_missing */);
    // An empty bloom block is treated the same as a missing one.
    bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0;
  }

  Slice* bloom_block;
  if (bloom_in_file) {
    // If bloom_block_contents.allocation is not empty (which will be the case
    // for non-mmap mode), it holds the allocated memory for the bloom block.
    // It needs to be kept alive to keep `bloom_block` valid.
    bloom_block_alloc_ = std::move(bloom_block_contents.allocation);
    bloom_block = &bloom_block_contents.data;
  } else {
    bloom_block = nullptr;
  }

  Slice* index_block;
  if (index_in_file) {
    // If index_block_contents.allocation is not empty (which will be the case
    // for non-mmap mode), it holds the allocated memory for the index block.
    // It needs to be kept alive to keep `index_block` valid.
    index_block_alloc_ = std::move(index_block_contents.allocation);
    index_block = &index_block_contents.data;
  } else {
    index_block = nullptr;
  }

  if ((prefix_extractor_ == nullptr) && (hash_table_ratio != 0)) {
    // moptions.prefix_extractor is required for a hash-based look-up.
    return Status::NotSupported(
        "PlainTable requires a prefix extractor enable prefix hash mode.");
  }

  // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows
  // for a prefix (starting from the first one), generate a record of (hash,
  // offset) and append it to IndexRecordList, which is a data structure created
  // to store them.

  if (!index_in_file) {
    // Allocate bloom filter here for total order mode.
    // It has to exist before the scan below, which adds each user key hash to
    // it when the filter is enabled.
    if (IsTotalOrderMode()) {
      AllocateBloom(bloom_bits_per_key,
                    static_cast<uint32_t>(props->num_entries),
                    huge_page_tlb_size);
    }
  } else if (bloom_in_file) {
    enable_bloom_ = true;
    auto num_blocks_property = props->user_collected_properties.find(
        PlainTablePropertyNames::kNumBloomBlocks);

    // Falls back to 0 when the property is absent or cannot be decoded.
    uint32_t num_blocks = 0;
    if (num_blocks_property != props->user_collected_properties.end()) {
      Slice temp_slice(num_blocks_property->second);
      if (!GetVarint32(&temp_slice, &num_blocks)) {
        num_blocks = 0;
      }
    }
    // cast away const qualifier, because bloom_ won't be changed
    // The bloom size is passed in bits, hence the multiplication by 8.
    bloom_.SetRawData(const_cast<char*>(bloom_block->data()),
                      static_cast<uint32_t>(bloom_block->size()) * 8,
                      num_blocks);
  } else {
    // Index in file but no bloom in file. Disable bloom filter in this case.
    enable_bloom_ = false;
    bloom_bits_per_key = 0;
  }

  PlainTableIndexBuilder index_builder(&arena_, ioptions_, prefix_extractor_,
                                       index_sparseness, hash_table_ratio,
                                       huge_page_tlb_size);

  std::vector<uint32_t> prefix_hashes;
  if (!index_in_file) {
    // Populates _bloom if enabled (total order mode)
    s = PopulateIndexRecordList(&index_builder, &prefix_hashes);
    if (!s.ok()) {
      return s;
    }
  } else {
    s = index_.InitFromRawData(*index_block);
    if (!s.ok()) {
      return s;
    }
  }

  if (!index_in_file) {
    if (!IsTotalOrderMode()) {
      // Calculated bloom filter size and allocate memory for
      // bloom filter based on the number of prefixes, then fill it.
      AllocateBloom(bloom_bits_per_key, index_.GetNumPrefixes(),
                    huge_page_tlb_size);
      if (enable_bloom_) {
        FillBloom(prefix_hashes);
      }
    }
  }

  // Fill two table properties.
  // Both are reported as 0 when the index was loaded from the file.
  if (!index_in_file) {
    props->user_collected_properties["plain_table_hash_table_size"] =
        ToString(index_.GetIndexSize() * PlainTableIndex::kOffsetLen);
    props->user_collected_properties["plain_table_sub_index_size"] =
        ToString(index_.GetSubIndexSize());
  } else {
    props->user_collected_properties["plain_table_hash_table_size"] =
        ToString(0);
    props->user_collected_properties["plain_table_sub_index_size"] =
        ToString(0);
  }

  return Status::OK();
}
429
GetOffset(PlainTableKeyDecoder * decoder,const Slice & target,const Slice & prefix,uint32_t prefix_hash,bool & prefix_matched,uint32_t * offset) const430 Status PlainTableReader::GetOffset(PlainTableKeyDecoder* decoder,
431 const Slice& target, const Slice& prefix,
432 uint32_t prefix_hash, bool& prefix_matched,
433 uint32_t* offset) const {
434 prefix_matched = false;
435 uint32_t prefix_index_offset;
436 auto res = index_.GetOffset(prefix_hash, &prefix_index_offset);
437 if (res == PlainTableIndex::kNoPrefixForBucket) {
438 *offset = file_info_.data_end_offset;
439 return Status::OK();
440 } else if (res == PlainTableIndex::kDirectToFile) {
441 *offset = prefix_index_offset;
442 return Status::OK();
443 }
444
445 // point to sub-index, need to do a binary search
446 uint32_t upper_bound;
447 const char* base_ptr =
448 index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound);
449 uint32_t low = 0;
450 uint32_t high = upper_bound;
451 ParsedInternalKey mid_key;
452 ParsedInternalKey parsed_target;
453 if (!ParseInternalKey(target, &parsed_target)) {
454 return Status::Corruption(Slice());
455 }
456
457 // The key is between [low, high). Do a binary search between it.
458 while (high - low > 1) {
459 uint32_t mid = (high + low) / 2;
460 uint32_t file_offset = GetFixed32Element(base_ptr, mid);
461 uint32_t tmp;
462 Status s = decoder->NextKeyNoValue(file_offset, &mid_key, nullptr, &tmp);
463 if (!s.ok()) {
464 return s;
465 }
466 int cmp_result = internal_comparator_.Compare(mid_key, parsed_target);
467 if (cmp_result < 0) {
468 low = mid;
469 } else {
470 if (cmp_result == 0) {
471 // Happen to have found the exact key or target is smaller than the
472 // first key after base_offset.
473 prefix_matched = true;
474 *offset = file_offset;
475 return Status::OK();
476 } else {
477 high = mid;
478 }
479 }
480 }
481 // Both of the key at the position low or low+1 could share the same
482 // prefix as target. We need to rule out one of them to avoid to go
483 // to the wrong prefix.
484 ParsedInternalKey low_key;
485 uint32_t tmp;
486 uint32_t low_key_offset = GetFixed32Element(base_ptr, low);
487 Status s = decoder->NextKeyNoValue(low_key_offset, &low_key, nullptr, &tmp);
488 if (!s.ok()) {
489 return s;
490 }
491
492 if (GetPrefix(low_key) == prefix) {
493 prefix_matched = true;
494 *offset = low_key_offset;
495 } else if (low + 1 < upper_bound) {
496 // There is possible a next prefix, return it
497 prefix_matched = false;
498 *offset = GetFixed32Element(base_ptr, low + 1);
499 } else {
500 // target is larger than a key of the last prefix in this bucket
501 // but with a different prefix. Key does not exist.
502 *offset = file_info_.data_end_offset;
503 }
504 return Status::OK();
505 }
506
MatchBloom(uint32_t hash) const507 bool PlainTableReader::MatchBloom(uint32_t hash) const {
508 if (!enable_bloom_) {
509 return true;
510 }
511
512 if (bloom_.MayContainHash(hash)) {
513 PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
514 return true;
515 } else {
516 PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
517 return false;
518 }
519 }
520
Next(PlainTableKeyDecoder * decoder,uint32_t * offset,ParsedInternalKey * parsed_key,Slice * internal_key,Slice * value,bool * seekable) const521 Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
522 ParsedInternalKey* parsed_key,
523 Slice* internal_key, Slice* value,
524 bool* seekable) const {
525 if (*offset == file_info_.data_end_offset) {
526 *offset = file_info_.data_end_offset;
527 return Status::OK();
528 }
529
530 if (*offset > file_info_.data_end_offset) {
531 return Status::Corruption("Offset is out of file size");
532 }
533
534 uint32_t bytes_read;
535 Status s = decoder->NextKey(*offset, parsed_key, internal_key, value,
536 &bytes_read, seekable);
537 if (!s.ok()) {
538 return s;
539 }
540 *offset = *offset + bytes_read;
541 return Status::OK();
542 }
543
Prepare(const Slice & target)544 void PlainTableReader::Prepare(const Slice& target) {
545 if (enable_bloom_) {
546 uint32_t prefix_hash = GetSliceHash(GetPrefix(target));
547 bloom_.Prefetch(prefix_hash);
548 }
549 }
550
Get(const ReadOptions &,const Slice & target,GetContext * get_context,const SliceTransform *,bool)551 Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target,
552 GetContext* get_context,
553 const SliceTransform* /* prefix_extractor */,
554 bool /*skip_filters*/) {
555 // Check bloom filter first.
556 Slice prefix_slice;
557 uint32_t prefix_hash;
558 if (IsTotalOrderMode()) {
559 if (full_scan_mode_) {
560 status_ =
561 Status::InvalidArgument("Get() is not allowed in full scan mode.");
562 }
563 // Match whole user key for bloom filter check.
564 if (!MatchBloom(GetSliceHash(GetUserKey(target)))) {
565 return Status::OK();
566 }
567 // in total order mode, there is only one bucket 0, and we always use empty
568 // prefix.
569 prefix_slice = Slice();
570 prefix_hash = 0;
571 } else {
572 prefix_slice = GetPrefix(target);
573 prefix_hash = GetSliceHash(prefix_slice);
574 if (!MatchBloom(prefix_hash)) {
575 return Status::OK();
576 }
577 }
578 uint32_t offset;
579 bool prefix_match;
580 PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_,
581 prefix_extractor_);
582 Status s = GetOffset(&decoder, target, prefix_slice, prefix_hash,
583 prefix_match, &offset);
584
585 if (!s.ok()) {
586 return s;
587 }
588 ParsedInternalKey found_key;
589 ParsedInternalKey parsed_target;
590 if (!ParseInternalKey(target, &parsed_target)) {
591 return Status::Corruption(Slice());
592 }
593 Slice found_value;
594 while (offset < file_info_.data_end_offset) {
595 s = Next(&decoder, &offset, &found_key, nullptr, &found_value);
596 if (!s.ok()) {
597 return s;
598 }
599 if (!prefix_match) {
600 // Need to verify prefix for the first key found if it is not yet
601 // checked.
602 if (GetPrefix(found_key) != prefix_slice) {
603 return Status::OK();
604 }
605 prefix_match = true;
606 }
607 // TODO(ljin): since we know the key comparison result here,
608 // can we enable the fast path?
609 if (internal_comparator_.Compare(found_key, parsed_target) >= 0) {
610 bool dont_care __attribute__((__unused__));
611 if (!get_context->SaveValue(found_key, found_value, &dont_care,
612 dummy_cleanable_.get())) {
613 break;
614 }
615 }
616 }
617 return Status::OK();
618 }
619
ApproximateOffsetOf(const Slice &,TableReaderCaller)620 uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/,
621 TableReaderCaller /*caller*/) {
622 return 0;
623 }
624
ApproximateSize(const Slice &,const Slice &,TableReaderCaller)625 uint64_t PlainTableReader::ApproximateSize(const Slice& /*start*/,
626 const Slice& /*end*/,
627 TableReaderCaller /*caller*/) {
628 return 0;
629 }
630
PlainTableIterator(PlainTableReader * table,bool use_prefix_seek)631 PlainTableIterator::PlainTableIterator(PlainTableReader* table,
632 bool use_prefix_seek)
633 : table_(table),
634 decoder_(&table_->file_info_, table_->encoding_type_,
635 table_->user_key_len_, table_->prefix_extractor_),
636 use_prefix_seek_(use_prefix_seek) {
637 next_offset_ = offset_ = table_->file_info_.data_end_offset;
638 }
639
~PlainTableIterator()640 PlainTableIterator::~PlainTableIterator() {
641 }
642
Valid() const643 bool PlainTableIterator::Valid() const {
644 return offset_ < table_->file_info_.data_end_offset &&
645 offset_ >= table_->data_start_offset_;
646 }
647
SeekToFirst()648 void PlainTableIterator::SeekToFirst() {
649 status_ = Status::OK();
650 next_offset_ = table_->data_start_offset_;
651 if (next_offset_ >= table_->file_info_.data_end_offset) {
652 next_offset_ = offset_ = table_->file_info_.data_end_offset;
653 } else {
654 Next();
655 }
656 }
657
SeekToLast()658 void PlainTableIterator::SeekToLast() {
659 assert(false);
660 status_ = Status::NotSupported("SeekToLast() is not supported in PlainTable");
661 next_offset_ = offset_ = table_->file_info_.data_end_offset;
662 }
663
Seek(const Slice & target)664 void PlainTableIterator::Seek(const Slice& target) {
665 if (use_prefix_seek_ != !table_->IsTotalOrderMode()) {
666 // This check is done here instead of NewIterator() to permit creating an
667 // iterator with total_order_seek = true even if we won't be able to Seek()
668 // it. This is needed for compaction: it creates iterator with
669 // total_order_seek = true but usually never does Seek() on it,
670 // only SeekToFirst().
671 status_ =
672 Status::InvalidArgument(
673 "total_order_seek not implemented for PlainTable.");
674 offset_ = next_offset_ = table_->file_info_.data_end_offset;
675 return;
676 }
677
678 // If the user doesn't set prefix seek option and we are not able to do a
679 // total Seek(). assert failure.
680 if (table_->IsTotalOrderMode()) {
681 if (table_->full_scan_mode_) {
682 status_ =
683 Status::InvalidArgument("Seek() is not allowed in full scan mode.");
684 offset_ = next_offset_ = table_->file_info_.data_end_offset;
685 return;
686 } else if (table_->GetIndexSize() > 1) {
687 assert(false);
688 status_ = Status::NotSupported(
689 "PlainTable cannot issue non-prefix seek unless in total order "
690 "mode.");
691 offset_ = next_offset_ = table_->file_info_.data_end_offset;
692 return;
693 }
694 }
695
696 Slice prefix_slice = table_->GetPrefix(target);
697 uint32_t prefix_hash = 0;
698 // Bloom filter is ignored in total-order mode.
699 if (!table_->IsTotalOrderMode()) {
700 prefix_hash = GetSliceHash(prefix_slice);
701 if (!table_->MatchBloom(prefix_hash)) {
702 status_ = Status::OK();
703 offset_ = next_offset_ = table_->file_info_.data_end_offset;
704 return;
705 }
706 }
707 bool prefix_match;
708 status_ = table_->GetOffset(&decoder_, target, prefix_slice, prefix_hash,
709 prefix_match, &next_offset_);
710 if (!status_.ok()) {
711 offset_ = next_offset_ = table_->file_info_.data_end_offset;
712 return;
713 }
714
715 if (next_offset_ < table_->file_info_.data_end_offset) {
716 for (Next(); status_.ok() && Valid(); Next()) {
717 if (!prefix_match) {
718 // Need to verify the first key's prefix
719 if (table_->GetPrefix(key()) != prefix_slice) {
720 offset_ = next_offset_ = table_->file_info_.data_end_offset;
721 break;
722 }
723 prefix_match = true;
724 }
725 if (table_->internal_comparator_.Compare(key(), target) >= 0) {
726 break;
727 }
728 }
729 } else {
730 offset_ = table_->file_info_.data_end_offset;
731 }
732 }
733
SeekForPrev(const Slice &)734 void PlainTableIterator::SeekForPrev(const Slice& /*target*/) {
735 assert(false);
736 status_ =
737 Status::NotSupported("SeekForPrev() is not supported in PlainTable");
738 offset_ = next_offset_ = table_->file_info_.data_end_offset;
739 }
740
Next()741 void PlainTableIterator::Next() {
742 offset_ = next_offset_;
743 if (offset_ < table_->file_info_.data_end_offset) {
744 Slice tmp_slice;
745 ParsedInternalKey parsed_key;
746 status_ =
747 table_->Next(&decoder_, &next_offset_, &parsed_key, &key_, &value_);
748 if (!status_.ok()) {
749 offset_ = next_offset_ = table_->file_info_.data_end_offset;
750 }
751 }
752 }
753
Prev()754 void PlainTableIterator::Prev() {
755 assert(false);
756 }
757
key() const758 Slice PlainTableIterator::key() const {
759 assert(Valid());
760 return key_;
761 }
762
value() const763 Slice PlainTableIterator::value() const {
764 assert(Valid());
765 return value_;
766 }
767
status() const768 Status PlainTableIterator::status() const {
769 return status_;
770 }
771
772 } // namespace rocksdb
773 #endif // ROCKSDB_LITE
774