1 // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
3 // Use of this source code is governed by a BSD-style license that can be
4 // found in the LICENSE file. See the AUTHORS file for names of contributors.
5 
6 #ifndef ROCKSDB_LITE
7 
8 #include "table/plain/plain_table_reader.h"
9 
10 #include <string>
11 #include <vector>
12 
13 #include "db/dbformat.h"
14 
15 #include "rocksdb/cache.h"
16 #include "rocksdb/comparator.h"
17 #include "rocksdb/env.h"
18 #include "rocksdb/filter_policy.h"
19 #include "rocksdb/options.h"
20 #include "rocksdb/statistics.h"
21 
22 #include "table/block_based/block.h"
23 #include "table/block_based/filter_block.h"
24 #include "table/format.h"
25 #include "table/get_context.h"
26 #include "table/internal_iterator.h"
27 #include "table/meta_blocks.h"
28 #include "table/plain/plain_table_bloom.h"
29 #include "table/plain/plain_table_factory.h"
30 #include "table/plain/plain_table_key_coding.h"
31 #include "table/two_level_iterator.h"
32 
33 #include "memory/arena.h"
34 #include "monitoring/histogram.h"
35 #include "monitoring/perf_context_imp.h"
36 #include "util/coding.h"
37 #include "util/dynamic_bloom.h"
38 #include "util/hash.h"
39 #include "util/stop_watch.h"
40 #include "util/string_util.h"
41 
42 namespace ROCKSDB_NAMESPACE {
43 
44 namespace {
45 
46 // Safely getting a uint32_t element from a char array, where, starting from
47 // `base`, every 4 bytes are considered as an fixed 32 bit integer.
GetFixed32Element(const char * base,size_t offset)48 inline uint32_t GetFixed32Element(const char* base, size_t offset) {
49   return DecodeFixed32(base + offset * sizeof(uint32_t));
50 }
51 }  // namespace
52 
53 // Iterator to iterate IndexedTable
54 class PlainTableIterator : public InternalIterator {
55  public:
56   explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek);
57   // No copying allowed
58   PlainTableIterator(const PlainTableIterator&) = delete;
59   void operator=(const Iterator&) = delete;
60 
61   ~PlainTableIterator() override;
62 
63   bool Valid() const override;
64 
65   void SeekToFirst() override;
66 
67   void SeekToLast() override;
68 
69   void Seek(const Slice& target) override;
70 
71   void SeekForPrev(const Slice& target) override;
72 
73   void Next() override;
74 
75   void Prev() override;
76 
77   Slice key() const override;
78 
79   Slice value() const override;
80 
81   Status status() const override;
82 
83  private:
84   PlainTableReader* table_;
85   PlainTableKeyDecoder decoder_;
86   bool use_prefix_seek_;
87   uint32_t offset_;
88   uint32_t next_offset_;
89   Slice key_;
90   Slice value_;
91   Status status_;
92 };
93 
94 extern const uint64_t kPlainTableMagicNumber;
PlainTableReader(const ImmutableOptions & ioptions,std::unique_ptr<RandomAccessFileReader> && file,const EnvOptions & storage_options,const InternalKeyComparator & icomparator,EncodingType encoding_type,uint64_t file_size,const TableProperties * table_properties,const SliceTransform * prefix_extractor)95 PlainTableReader::PlainTableReader(
96     const ImmutableOptions& ioptions,
97     std::unique_ptr<RandomAccessFileReader>&& file,
98     const EnvOptions& storage_options, const InternalKeyComparator& icomparator,
99     EncodingType encoding_type, uint64_t file_size,
100     const TableProperties* table_properties,
101     const SliceTransform* prefix_extractor)
102     : internal_comparator_(icomparator),
103       encoding_type_(encoding_type),
104       full_scan_mode_(false),
105       user_key_len_(static_cast<uint32_t>(table_properties->fixed_key_len)),
106       prefix_extractor_(prefix_extractor),
107       enable_bloom_(false),
108       bloom_(6),
109       file_info_(std::move(file), storage_options,
110                  static_cast<uint32_t>(table_properties->data_size)),
111       ioptions_(ioptions),
112       file_size_(file_size),
113       table_properties_(nullptr) {}
114 
~PlainTableReader()115 PlainTableReader::~PlainTableReader() {
116   // Should fix?
117   status_.PermitUncheckedError();
118 }
119 
Open(const ImmutableOptions & ioptions,const EnvOptions & env_options,const InternalKeyComparator & internal_comparator,std::unique_ptr<RandomAccessFileReader> && file,uint64_t file_size,std::unique_ptr<TableReader> * table_reader,const int bloom_bits_per_key,double hash_table_ratio,size_t index_sparseness,size_t huge_page_tlb_size,bool full_scan_mode,const bool immortal_table,const SliceTransform * prefix_extractor)120 Status PlainTableReader::Open(
121     const ImmutableOptions& ioptions, const EnvOptions& env_options,
122     const InternalKeyComparator& internal_comparator,
123     std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
124     std::unique_ptr<TableReader>* table_reader, const int bloom_bits_per_key,
125     double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size,
126     bool full_scan_mode, const bool immortal_table,
127     const SliceTransform* prefix_extractor) {
128   if (file_size > PlainTableIndex::kMaxFileSize) {
129     return Status::NotSupported("File is too large for PlainTableReader!");
130   }
131 
132   TableProperties* props_ptr = nullptr;
133   auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
134                                ioptions, &props_ptr,
135                                true /* compression_type_missing */);
136   std::shared_ptr<TableProperties> props(props_ptr);
137   if (!s.ok()) {
138     return s;
139   }
140 
141   assert(hash_table_ratio >= 0.0);
142   auto& user_props = props->user_collected_properties;
143   auto prefix_extractor_in_file = props->prefix_extractor_name;
144 
145   if (!full_scan_mode &&
146       !prefix_extractor_in_file.empty() /* old version sst file*/
147       && prefix_extractor_in_file != "nullptr") {
148     if (!prefix_extractor) {
149       return Status::InvalidArgument(
150           "Prefix extractor is missing when opening a PlainTable built "
151           "using a prefix extractor");
152     } else if (prefix_extractor_in_file.compare(prefix_extractor->Name()) !=
153                0) {
154       return Status::InvalidArgument(
155           "Prefix extractor given doesn't match the one used to build "
156           "PlainTable");
157     }
158   }
159 
160   EncodingType encoding_type = kPlain;
161   auto encoding_type_prop =
162       user_props.find(PlainTablePropertyNames::kEncodingType);
163   if (encoding_type_prop != user_props.end()) {
164     encoding_type = static_cast<EncodingType>(
165         DecodeFixed32(encoding_type_prop->second.c_str()));
166   }
167 
168   std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader(
169       ioptions, std::move(file), env_options, internal_comparator,
170       encoding_type, file_size, props.get(), prefix_extractor));
171 
172   s = new_reader->MmapDataIfNeeded();
173   if (!s.ok()) {
174     return s;
175   }
176 
177   if (!full_scan_mode) {
178     s = new_reader->PopulateIndex(props.get(), bloom_bits_per_key,
179                                   hash_table_ratio, index_sparseness,
180                                   huge_page_tlb_size);
181     if (!s.ok()) {
182       return s;
183     }
184   } else {
185     // Flag to indicate it is a full scan mode so that none of the indexes
186     // can be used.
187     new_reader->full_scan_mode_ = true;
188   }
189   // PopulateIndex can add to the props, so don't store them until now
190   new_reader->table_properties_ = props;
191 
192   if (immortal_table && new_reader->file_info_.is_mmap_mode) {
193     new_reader->dummy_cleanable_.reset(new Cleanable());
194   }
195 
196   *table_reader = std::move(new_reader);
197   return s;
198 }
199 
SetupForCompaction()200 void PlainTableReader::SetupForCompaction() {
201 }
202 
NewIterator(const ReadOptions & options,const SliceTransform *,Arena * arena,bool,TableReaderCaller,size_t,bool)203 InternalIterator* PlainTableReader::NewIterator(
204     const ReadOptions& options, const SliceTransform* /* prefix_extractor */,
205     Arena* arena, bool /*skip_filters*/, TableReaderCaller /*caller*/,
206     size_t /*compaction_readahead_size*/,
207     bool /* allow_unprepared_value */) {
208   // Not necessarily used here, but make sure this has been initialized
209   assert(table_properties_);
210 
211   // Auto prefix mode is not implemented in PlainTable.
212   bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek &&
213                          !options.auto_prefix_mode;
214   if (arena == nullptr) {
215     return new PlainTableIterator(this, use_prefix_seek);
216   } else {
217     auto mem = arena->AllocateAligned(sizeof(PlainTableIterator));
218     return new (mem) PlainTableIterator(this, use_prefix_seek);
219   }
220 }
221 
PopulateIndexRecordList(PlainTableIndexBuilder * index_builder,std::vector<uint32_t> * prefix_hashes)222 Status PlainTableReader::PopulateIndexRecordList(
223     PlainTableIndexBuilder* index_builder,
224     std::vector<uint32_t>* prefix_hashes) {
225   Slice prev_key_prefix_slice;
226   std::string prev_key_prefix_buf;
227   uint32_t pos = data_start_offset_;
228 
229   bool is_first_record = true;
230   Slice key_prefix_slice;
231   PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_,
232                                prefix_extractor_);
233   while (pos < file_info_.data_end_offset) {
234     uint32_t key_offset = pos;
235     ParsedInternalKey key;
236     Slice value_slice;
237     bool seekable = false;
238     Status s = Next(&decoder, &pos, &key, nullptr, &value_slice, &seekable);
239     if (!s.ok()) {
240       return s;
241     }
242 
243     key_prefix_slice = GetPrefix(key);
244     if (enable_bloom_) {
245       bloom_.AddHash(GetSliceHash(key.user_key));
246     } else {
247       if (is_first_record || prev_key_prefix_slice != key_prefix_slice) {
248         if (!is_first_record) {
249           prefix_hashes->push_back(GetSliceHash(prev_key_prefix_slice));
250         }
251         if (file_info_.is_mmap_mode) {
252           prev_key_prefix_slice = key_prefix_slice;
253         } else {
254           prev_key_prefix_buf = key_prefix_slice.ToString();
255           prev_key_prefix_slice = prev_key_prefix_buf;
256         }
257       }
258     }
259 
260     index_builder->AddKeyPrefix(GetPrefix(key), key_offset);
261 
262     if (!seekable && is_first_record) {
263       return Status::Corruption("Key for a prefix is not seekable");
264     }
265 
266     is_first_record = false;
267   }
268 
269   prefix_hashes->push_back(GetSliceHash(key_prefix_slice));
270   auto s = index_.InitFromRawData(index_builder->Finish());
271   return s;
272 }
273 
AllocateBloom(int bloom_bits_per_key,int num_keys,size_t huge_page_tlb_size)274 void PlainTableReader::AllocateBloom(int bloom_bits_per_key, int num_keys,
275                                      size_t huge_page_tlb_size) {
276   uint32_t bloom_total_bits = num_keys * bloom_bits_per_key;
277   if (bloom_total_bits > 0) {
278     enable_bloom_ = true;
279     bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality,
280                         huge_page_tlb_size, ioptions_.logger);
281   }
282 }
283 
FillBloom(const std::vector<uint32_t> & prefix_hashes)284 void PlainTableReader::FillBloom(const std::vector<uint32_t>& prefix_hashes) {
285   assert(bloom_.IsInitialized());
286   for (const auto prefix_hash : prefix_hashes) {
287     bloom_.AddHash(prefix_hash);
288   }
289 }
290 
MmapDataIfNeeded()291 Status PlainTableReader::MmapDataIfNeeded() {
292   if (file_info_.is_mmap_mode) {
293     // Get mmapped memory.
294     return file_info_.file->Read(IOOptions(), 0,
295                                  static_cast<size_t>(file_size_),
296                                  &file_info_.file_data, nullptr, nullptr);
297   }
298   return Status::OK();
299 }
300 
PopulateIndex(TableProperties * props,int bloom_bits_per_key,double hash_table_ratio,size_t index_sparseness,size_t huge_page_tlb_size)301 Status PlainTableReader::PopulateIndex(TableProperties* props,
302                                        int bloom_bits_per_key,
303                                        double hash_table_ratio,
304                                        size_t index_sparseness,
305                                        size_t huge_page_tlb_size) {
306   assert(props != nullptr);
307 
308   BlockContents index_block_contents;
309   Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */,
310                            file_size_, kPlainTableMagicNumber, ioptions_,
311                            PlainTableIndexBuilder::kPlainTableIndexBlock,
312                            BlockType::kIndex, &index_block_contents,
313                            true /* compression_type_missing */);
314 
315   bool index_in_file = s.ok();
316 
317   BlockContents bloom_block_contents;
318   bool bloom_in_file = false;
319   // We only need to read the bloom block if index block is in file.
320   if (index_in_file) {
321     s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */,
322                       file_size_, kPlainTableMagicNumber, ioptions_,
323                       BloomBlockBuilder::kBloomBlock, BlockType::kFilter,
324                       &bloom_block_contents,
325                       true /* compression_type_missing */);
326     bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0;
327   }
328 
329   Slice* bloom_block;
330   if (bloom_in_file) {
331     // If bloom_block_contents.allocation is not empty (which will be the case
332     // for non-mmap mode), it holds the alloated memory for the bloom block.
333     // It needs to be kept alive to keep `bloom_block` valid.
334     bloom_block_alloc_ = std::move(bloom_block_contents.allocation);
335     bloom_block = &bloom_block_contents.data;
336   } else {
337     bloom_block = nullptr;
338   }
339 
340   Slice* index_block;
341   if (index_in_file) {
342     // If index_block_contents.allocation is not empty (which will be the case
343     // for non-mmap mode), it holds the alloated memory for the index block.
344     // It needs to be kept alive to keep `index_block` valid.
345     index_block_alloc_ = std::move(index_block_contents.allocation);
346     index_block = &index_block_contents.data;
347   } else {
348     index_block = nullptr;
349   }
350 
351   if ((prefix_extractor_ == nullptr) && (hash_table_ratio != 0)) {
352     // moptions.prefix_extractor is requried for a hash-based look-up.
353     return Status::NotSupported(
354         "PlainTable requires a prefix extractor enable prefix hash mode.");
355   }
356 
357   // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows
358   // for a prefix (starting from the first one), generate a record of (hash,
359   // offset) and append it to IndexRecordList, which is a data structure created
360   // to store them.
361 
362   if (!index_in_file) {
363     // Allocate bloom filter here for total order mode.
364     if (IsTotalOrderMode()) {
365       AllocateBloom(bloom_bits_per_key,
366                     static_cast<uint32_t>(props->num_entries),
367                     huge_page_tlb_size);
368     }
369   } else if (bloom_in_file) {
370     enable_bloom_ = true;
371     auto num_blocks_property = props->user_collected_properties.find(
372         PlainTablePropertyNames::kNumBloomBlocks);
373 
374     uint32_t num_blocks = 0;
375     if (num_blocks_property != props->user_collected_properties.end()) {
376       Slice temp_slice(num_blocks_property->second);
377       if (!GetVarint32(&temp_slice, &num_blocks)) {
378         num_blocks = 0;
379       }
380     }
381     // cast away const qualifier, because bloom_ won't be changed
382     bloom_.SetRawData(const_cast<char*>(bloom_block->data()),
383                       static_cast<uint32_t>(bloom_block->size()) * 8,
384                       num_blocks);
385   } else {
386     // Index in file but no bloom in file. Disable bloom filter in this case.
387     enable_bloom_ = false;
388     bloom_bits_per_key = 0;
389   }
390 
391   PlainTableIndexBuilder index_builder(&arena_, ioptions_, prefix_extractor_,
392                                        index_sparseness, hash_table_ratio,
393                                        huge_page_tlb_size);
394 
395   std::vector<uint32_t> prefix_hashes;
396   if (!index_in_file) {
397     // Populates _bloom if enabled (total order mode)
398     s = PopulateIndexRecordList(&index_builder, &prefix_hashes);
399     if (!s.ok()) {
400       return s;
401     }
402   } else {
403     s = index_.InitFromRawData(*index_block);
404     if (!s.ok()) {
405       return s;
406     }
407   }
408 
409   if (!index_in_file) {
410     if (!IsTotalOrderMode()) {
411       // Calculated bloom filter size and allocate memory for
412       // bloom filter based on the number of prefixes, then fill it.
413       AllocateBloom(bloom_bits_per_key, index_.GetNumPrefixes(),
414                     huge_page_tlb_size);
415       if (enable_bloom_) {
416         FillBloom(prefix_hashes);
417       }
418     }
419   }
420 
421   // Fill two table properties.
422   if (!index_in_file) {
423     props->user_collected_properties["plain_table_hash_table_size"] =
424         ToString(index_.GetIndexSize() * PlainTableIndex::kOffsetLen);
425     props->user_collected_properties["plain_table_sub_index_size"] =
426         ToString(index_.GetSubIndexSize());
427   } else {
428     props->user_collected_properties["plain_table_hash_table_size"] =
429         ToString(0);
430     props->user_collected_properties["plain_table_sub_index_size"] =
431         ToString(0);
432   }
433 
434   return Status::OK();
435 }
436 
GetOffset(PlainTableKeyDecoder * decoder,const Slice & target,const Slice & prefix,uint32_t prefix_hash,bool & prefix_matched,uint32_t * offset) const437 Status PlainTableReader::GetOffset(PlainTableKeyDecoder* decoder,
438                                    const Slice& target, const Slice& prefix,
439                                    uint32_t prefix_hash, bool& prefix_matched,
440                                    uint32_t* offset) const {
441   prefix_matched = false;
442   uint32_t prefix_index_offset;
443   auto res = index_.GetOffset(prefix_hash, &prefix_index_offset);
444   if (res == PlainTableIndex::kNoPrefixForBucket) {
445     *offset = file_info_.data_end_offset;
446     return Status::OK();
447   } else if (res == PlainTableIndex::kDirectToFile) {
448     *offset = prefix_index_offset;
449     return Status::OK();
450   }
451 
452   // point to sub-index, need to do a binary search
453   uint32_t upper_bound = 0;
454   const char* base_ptr =
455       index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound);
456   uint32_t low = 0;
457   uint32_t high = upper_bound;
458   ParsedInternalKey mid_key;
459   ParsedInternalKey parsed_target;
460   Status s = ParseInternalKey(target, &parsed_target,
461                               false /* log_err_key */);  // TODO
462   if (!s.ok()) return s;
463 
464   // The key is between [low, high). Do a binary search between it.
465   while (high - low > 1) {
466     uint32_t mid = (high + low) / 2;
467     uint32_t file_offset = GetFixed32Element(base_ptr, mid);
468     uint32_t tmp;
469     s = decoder->NextKeyNoValue(file_offset, &mid_key, nullptr, &tmp);
470     if (!s.ok()) {
471       return s;
472     }
473     int cmp_result = internal_comparator_.Compare(mid_key, parsed_target);
474     if (cmp_result < 0) {
475       low = mid;
476     } else {
477       if (cmp_result == 0) {
478         // Happen to have found the exact key or target is smaller than the
479         // first key after base_offset.
480         prefix_matched = true;
481         *offset = file_offset;
482         return Status::OK();
483       } else {
484         high = mid;
485       }
486     }
487   }
488   // Both of the key at the position low or low+1 could share the same
489   // prefix as target. We need to rule out one of them to avoid to go
490   // to the wrong prefix.
491   ParsedInternalKey low_key;
492   uint32_t tmp;
493   uint32_t low_key_offset = GetFixed32Element(base_ptr, low);
494   s = decoder->NextKeyNoValue(low_key_offset, &low_key, nullptr, &tmp);
495   if (!s.ok()) {
496     return s;
497   }
498 
499   if (GetPrefix(low_key) == prefix) {
500     prefix_matched = true;
501     *offset = low_key_offset;
502   } else if (low + 1 < upper_bound) {
503     // There is possible a next prefix, return it
504     prefix_matched = false;
505     *offset = GetFixed32Element(base_ptr, low + 1);
506   } else {
507     // target is larger than a key of the last prefix in this bucket
508     // but with a different prefix. Key does not exist.
509     *offset = file_info_.data_end_offset;
510   }
511   return Status::OK();
512 }
513 
MatchBloom(uint32_t hash) const514 bool PlainTableReader::MatchBloom(uint32_t hash) const {
515   if (!enable_bloom_) {
516     return true;
517   }
518 
519   if (bloom_.MayContainHash(hash)) {
520     PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
521     return true;
522   } else {
523     PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
524     return false;
525   }
526 }
527 
Next(PlainTableKeyDecoder * decoder,uint32_t * offset,ParsedInternalKey * parsed_key,Slice * internal_key,Slice * value,bool * seekable) const528 Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
529                               ParsedInternalKey* parsed_key,
530                               Slice* internal_key, Slice* value,
531                               bool* seekable) const {
532   if (*offset == file_info_.data_end_offset) {
533     *offset = file_info_.data_end_offset;
534     return Status::OK();
535   }
536 
537   if (*offset > file_info_.data_end_offset) {
538     return Status::Corruption("Offset is out of file size");
539   }
540 
541   uint32_t bytes_read;
542   Status s = decoder->NextKey(*offset, parsed_key, internal_key, value,
543                               &bytes_read, seekable);
544   if (!s.ok()) {
545     return s;
546   }
547   *offset = *offset + bytes_read;
548   return Status::OK();
549 }
550 
Prepare(const Slice & target)551 void PlainTableReader::Prepare(const Slice& target) {
552   if (enable_bloom_) {
553     uint32_t prefix_hash = GetSliceHash(GetPrefix(target));
554     bloom_.Prefetch(prefix_hash);
555   }
556 }
557 
Get(const ReadOptions &,const Slice & target,GetContext * get_context,const SliceTransform *,bool)558 Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target,
559                              GetContext* get_context,
560                              const SliceTransform* /* prefix_extractor */,
561                              bool /*skip_filters*/) {
562   // Check bloom filter first.
563   Slice prefix_slice;
564   uint32_t prefix_hash;
565   if (IsTotalOrderMode()) {
566     if (full_scan_mode_) {
567       status_ =
568           Status::InvalidArgument("Get() is not allowed in full scan mode.");
569     }
570     // Match whole user key for bloom filter check.
571     if (!MatchBloom(GetSliceHash(GetUserKey(target)))) {
572       return Status::OK();
573     }
574     // in total order mode, there is only one bucket 0, and we always use empty
575     // prefix.
576     prefix_slice = Slice();
577     prefix_hash = 0;
578   } else {
579     prefix_slice = GetPrefix(target);
580     prefix_hash = GetSliceHash(prefix_slice);
581     if (!MatchBloom(prefix_hash)) {
582       return Status::OK();
583     }
584   }
585   uint32_t offset;
586   bool prefix_match;
587   PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_,
588                                prefix_extractor_);
589   Status s = GetOffset(&decoder, target, prefix_slice, prefix_hash,
590                        prefix_match, &offset);
591 
592   if (!s.ok()) {
593     return s;
594   }
595   ParsedInternalKey found_key;
596   ParsedInternalKey parsed_target;
597   s = ParseInternalKey(target, &parsed_target,
598                        false /* log_err_key */);  // TODO
599   if (!s.ok()) return s;
600 
601   Slice found_value;
602   while (offset < file_info_.data_end_offset) {
603     s = Next(&decoder, &offset, &found_key, nullptr, &found_value);
604     if (!s.ok()) {
605       return s;
606     }
607     if (!prefix_match) {
608       // Need to verify prefix for the first key found if it is not yet
609       // checked.
610       if (GetPrefix(found_key) != prefix_slice) {
611         return Status::OK();
612       }
613       prefix_match = true;
614     }
615     // TODO(ljin): since we know the key comparison result here,
616     // can we enable the fast path?
617     if (internal_comparator_.Compare(found_key, parsed_target) >= 0) {
618       bool dont_care __attribute__((__unused__));
619       if (!get_context->SaveValue(found_key, found_value, &dont_care,
620                                   dummy_cleanable_.get())) {
621         break;
622       }
623     }
624   }
625   return Status::OK();
626 }
627 
ApproximateOffsetOf(const Slice &,TableReaderCaller)628 uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/,
629                                                TableReaderCaller /*caller*/) {
630   return 0;
631 }
632 
ApproximateSize(const Slice &,const Slice &,TableReaderCaller)633 uint64_t PlainTableReader::ApproximateSize(const Slice& /*start*/,
634                                            const Slice& /*end*/,
635                                            TableReaderCaller /*caller*/) {
636   return 0;
637 }
638 
PlainTableIterator(PlainTableReader * table,bool use_prefix_seek)639 PlainTableIterator::PlainTableIterator(PlainTableReader* table,
640                                        bool use_prefix_seek)
641     : table_(table),
642       decoder_(&table_->file_info_, table_->encoding_type_,
643                table_->user_key_len_, table_->prefix_extractor_),
644       use_prefix_seek_(use_prefix_seek) {
645   next_offset_ = offset_ = table_->file_info_.data_end_offset;
646 }
647 
~PlainTableIterator()648 PlainTableIterator::~PlainTableIterator() {
649 }
650 
Valid() const651 bool PlainTableIterator::Valid() const {
652   return offset_ < table_->file_info_.data_end_offset &&
653          offset_ >= table_->data_start_offset_;
654 }
655 
SeekToFirst()656 void PlainTableIterator::SeekToFirst() {
657   status_ = Status::OK();
658   next_offset_ = table_->data_start_offset_;
659   if (next_offset_ >= table_->file_info_.data_end_offset) {
660     next_offset_ = offset_ = table_->file_info_.data_end_offset;
661   } else {
662     Next();
663   }
664 }
665 
SeekToLast()666 void PlainTableIterator::SeekToLast() {
667   assert(false);
668   status_ = Status::NotSupported("SeekToLast() is not supported in PlainTable");
669   next_offset_ = offset_ = table_->file_info_.data_end_offset;
670 }
671 
Seek(const Slice & target)672 void PlainTableIterator::Seek(const Slice& target) {
673   if (use_prefix_seek_ != !table_->IsTotalOrderMode()) {
674     // This check is done here instead of NewIterator() to permit creating an
675     // iterator with total_order_seek = true even if we won't be able to Seek()
676     // it. This is needed for compaction: it creates iterator with
677     // total_order_seek = true but usually never does Seek() on it,
678     // only SeekToFirst().
679     status_ =
680         Status::InvalidArgument(
681           "total_order_seek not implemented for PlainTable.");
682     offset_ = next_offset_ = table_->file_info_.data_end_offset;
683     return;
684   }
685 
686   // If the user doesn't set prefix seek option and we are not able to do a
687   // total Seek(). assert failure.
688   if (table_->IsTotalOrderMode()) {
689     if (table_->full_scan_mode_) {
690       status_ =
691           Status::InvalidArgument("Seek() is not allowed in full scan mode.");
692       offset_ = next_offset_ = table_->file_info_.data_end_offset;
693       return;
694     } else if (table_->GetIndexSize() > 1) {
695       assert(false);
696       status_ = Status::NotSupported(
697           "PlainTable cannot issue non-prefix seek unless in total order "
698           "mode.");
699       offset_ = next_offset_ = table_->file_info_.data_end_offset;
700       return;
701     }
702   }
703 
704   Slice prefix_slice = table_->GetPrefix(target);
705   uint32_t prefix_hash = 0;
706   // Bloom filter is ignored in total-order mode.
707   if (!table_->IsTotalOrderMode()) {
708     prefix_hash = GetSliceHash(prefix_slice);
709     if (!table_->MatchBloom(prefix_hash)) {
710       status_ = Status::OK();
711       offset_ = next_offset_ = table_->file_info_.data_end_offset;
712       return;
713     }
714   }
715   bool prefix_match;
716   status_ = table_->GetOffset(&decoder_, target, prefix_slice, prefix_hash,
717                               prefix_match, &next_offset_);
718   if (!status_.ok()) {
719     offset_ = next_offset_ = table_->file_info_.data_end_offset;
720     return;
721   }
722 
723   if (next_offset_ < table_->file_info_.data_end_offset) {
724     for (Next(); status_.ok() && Valid(); Next()) {
725       if (!prefix_match) {
726         // Need to verify the first key's prefix
727         if (table_->GetPrefix(key()) != prefix_slice) {
728           offset_ = next_offset_ = table_->file_info_.data_end_offset;
729           break;
730         }
731         prefix_match = true;
732       }
733       if (table_->internal_comparator_.Compare(key(), target) >= 0) {
734         break;
735       }
736     }
737   } else {
738     offset_ = table_->file_info_.data_end_offset;
739   }
740 }
741 
SeekForPrev(const Slice &)742 void PlainTableIterator::SeekForPrev(const Slice& /*target*/) {
743   assert(false);
744   status_ =
745       Status::NotSupported("SeekForPrev() is not supported in PlainTable");
746   offset_ = next_offset_ = table_->file_info_.data_end_offset;
747 }
748 
Next()749 void PlainTableIterator::Next() {
750   offset_ = next_offset_;
751   if (offset_ < table_->file_info_.data_end_offset) {
752     Slice tmp_slice;
753     ParsedInternalKey parsed_key;
754     status_ =
755         table_->Next(&decoder_, &next_offset_, &parsed_key, &key_, &value_);
756     if (!status_.ok()) {
757       offset_ = next_offset_ = table_->file_info_.data_end_offset;
758     }
759   }
760 }
761 
Prev()762 void PlainTableIterator::Prev() {
763   assert(false);
764 }
765 
key() const766 Slice PlainTableIterator::key() const {
767   assert(Valid());
768   return key_;
769 }
770 
value() const771 Slice PlainTableIterator::value() const {
772   assert(Valid());
773   return value_;
774 }
775 
status() const776 Status PlainTableIterator::status() const {
777   return status_;
778 }
779 
780 }  // namespace ROCKSDB_NAMESPACE
781 #endif  // ROCKSDB_LITE
782