1 //===--- PostingList.cpp - Symbol identifiers storage interface -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "PostingList.h"
10 #include "Iterator.h"
11 #include "Token.h"
12 #include "llvm/ADT/STLExtras.h"
13 #include "llvm/Support/Error.h"
14 #include "llvm/Support/MathExtras.h"
15 
16 namespace clang {
17 namespace clangd {
18 namespace dex {
19 namespace {
20 
21 /// Implements iterator of PostingList chunks. This requires iterating over two
22 /// levels: the first level iterator iterates over the chunks and decompresses
23 /// them on-the-fly when the contents of chunk are to be seen.
24 class ChunkIterator : public Iterator {
25 public:
ChunkIterator(const Token * Tok,llvm::ArrayRef<Chunk> Chunks)26   explicit ChunkIterator(const Token *Tok, llvm::ArrayRef<Chunk> Chunks)
27       : Tok(Tok), Chunks(Chunks), CurrentChunk(Chunks.begin()) {
28     if (!Chunks.empty()) {
29       DecompressedChunk = CurrentChunk->decompress();
30       CurrentID = DecompressedChunk.begin();
31     }
32   }
33 
reachedEnd() const34   bool reachedEnd() const override { return CurrentChunk == Chunks.end(); }
35 
36   /// Advances cursor to the next item.
advance()37   void advance() override {
38     assert(!reachedEnd() &&
39            "Posting List iterator can't advance() at the end.");
40     ++CurrentID;
41     normalizeCursor();
42   }
43 
44   /// Applies binary search to advance cursor to the next item with DocID
45   /// equal or higher than the given one.
advanceTo(DocID ID)46   void advanceTo(DocID ID) override {
47     assert(!reachedEnd() &&
48            "Posting List iterator can't advance() at the end.");
49     if (ID <= peek())
50       return;
51     advanceToChunk(ID);
52     // Try to find ID within current chunk.
53     CurrentID = std::partition_point(CurrentID, DecompressedChunk.end(),
54                                      [&](const DocID D) { return D < ID; });
55     normalizeCursor();
56   }
57 
peek() const58   DocID peek() const override {
59     assert(!reachedEnd() && "Posting List iterator can't peek() at the end.");
60     return *CurrentID;
61   }
62 
consume()63   float consume() override {
64     assert(!reachedEnd() &&
65            "Posting List iterator can't consume() at the end.");
66     return 1;
67   }
68 
estimateSize() const69   size_t estimateSize() const override {
70     return Chunks.size() * ApproxEntriesPerChunk;
71   }
72 
73 private:
dump(llvm::raw_ostream & OS) const74   llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
75     if (Tok != nullptr)
76       return OS << *Tok;
77     OS << '[';
78     const char *Sep = "";
79     for (const Chunk &C : Chunks)
80       for (const DocID Doc : C.decompress()) {
81         OS << Sep << Doc;
82         Sep = " ";
83       }
84     return OS << ']';
85   }
86 
87   /// If the cursor is at the end of a chunk, place it at the start of the next
88   /// chunk.
normalizeCursor()89   void normalizeCursor() {
90     // Invariant is already established if examined chunk is not exhausted.
91     if (CurrentID != std::end(DecompressedChunk))
92       return;
93     // Advance to next chunk if current one is exhausted.
94     ++CurrentChunk;
95     if (CurrentChunk == Chunks.end()) // Reached the end of PostingList.
96       return;
97     DecompressedChunk = CurrentChunk->decompress();
98     CurrentID = DecompressedChunk.begin();
99   }
100 
101   /// Advances CurrentChunk to the chunk which might contain ID.
advanceToChunk(DocID ID)102   void advanceToChunk(DocID ID) {
103     if ((CurrentChunk != Chunks.end() - 1) &&
104         ((CurrentChunk + 1)->Head <= ID)) {
105       CurrentChunk =
106           std::partition_point(CurrentChunk + 1, Chunks.end(),
107                                [&](const Chunk &C) { return C.Head < ID; });
108       --CurrentChunk;
109       DecompressedChunk = CurrentChunk->decompress();
110       CurrentID = DecompressedChunk.begin();
111     }
112   }
113 
114   const Token *Tok;
115   llvm::ArrayRef<Chunk> Chunks;
116   /// Iterator over chunks.
117   /// If CurrentChunk is valid, then DecompressedChunk is
118   /// CurrentChunk->decompress() and CurrentID is a valid (non-end) iterator
119   /// into it.
120   decltype(Chunks)::const_iterator CurrentChunk;
121   llvm::SmallVector<DocID, Chunk::PayloadSize + 1> DecompressedChunk;
122   /// Iterator over DecompressedChunk.
123   decltype(DecompressedChunk)::iterator CurrentID;
124 
125   static constexpr size_t ApproxEntriesPerChunk = 15;
126 };
127 
128 static constexpr size_t BitsPerEncodingByte = 7;
129 
130 /// Writes a variable length DocID into the buffer and updates the buffer size.
131 /// If it doesn't fit, returns false and doesn't write to the buffer.
encodeVByte(DocID Delta,llvm::MutableArrayRef<uint8_t> & Payload)132 bool encodeVByte(DocID Delta, llvm::MutableArrayRef<uint8_t> &Payload) {
133   assert(Delta != 0 && "0 is not a valid PostingList delta.");
134   // Calculate number of bytes Delta encoding would take by examining the
135   // meaningful bits.
136   unsigned Width = 1 + llvm::findLastSet(Delta) / BitsPerEncodingByte;
137   if (Width > Payload.size())
138     return false;
139 
140   do {
141     uint8_t Encoding = Delta & 0x7f;
142     Delta >>= 7;
143     Payload.front() = Delta ? Encoding | 0x80 : Encoding;
144     Payload = Payload.drop_front();
145   } while (Delta != 0);
146   return true;
147 }
148 
149 /// Use Variable-length Byte (VByte) delta encoding to compress sorted list of
150 /// DocIDs. The compression stores deltas (differences) between subsequent
151 /// DocIDs and encodes these deltas utilizing the least possible number of
152 /// bytes.
153 ///
154 /// Each encoding byte consists of two parts: the first bit (continuation bit)
155 /// indicates whether this is the last byte (0 if this byte is the last) of
156 /// current encoding and seven bytes a piece of DocID (payload). DocID contains
157 /// 32 bits and therefore it takes up to 5 bytes to encode it (4 full 7-bit
158 /// payloads and one 4-bit payload), but in practice it is expected that gaps
159 /// (deltas) between subsequent DocIDs are not large enough to require 5 bytes.
160 /// In very dense posting lists (with average gaps less than 128) this
161 /// representation would be 4 times more efficient than raw DocID array.
162 ///
163 /// PostingList encoding example:
164 ///
165 /// DocIDs    42            47        7000
166 /// gaps                    5         6958
167 /// Encoding  (raw number)  00000101  10110110 00101110
encodeStream(llvm::ArrayRef<DocID> Documents)168 std::vector<Chunk> encodeStream(llvm::ArrayRef<DocID> Documents) {
169   assert(!Documents.empty() && "Can't encode empty sequence.");
170   std::vector<Chunk> Result;
171   Result.emplace_back();
172   DocID Last = Result.back().Head = Documents.front();
173   llvm::MutableArrayRef<uint8_t> RemainingPayload = Result.back().Payload;
174   for (DocID Doc : Documents.drop_front()) {
175     if (!encodeVByte(Doc - Last, RemainingPayload)) { // didn't fit, flush chunk
176       Result.emplace_back();
177       Result.back().Head = Doc;
178       RemainingPayload = Result.back().Payload;
179     }
180     Last = Doc;
181   }
182   return std::vector<Chunk>(Result); // no move, shrink-to-fit
183 }
184 
185 /// Reads variable length DocID from the buffer and updates the buffer size. If
186 /// the stream is terminated, return None.
readVByte(llvm::ArrayRef<uint8_t> & Bytes)187 llvm::Optional<DocID> readVByte(llvm::ArrayRef<uint8_t> &Bytes) {
188   if (Bytes.front() == 0 || Bytes.empty())
189     return None;
190   DocID Result = 0;
191   bool HasNextByte = true;
192   for (size_t Length = 0; HasNextByte && !Bytes.empty(); ++Length) {
193     assert(Length <= 5 && "Malformed VByte encoding sequence.");
194     // Write meaningful bits to the correct place in the document decoding.
195     Result |= (Bytes.front() & 0x7f) << (BitsPerEncodingByte * Length);
196     if ((Bytes.front() & 0x80) == 0)
197       HasNextByte = false;
198     Bytes = Bytes.drop_front();
199   }
200   return Result;
201 }
202 
203 } // namespace
204 
decompress() const205 llvm::SmallVector<DocID, Chunk::PayloadSize + 1> Chunk::decompress() const {
206   llvm::SmallVector<DocID, Chunk::PayloadSize + 1> Result{Head};
207   llvm::ArrayRef<uint8_t> Bytes(Payload);
208   DocID Delta;
209   for (DocID Current = Head; !Bytes.empty(); Current += Delta) {
210     auto MaybeDelta = readVByte(Bytes);
211     if (!MaybeDelta)
212       break;
213     Delta = *MaybeDelta;
214     Result.push_back(Current + Delta);
215   }
216   return llvm::SmallVector<DocID, Chunk::PayloadSize + 1>{Result};
217 }
218 
PostingList(llvm::ArrayRef<DocID> Documents)219 PostingList::PostingList(llvm::ArrayRef<DocID> Documents)
220     : Chunks(encodeStream(Documents)) {}
221 
iterator(const Token * Tok) const222 std::unique_ptr<Iterator> PostingList::iterator(const Token *Tok) const {
223   return std::make_unique<ChunkIterator>(Tok, Chunks);
224 }
225 
226 } // namespace dex
227 } // namespace clangd
228 } // namespace clang
229