1 
2 /**
3  *    Copyright (C) 2018-present MongoDB, Inc.
4  *
5  *    This program is free software: you can redistribute it and/or modify
6  *    it under the terms of the Server Side Public License, version 1,
7  *    as published by MongoDB, Inc.
8  *
9  *    This program is distributed in the hope that it will be useful,
10  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *    Server Side Public License for more details.
13  *
14  *    You should have received a copy of the Server Side Public License
15  *    along with this program. If not, see
16  *    <http://www.mongodb.com/licensing/server-side-public-license>.
17  *
18  *    As a special exception, the copyright holders give permission to link the
19  *    code of portions of this program with the OpenSSL library under certain
20  *    conditions as described in each individual source file and distribute
21  *    linked combinations including the program with the OpenSSL library. You
22  *    must comply with the Server Side Public License in all respects for
23  *    all of the code used other than as permitted herein. If you modify file(s)
24  *    with this exception, you may extend this exception to your version of the
25  *    file(s), but you are not obligated to do so. If you do not wish to do so,
26  *    delete this exception statement from your version. If you delete this
27  *    exception statement from all source files in the program, then also delete
28  *    it in the license file.
29  */
30 
31 #pragma once
32 
33 #include <third_party/murmurhash3/MurmurHash3.h>
34 
35 #include <bitset>
36 #include <boost/intrusive_ptr.hpp>
37 
38 #include "mongo/base/static_assert.h"
39 #include "mongo/db/pipeline/value.h"
40 #include "mongo/util/intrusive_counter.h"
41 
42 namespace mongo {
43 /** Helper class to make the position in a document abstract
44  *  Warning: This is NOT guaranteed to be the ordered position.
45  *           eg. the first field may not be at Position(0)
46  */
47 class Position {
48 public:
49     // This represents "not found" similar to std::string::npos
Position()50     Position() : index(static_cast<unsigned>(-1)) {}
found()51     bool found() const {
52         return index != Position().index;
53     }
54 
55     bool operator==(Position rhs) const {
56         return this->index == rhs.index;
57     }
58     bool operator!=(Position rhs) const {
59         return !(*this == rhs);
60     }
61 
62     // For debugging and ASSERT_EQUALS in tests.
63     template <typename OStream>
64     friend OStream& operator<<(OStream& stream, Position p) {
65         return stream << p.index;
66     }
67 
68 private:
Position(size_t i)69     explicit Position(size_t i) : index(i) {}
70     unsigned index;
71     friend class DocumentStorage;
72     friend class DocumentStorageIterator;
73 };
74 
75 #pragma pack(1)
76 /** This is how values are stored in the DocumentStorage buffer
77  *  Internal class. Consumers shouldn't care about this.
78  */
79 class ValueElement {
80     MONGO_DISALLOW_COPYING(ValueElement);
81 
82 public:
83     Value val;
84     Position nextCollision;  // Position of next field with same hashBucket
85     const int nameLen;       // doesn't include '\0'
86     const char _name[1];     // pointer to start of name (use nameSD instead)
87 
next()88     ValueElement* next() {
89         return align(plusBytes(sizeof(ValueElement) + nameLen));
90     }
91 
next()92     const ValueElement* next() const {
93         return align(plusBytes(sizeof(ValueElement) + nameLen));
94     }
95 
nameSD()96     StringData nameSD() const {
97         return StringData(_name, nameLen);
98     }
99 
100 
101     // helpers for doing pointer arithmetic with this class
ptr()102     char* ptr() {
103         return reinterpret_cast<char*>(this);
104     }
ptr()105     const char* ptr() const {
106         return reinterpret_cast<const char*>(this);
107     }
plusBytes(size_t bytes)108     const ValueElement* plusBytes(size_t bytes) const {
109         return reinterpret_cast<const ValueElement*>(ptr() + bytes);
110     }
plusBytes(size_t bytes)111     ValueElement* plusBytes(size_t bytes) {
112         return reinterpret_cast<ValueElement*>(ptr() + bytes);
113     }
114 
115     // Round number or pointer up to N-byte boundary. No change if already aligned.
116     template <typename T>
align(T size)117     static T align(T size) {
118         const intmax_t ALIGNMENT = 8;  // must be power of 2 and <= 16 (malloc alignment)
119         // Can't use c++ cast because of conversion between intmax_t and both ints and pointers
120         return (T)(((intmax_t)(size) + (ALIGNMENT - 1)) & ~(ALIGNMENT - 1));
121     }
122 
123 private:
124     ValueElement();   // this class should never be constructed
125     ~ValueElement();  // or destructed
126 };
127 // Real size is sizeof(ValueElement) + nameLen
128 #pragma pack()
129 MONGO_STATIC_ASSERT(sizeof(ValueElement) == (sizeof(Value) + sizeof(Position) + sizeof(int) + 1));
130 
131 // This is an internal class for Document. See FieldIterator for the public version.
132 class DocumentStorageIterator {
133 public:
134     // DocumentStorage::iterator() and iteratorAll() are easier to use
DocumentStorageIterator(const ValueElement * first,const ValueElement * end,bool includeMissing)135     DocumentStorageIterator(const ValueElement* first, const ValueElement* end, bool includeMissing)
136         : _first(first), _it(first), _end(end), _includeMissing(includeMissing) {
137         if (!_includeMissing)
138             skipMissing();
139     }
140 
atEnd()141     bool atEnd() const {
142         return _it == _end;
143     }
144 
get()145     const ValueElement& get() const {
146         return *_it;
147     }
148 
position()149     Position position() const {
150         return Position(_it->ptr() - _first->ptr());
151     }
152 
advance()153     void advance() {
154         advanceOne();
155         if (!_includeMissing)
156             skipMissing();
157     }
158 
159     const ValueElement* operator->() {
160         return _it;
161     }
162     const ValueElement& operator*() {
163         return *_it;
164     }
165 
166 private:
advanceOne()167     void advanceOne() {
168         _it = _it->next();
169     }
170 
skipMissing()171     void skipMissing() {
172         while (!atEnd() && _it->val.missing()) {
173             advanceOne();
174         }
175     }
176 
177     const ValueElement* _first;
178     const ValueElement* _it;
179     const ValueElement* _end;
180     bool _includeMissing;
181 };
182 
183 /// Storage class used by both Document and MutableDocument
184 class DocumentStorage : public RefCountable {
185 public:
DocumentStorage()186     DocumentStorage()
187         : _buffer(NULL),
188           _bufferEnd(NULL),
189           _usedBytes(0),
190           _numFields(0),
191           _hashTabMask(0),
192           _metaFields(),
193           _textScore(0),
194           _randVal(0) {}
195 
196     ~DocumentStorage();
197 
198     enum MetaType : char {
199         TEXT_SCORE,
200         RAND_VAL,
201         SORT_KEY,
202 
203         NUM_FIELDS
204     };
205 
emptyDoc()206     static const DocumentStorage& emptyDoc() {
207         return kEmptyDoc;
208     }
209 
size()210     size_t size() const {
211         // can't use _numFields because it includes removed Fields
212         size_t count = 0;
213         for (DocumentStorageIterator it = iterator(); !it.atEnd(); it.advance())
214             count++;
215         return count;
216     }
217 
218     /// Returns the position of the next field to be inserted
getNextPosition()219     Position getNextPosition() const {
220         return Position(_usedBytes);
221     }
222 
223     /// Returns the position of the named field (may be missing) or Position()
224     Position findField(StringData name) const;
225 
226     // Document uses these
getField(Position pos)227     const ValueElement& getField(Position pos) const {
228         verify(pos.found());
229         return *(_firstElement->plusBytes(pos.index));
230     }
getField(StringData name)231     Value getField(StringData name) const {
232         Position pos = findField(name);
233         if (!pos.found())
234             return Value();
235         return getField(pos).val;
236     }
237 
238     // MutableDocument uses these
getField(Position pos)239     ValueElement& getField(Position pos) {
240         verify(pos.found());
241         return *(_firstElement->plusBytes(pos.index));
242     }
getField(StringData name)243     Value& getField(StringData name) {
244         Position pos = findField(name);
245         if (!pos.found())
246             return appendField(name);  // TODO: find a way to avoid hashing name twice
247         return getField(pos).val;
248     }
249 
250     /// Adds a new field with missing Value at the end of the document
251     Value& appendField(StringData name);
252 
253     /** Preallocates space for fields. Use this to attempt to prevent buffer growth.
254      *  This is only valid to call before anything is added to the document.
255      */
256     void reserveFields(size_t expectedFields);
257 
258     /// This skips missing values
iterator()259     DocumentStorageIterator iterator() const {
260         return DocumentStorageIterator(_firstElement, end(), false);
261     }
262 
263     /// This includes missing values
iteratorAll()264     DocumentStorageIterator iteratorAll() const {
265         return DocumentStorageIterator(_firstElement, end(), true);
266     }
267 
268     /// Shallow copy of this. Caller owns memory.
269     boost::intrusive_ptr<DocumentStorage> clone() const;
270 
allocatedBytes()271     size_t allocatedBytes() const {
272         return !_buffer ? 0 : (_bufferEnd - _buffer + hashTabBytes());
273     }
274 
275     /**
276      * Copies all metadata from source if it has any.
277      * Note: does not clear metadata from this.
278      */
copyMetaDataFrom(const DocumentStorage & source)279     void copyMetaDataFrom(const DocumentStorage& source) {
280         if (source.hasTextScore()) {
281             setTextScore(source.getTextScore());
282         }
283         if (source.hasRandMetaField()) {
284             setRandMetaField(source.getRandMetaField());
285         }
286         if (source.hasSortKeyMetaField()) {
287             setSortKeyMetaField(source.getSortKeyMetaField());
288         }
289     }
290 
hasTextScore()291     bool hasTextScore() const {
292         return _metaFields.test(MetaType::TEXT_SCORE);
293     }
getTextScore()294     double getTextScore() const {
295         return _textScore;
296     }
setTextScore(double score)297     void setTextScore(double score) {
298         _metaFields.set(MetaType::TEXT_SCORE);
299         _textScore = score;
300     }
301 
hasRandMetaField()302     bool hasRandMetaField() const {
303         return _metaFields.test(MetaType::RAND_VAL);
304     }
getRandMetaField()305     double getRandMetaField() const {
306         return _randVal;
307     }
setRandMetaField(double val)308     void setRandMetaField(double val) {
309         _metaFields.set(MetaType::RAND_VAL);
310         _randVal = val;
311     }
312 
hasSortKeyMetaField()313     bool hasSortKeyMetaField() const {
314         return _metaFields.test(MetaType::SORT_KEY);
315     }
getSortKeyMetaField()316     BSONObj getSortKeyMetaField() const {
317         return _sortKey;
318     }
setSortKeyMetaField(BSONObj sortKey)319     void setSortKeyMetaField(BSONObj sortKey) {
320         _metaFields.set(MetaType::SORT_KEY);
321         _sortKey = sortKey.getOwned();
322     }
323 
324 private:
325     /// Same as lastElement->next() or firstElement() if empty.
end()326     const ValueElement* end() const {
327         return _firstElement ? _firstElement->plusBytes(_usedBytes) : nullptr;
328     }
329 
330     /// Allocates space in _buffer. Copies existing data if there is any.
331     void alloc(unsigned newSize);
332 
333     /// Call after adding field to _buffer and increasing _numFields
334     void addFieldToHashTable(Position pos);
335 
336     // assumes _hashTabMask is (power of two) - 1
hashTabBuckets()337     unsigned hashTabBuckets() const {
338         return _hashTabMask + 1;
339     }
hashTabBytes()340     unsigned hashTabBytes() const {
341         return hashTabBuckets() * sizeof(Position);
342     }
343 
344     /// rehash on buffer growth if load-factor > .5 (attempt to keep lf < 1 when full)
needRehash()345     bool needRehash() const {
346         return _numFields * 2 > hashTabBuckets();
347     }
348 
349     /// Initialize empty hash table
hashTabInit()350     void hashTabInit() {
351         memset(_hashTab, -1, hashTabBytes());
352     }
353 
hashKey(StringData name)354     static unsigned hashKey(StringData name) {
355         // TODO consider FNV-1a once we have a better benchmark corpus
356         unsigned out;
357         MurmurHash3_x86_32(name.rawData(), name.size(), 0, &out);
358         return out;
359     }
360 
bucketForKey(StringData name)361     unsigned bucketForKey(StringData name) const {
362         return hashKey(name) & _hashTabMask;
363     }
364 
365     /// Adds all fields to the hash table
rehash()366     void rehash() {
367         hashTabInit();
368         for (DocumentStorageIterator it = iteratorAll(); !it.atEnd(); it.advance())
369             addFieldToHashTable(it.position());
370     }
371 
372     enum {
373         HASH_TAB_INIT_SIZE = 8,  // must be power of 2
374         HASH_TAB_MIN = 4,        // don't hash fields for docs smaller than this
375                                  // set to 1 to always hash
376     };
377 
378     // _buffer layout:
379     // -------------------------------------------------------------------------------
380     // | ValueElement1 Name1 | ValueElement2 Name2 | ... FREE SPACE ... | Hash Table |
381     // -------------------------------------------------------------------------------
382     //  ^ _buffer and _firstElement point here                           ^
383     //                                _bufferEnd and _hashTab point here ^
384     //
385     //
386     // When the buffer grows, the hash table moves to the new end.
387     union {
388         char* _buffer;
389         ValueElement* _firstElement;
390     };
391 
392     union {
393         // pointer to "end" of _buffer element space and start of hash table (same position)
394         char* _bufferEnd;
395         Position* _hashTab;  // table lazily initialized once _numFields == HASH_TAB_MIN
396     };
397 
398     unsigned _usedBytes;    // position where next field would start
399     unsigned _numFields;    // this includes removed fields
400     unsigned _hashTabMask;  // equal to hashTabBuckets()-1 but used more often
401 
402     std::bitset<MetaType::NUM_FIELDS> _metaFields;
403     double _textScore;
404     double _randVal;
405     BSONObj _sortKey;
406     // When adding a field, make sure to update clone() method
407 
408     // Defined in document.cpp
409     static const DocumentStorage kEmptyDoc;
410 };
411 }
412