1 
2 /**
3  *    Copyright (C) 2018-present MongoDB, Inc.
4  *
5  *    This program is free software: you can redistribute it and/or modify
6  *    it under the terms of the Server Side Public License, version 1,
7  *    as published by MongoDB, Inc.
8  *
9  *    This program is distributed in the hope that it will be useful,
10  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *    Server Side Public License for more details.
13  *
14  *    You should have received a copy of the Server Side Public License
15  *    along with this program. If not, see
16  *    <http://www.mongodb.com/licensing/server-side-public-license>.
17  *
18  *    As a special exception, the copyright holders give permission to link the
19  *    code of portions of this program with the OpenSSL library under certain
20  *    conditions as described in each individual source file and distribute
21  *    linked combinations including the program with the OpenSSL library. You
22  *    must comply with the Server Side Public License in all respects for
23  *    all of the code used other than as permitted herein. If you modify file(s)
24  *    with this exception, you may extend this exception to your version of the
25  *    file(s), but you are not obligated to do so. If you do not wish to do so,
26  *    delete this exception statement from your version. If you delete this
27  *    exception statement from all source files in the program, then also delete
28  *    it in the license file.
29  */
30 
31 #include "mongo/db/index/btree_key_generator.h"
32 
33 #include <boost/optional.hpp>
34 
35 #include "mongo/bson/bsonobjbuilder.h"
36 #include "mongo/db/bson/dotted_path_support.h"
37 #include "mongo/db/field_ref.h"
38 #include "mongo/db/query/collation/collation_index_key.h"
39 #include "mongo/db/query/collation/collator_interface.h"
40 #include "mongo/stdx/memory.h"
41 #include "mongo/util/assert_util.h"
42 #include "mongo/util/mongoutils/str.h"
43 
44 namespace mongo {
45 
46 using IndexVersion = IndexDescriptor::IndexVersion;
47 
48 namespace dps = ::mongo::dotted_path_support;
49 
50 namespace {
51 
52 const BSONObj nullObj = BSON("" << BSONNULL);
53 const BSONElement nullElt = nullObj.firstElement();
54 const BSONObj undefinedObj = BSON("" << BSONUndefined);
55 const BSONElement undefinedElt = undefinedObj.firstElement();
56 
57 }  // namespace
58 
BtreeKeyGenerator(std::vector<const char * > fieldNames,std::vector<BSONElement> fixed,bool isSparse)59 BtreeKeyGenerator::BtreeKeyGenerator(std::vector<const char*> fieldNames,
60                                      std::vector<BSONElement> fixed,
61                                      bool isSparse)
62     : _fieldNames(fieldNames), _isSparse(isSparse), _fixed(fixed) {
63     BSONObjBuilder nullKeyBuilder;
64     for (size_t i = 0; i < fieldNames.size(); ++i) {
65         nullKeyBuilder.appendNull("");
66     }
67     _nullKey = nullKeyBuilder.obj();
68 
69     _isIdIndex = fieldNames.size() == 1 && std::string("_id") == fieldNames[0];
70 }
71 
make(IndexVersion indexVersion,std::vector<const char * > fieldNames,std::vector<BSONElement> fixed,bool isSparse,const CollatorInterface * collator)72 std::unique_ptr<BtreeKeyGenerator> BtreeKeyGenerator::make(IndexVersion indexVersion,
73                                                            std::vector<const char*> fieldNames,
74                                                            std::vector<BSONElement> fixed,
75                                                            bool isSparse,
76                                                            const CollatorInterface* collator) {
77     switch (indexVersion) {
78         case IndexVersion::kV0:
79             return stdx::make_unique<BtreeKeyGeneratorV0>(fieldNames, fixed, isSparse);
80         case IndexVersion::kV1:
81         case IndexVersion::kV2:
82             return stdx::make_unique<BtreeKeyGeneratorV1>(fieldNames, fixed, isSparse, collator);
83     }
84     return nullptr;
85 }
86 
getKeys(const BSONObj & obj,BSONObjSet * keys,MultikeyPaths * multikeyPaths) const87 void BtreeKeyGenerator::getKeys(const BSONObj& obj,
88                                 BSONObjSet* keys,
89                                 MultikeyPaths* multikeyPaths) const {
90     // '_fieldNames' and '_fixed' are passed by value so that they can be mutated as part of the
91     // getKeys call.  :|
92     getKeysImpl(_fieldNames, _fixed, obj, keys, multikeyPaths);
93     if (keys->empty() && !_isSparse) {
94         keys->insert(_nullKey);
95     }
96 }
97 
assertParallelArrays(const char * first,const char * second)98 static void assertParallelArrays(const char* first, const char* second) {
99     std::stringstream ss;
100     ss << "cannot index parallel arrays [" << first << "] [" << second << "]";
101     uasserted(ErrorCodes::CannotIndexParallelArrays, ss.str());
102 }
103 
BtreeKeyGeneratorV0(std::vector<const char * > fieldNames,std::vector<BSONElement> fixed,bool isSparse)104 BtreeKeyGeneratorV0::BtreeKeyGeneratorV0(std::vector<const char*> fieldNames,
105                                          std::vector<BSONElement> fixed,
106                                          bool isSparse)
107     : BtreeKeyGenerator(fieldNames, fixed, isSparse) {}
108 
getKeysImpl(std::vector<const char * > fieldNames,std::vector<BSONElement> fixed,const BSONObj & obj,BSONObjSet * keys,MultikeyPaths * multikeyPaths) const109 void BtreeKeyGeneratorV0::getKeysImpl(std::vector<const char*> fieldNames,
110                                       std::vector<BSONElement> fixed,
111                                       const BSONObj& obj,
112                                       BSONObjSet* keys,
113                                       MultikeyPaths* multikeyPaths) const {
114     if (_isIdIndex) {
115         // we special case for speed
116         BSONElement e = obj["_id"];
117         if (e.eoo()) {
118             keys->insert(_nullKey);
119         } else {
120             int size = e.size() + 5 /* bson over head*/ - 3 /* remove _id string */;
121             BSONObjBuilder b(size);
122             b.appendAs(e, "");
123             keys->insert(b.obj());
124             invariant(keys->begin()->objsize() == size);
125         }
126         return;
127     }
128 
129     BSONElement arrElt;
130     unsigned arrIdx = ~0;
131     unsigned numNotFound = 0;
132 
133     for (unsigned i = 0; i < fieldNames.size(); ++i) {
134         if (*fieldNames[i] == '\0')
135             continue;
136 
137         BSONElement e = dps::extractElementAtPathOrArrayAlongPath(obj, fieldNames[i]);
138 
139         if (e.eoo()) {
140             e = nullElt;  // no matching field
141             numNotFound++;
142         }
143 
144         if (e.type() != Array)
145             fieldNames[i] = "";  // no matching field or non-array match
146 
147         if (*fieldNames[i] == '\0')
148             // no need for further object expansion (though array expansion still possible)
149             fixed[i] = e;
150 
151         if (e.type() == Array && arrElt.eoo()) {
152             // we only expand arrays on a single path -- track the path here
153             arrIdx = i;
154             arrElt = e;
155         }
156 
157         // enforce single array path here
158         if (e.type() == Array && e.rawdata() != arrElt.rawdata()) {
159             assertParallelArrays(e.fieldName(), arrElt.fieldName());
160         }
161     }
162 
163     bool allFound = true;  // have we found elements for all field names in the key spec?
164     for (std::vector<const char*>::const_iterator i = fieldNames.begin(); i != fieldNames.end();
165          ++i) {
166         if (**i != '\0') {
167             allFound = false;
168             break;
169         }
170     }
171 
172     if (_isSparse && numNotFound == _fieldNames.size()) {
173         // we didn't find any fields
174         // so we're not going to index this document
175         return;
176     }
177 
178     bool insertArrayNull = false;
179 
180     if (allFound) {
181         if (arrElt.eoo()) {
182             // no terminal array element to expand
183             BSONObjBuilder b(_sizeTracker);
184             for (std::vector<BSONElement>::iterator i = fixed.begin(); i != fixed.end(); ++i)
185                 b.appendAs(*i, "");
186             keys->insert(b.obj());
187         } else {
188             // terminal array element to expand, so generate all keys
189             BSONObjIterator i(arrElt.embeddedObject());
190             if (i.more()) {
191                 while (i.more()) {
192                     BSONObjBuilder b(_sizeTracker);
193                     for (unsigned j = 0; j < fixed.size(); ++j) {
194                         if (j == arrIdx)
195                             b.appendAs(i.next(), "");
196                         else
197                             b.appendAs(fixed[j], "");
198                     }
199                     keys->insert(b.obj());
200                 }
201             } else if (fixed.size() > 1) {
202                 insertArrayNull = true;
203             }
204         }
205     } else {
206         // nonterminal array element to expand, so recurse
207         verify(!arrElt.eoo());
208         BSONObjIterator i(arrElt.embeddedObject());
209         if (i.more()) {
210             while (i.more()) {
211                 BSONElement e = i.next();
212                 if (e.type() == Object) {
213                     getKeysImpl(fieldNames, fixed, e.embeddedObject(), keys, multikeyPaths);
214                 }
215             }
216         } else {
217             insertArrayNull = true;
218         }
219     }
220 
221     if (insertArrayNull) {
222         // x : [] - need to insert undefined
223         BSONObjBuilder b(_sizeTracker);
224         for (unsigned j = 0; j < fixed.size(); ++j) {
225             if (j == arrIdx) {
226                 b.appendUndefined("");
227             } else {
228                 BSONElement e = fixed[j];
229                 if (e.eoo())
230                     b.appendNull("");
231                 else
232                     b.appendAs(e, "");
233             }
234         }
235         keys->insert(b.obj());
236     }
237 }
238 
BtreeKeyGeneratorV1(std::vector<const char * > fieldNames,std::vector<BSONElement> fixed,bool isSparse,const CollatorInterface * collator)239 BtreeKeyGeneratorV1::BtreeKeyGeneratorV1(std::vector<const char*> fieldNames,
240                                          std::vector<BSONElement> fixed,
241                                          bool isSparse,
242                                          const CollatorInterface* collator)
243     : BtreeKeyGenerator(fieldNames, fixed, isSparse),
244       _emptyPositionalInfo(fieldNames.size()),
245       _collator(collator) {
246     for (const char* fieldName : fieldNames) {
247         size_t pathLength = FieldRef{fieldName}.numParts();
248         invariant(pathLength > 0);
249         _pathLengths.push_back(pathLength);
250     }
251 }
252 
extractNextElement(const BSONObj & obj,const PositionalPathInfo & positionalInfo,const char ** field,bool * arrayNestedArray) const253 BSONElement BtreeKeyGeneratorV1::extractNextElement(const BSONObj& obj,
254                                                     const PositionalPathInfo& positionalInfo,
255                                                     const char** field,
256                                                     bool* arrayNestedArray) const {
257     std::string firstField = mongoutils::str::before(*field, '.');
258     bool haveObjField = !obj.getField(firstField).eoo();
259     BSONElement arrField = positionalInfo.positionallyIndexedElt;
260 
261     // An index component field name cannot exist in both a document
262     // array and one of that array's children.
263     uassert(16746,
264             mongoutils::str::stream()
265                 << "Ambiguous field name found in array (do not use numeric field names in "
266                    "embedded elements in an array), field: '"
267                 << arrField.fieldName()
268                 << "' for array: "
269                 << positionalInfo.arrayObj,
270             !haveObjField || !positionalInfo.hasPositionallyIndexedElt());
271 
272     *arrayNestedArray = false;
273     if (haveObjField) {
274         return dps::extractElementAtPathOrArrayAlongPath(obj, *field);
275     } else if (positionalInfo.hasPositionallyIndexedElt()) {
276         if (arrField.type() == Array) {
277             *arrayNestedArray = true;
278         }
279         *field = positionalInfo.remainingPath;
280         return positionalInfo.dottedElt;
281     }
282     return BSONElement();
283 }
284 
_getKeysArrEltFixed(std::vector<const char * > * fieldNames,std::vector<BSONElement> * fixed,const BSONElement & arrEntry,BSONObjSet * keys,unsigned numNotFound,const BSONElement & arrObjElt,const std::set<size_t> & arrIdxs,bool mayExpandArrayUnembedded,const std::vector<PositionalPathInfo> & positionalInfo,MultikeyPaths * multikeyPaths) const285 void BtreeKeyGeneratorV1::_getKeysArrEltFixed(std::vector<const char*>* fieldNames,
286                                               std::vector<BSONElement>* fixed,
287                                               const BSONElement& arrEntry,
288                                               BSONObjSet* keys,
289                                               unsigned numNotFound,
290                                               const BSONElement& arrObjElt,
291                                               const std::set<size_t>& arrIdxs,
292                                               bool mayExpandArrayUnembedded,
293                                               const std::vector<PositionalPathInfo>& positionalInfo,
294                                               MultikeyPaths* multikeyPaths) const {
295     // Set up any terminal array values.
296     for (const auto idx : arrIdxs) {
297         if (*(*fieldNames)[idx] == '\0') {
298             (*fixed)[idx] = mayExpandArrayUnembedded ? arrEntry : arrObjElt;
299         }
300     }
301 
302     // Recurse.
303     getKeysImplWithArray(*fieldNames,
304                          *fixed,
305                          arrEntry.type() == Object ? arrEntry.embeddedObject() : BSONObj(),
306                          keys,
307                          numNotFound,
308                          positionalInfo,
309                          multikeyPaths);
310 }
311 
getKeysImpl(std::vector<const char * > fieldNames,std::vector<BSONElement> fixed,const BSONObj & obj,BSONObjSet * keys,MultikeyPaths * multikeyPaths) const312 void BtreeKeyGeneratorV1::getKeysImpl(std::vector<const char*> fieldNames,
313                                       std::vector<BSONElement> fixed,
314                                       const BSONObj& obj,
315                                       BSONObjSet* keys,
316                                       MultikeyPaths* multikeyPaths) const {
317     if (_isIdIndex) {
318         // we special case for speed
319         BSONElement e = obj["_id"];
320         if (e.eoo()) {
321             keys->insert(_nullKey);
322         } else if (_collator) {
323             BSONObjBuilder b;
324             CollationIndexKey::collationAwareIndexKeyAppend(e, _collator, &b);
325 
326             // Insert a copy so its buffer size fits the object size.
327             keys->insert(b.obj().copy());
328         } else {
329             int size = e.size() + 5 /* bson over head*/ - 3 /* remove _id string */;
330             BSONObjBuilder b(size);
331             b.appendAs(e, "");
332             keys->insert(b.obj());
333             invariant(keys->begin()->objsize() == size);
334         }
335 
336         // The {_id: 1} index can never be multikey because the _id field isn't allowed to be an
337         // array value. We therefore always set 'multikeyPaths' as [ [ ] ].
338         if (multikeyPaths) {
339             multikeyPaths->resize(1);
340         }
341         return;
342     }
343 
344     if (multikeyPaths) {
345         invariant(multikeyPaths->empty());
346         multikeyPaths->resize(fieldNames.size());
347     }
348     getKeysImplWithArray(
349         std::move(fieldNames), std::move(fixed), obj, keys, 0, _emptyPositionalInfo, multikeyPaths);
350 }
351 
getKeysImplWithArray(std::vector<const char * > fieldNames,std::vector<BSONElement> fixed,const BSONObj & obj,BSONObjSet * keys,unsigned numNotFound,const std::vector<PositionalPathInfo> & positionalInfo,MultikeyPaths * multikeyPaths) const352 void BtreeKeyGeneratorV1::getKeysImplWithArray(
353     std::vector<const char*> fieldNames,
354     std::vector<BSONElement> fixed,
355     const BSONObj& obj,
356     BSONObjSet* keys,
357     unsigned numNotFound,
358     const std::vector<PositionalPathInfo>& positionalInfo,
359     MultikeyPaths* multikeyPaths) const {
360     BSONElement arrElt;
361 
362     // A set containing the position of any indexed fields in the key pattern that traverse through
363     // the 'arrElt' array value.
364     std::set<size_t> arrIdxs;
365 
366     // A vector with size equal to the number of elements in the index key pattern. Each element in
367     // the vector, if initialized, refers to the component within the indexed field that traverses
368     // through the 'arrElt' array value. We say that this component within the indexed field
369     // corresponds to a path that causes the index to be multikey if the 'arrElt' array value
370     // contains multiple elements.
371     //
372     // For example, consider the index {'a.b': 1, 'a.c'} and the document
373     // {a: [{b: 1, c: 'x'}, {b: 2, c: 'y'}]}. The path "a" causes the index to be multikey, so we'd
374     // have a std::vector<boost::optional<size_t>>{{0U}, {0U}}.
375     //
376     // Furthermore, due to how positional key patterns are specified, it's possible for an indexed
377     // field to cause the index to be multikey at a different component than another indexed field
378     // that also traverses through the 'arrElt' array value. It's then also possible for an indexed
379     // field not to cause the index to be multikey, even if it traverses through the 'arrElt' array
380     // value, because only a particular element would be indexed.
381     //
382     // For example, consider the index {'a.b': 1, 'a.b.0'} and the document {a: {b: [1, 2]}}. The
383     // path "a.b" causes the index to be multikey, but the key pattern "a.b.0" only indexes the
384     // first element of the array, so we'd have a
385     // std::vector<boost::optional<size_t>>{{1U}, boost::none}.
386     std::vector<boost::optional<size_t>> arrComponents(fieldNames.size());
387 
388     bool mayExpandArrayUnembedded = true;
389     for (size_t i = 0; i < fieldNames.size(); ++i) {
390         if (*fieldNames[i] == '\0') {
391             continue;
392         }
393 
394         bool arrayNestedArray;
395         // Extract element matching fieldName[ i ] from object xor array.
396         BSONElement e =
397             extractNextElement(obj, positionalInfo[i], &fieldNames[i], &arrayNestedArray);
398 
399         if (e.eoo()) {
400             // if field not present, set to null
401             fixed[i] = nullElt;
402             // done expanding this field name
403             fieldNames[i] = "";
404             numNotFound++;
405         } else if (e.type() == Array) {
406             arrIdxs.insert(i);
407             if (arrElt.eoo()) {
408                 // we only expand arrays on a single path -- track the path here
409                 arrElt = e;
410             } else if (e.rawdata() != arrElt.rawdata()) {
411                 // enforce single array path here
412                 assertParallelArrays(e.fieldName(), arrElt.fieldName());
413             }
414             if (arrayNestedArray) {
415                 mayExpandArrayUnembedded = false;
416             }
417         } else {
418             // not an array - no need for further expansion
419             fixed[i] = e;
420         }
421     }
422 
423     if (arrElt.eoo()) {
424         // No array, so generate a single key.
425         if (_isSparse && numNotFound == fieldNames.size()) {
426             return;
427         }
428         BSONObjBuilder b(_sizeTracker);
429         for (std::vector<BSONElement>::iterator i = fixed.begin(); i != fixed.end(); ++i) {
430             CollationIndexKey::collationAwareIndexKeyAppend(*i, _collator, &b);
431         }
432         keys->insert(b.obj());
433     } else if (arrElt.embeddedObject().firstElement().eoo()) {
434         // We've encountered an empty array.
435         if (multikeyPaths && mayExpandArrayUnembedded) {
436             // Any indexed path which traverses through the empty array must be recorded as an array
437             // component.
438             for (auto i : arrIdxs) {
439                 // We need to determine which component of the indexed field causes the index to be
440                 // multikey as a result of the empty array. Indexed empty arrays are considered
441                 // multikey and may occur mid-path. For instance, the indexed path "a.b.c" has
442                 // multikey components {0, 1} given the document {a: [{b: []}, {b: 1}]}.
443                 size_t fullPathLength = _pathLengths[i];
444                 size_t suffixPathLength = FieldRef{fieldNames[i]}.numParts();
445                 invariant(suffixPathLength < fullPathLength);
446                 arrComponents[i] = fullPathLength - suffixPathLength - 1;
447             }
448         }
449 
450         // For an empty array, set matching fields to undefined.
451         _getKeysArrEltFixed(&fieldNames,
452                             &fixed,
453                             undefinedElt,
454                             keys,
455                             numNotFound,
456                             arrElt,
457                             arrIdxs,
458                             true,
459                             _emptyPositionalInfo,
460                             multikeyPaths);
461     } else {
462         BSONObj arrObj = arrElt.embeddedObject();
463 
464         // For positional key patterns, e.g. {'a.1.b': 1}, we lookup the indexed array element
465         // and then traverse the remainder of the field path up front. This prevents us from
466         // having to look up the indexed element again on each recursive call (i.e. once per
467         // array element).
468         std::vector<PositionalPathInfo> subPositionalInfo(fixed.size());
469         for (size_t i = 0; i < fieldNames.size(); ++i) {
470             const bool fieldIsArray = arrIdxs.find(i) != arrIdxs.end();
471 
472             if (*fieldNames[i] == '\0') {
473                 // We've reached the end of the path.
474                 if (multikeyPaths && fieldIsArray && mayExpandArrayUnembedded) {
475                     // The 'arrElt' array value isn't expanded into multiple elements when the last
476                     // component of the indexed field is positional and 'arrElt' contains nested
477                     // array values. In all other cases, the 'arrElt' array value may be expanded
478                     // into multiple element and can therefore cause the index to be multikey.
479                     arrComponents[i] = _pathLengths[i] - 1;
480                 }
481                 continue;
482             }
483 
484             // The earlier call to dps::extractElementAtPathOrArrayAlongPath(..., fieldNames[i])
485             // modified fieldNames[i] to refer to the suffix of the path immediately following the
486             // 'arrElt' array value. If we haven't reached the end of this indexed field yet, then
487             // we must have traversed through 'arrElt'.
488             invariant(fieldIsArray);
489 
490             StringData part = fieldNames[i];
491             part = part.substr(0, part.find('.'));
492             subPositionalInfo[i].positionallyIndexedElt = arrObj[part];
493             if (subPositionalInfo[i].positionallyIndexedElt.eoo()) {
494                 // We aren't indexing a particular element of the 'arrElt' array value, so it may be
495                 // expanded into multiple elements. It can therefore cause the index to be multikey.
496                 if (multikeyPaths) {
497                     // We need to determine which component of the indexed field causes the index to
498                     // be multikey as a result of the 'arrElt' array value. Since
499                     //
500                     //   NumComponents("<pathPrefix>") + NumComponents("<pathSuffix>")
501                     //       = NumComponents("<pathPrefix>.<pathSuffix>"),
502                     //
503                     // we can compute the number of components in a prefix of the indexed field by
504                     // subtracting the number of components in the suffix 'fieldNames[i]' from the
505                     // number of components in the indexed field '_fieldNames[i]'.
506                     //
507                     // For example, consider the indexed field "a.b.c" and the suffix "c". The path
508                     // "a.b.c" has 3 components and the suffix "c" has 1 component. Subtracting the
509                     // latter from the former yields the number of components in the prefix "a.b",
510                     // i.e. 2.
511                     size_t fullPathLength = _pathLengths[i];
512                     size_t suffixPathLength = FieldRef{fieldNames[i]}.numParts();
513                     invariant(suffixPathLength < fullPathLength);
514                     arrComponents[i] = fullPathLength - suffixPathLength - 1;
515                 }
516                 continue;
517             }
518 
519             // We're indexing an array element by its position. Traverse the remainder of the
520             // field path now.
521             //
522             // Indexing an array element by its position selects a particular element of the
523             // 'arrElt' array value when generating keys. It therefore cannot cause the index to be
524             // multikey.
525             subPositionalInfo[i].arrayObj = arrObj;
526             subPositionalInfo[i].remainingPath = fieldNames[i];
527             subPositionalInfo[i].dottedElt = dps::extractElementAtPathOrArrayAlongPath(
528                 arrObj, subPositionalInfo[i].remainingPath);
529         }
530 
531         // Generate a key for each element of the indexed array.
532         for (const auto arrObjElem : arrObj) {
533             _getKeysArrEltFixed(&fieldNames,
534                                 &fixed,
535                                 arrObjElem,
536                                 keys,
537                                 numNotFound,
538                                 arrElt,
539                                 arrIdxs,
540                                 mayExpandArrayUnembedded,
541                                 subPositionalInfo,
542                                 multikeyPaths);
543         }
544     }
545 
546     // Record multikey path components.
547     if (multikeyPaths) {
548         for (size_t i = 0; i < arrComponents.size(); ++i) {
549             if (auto arrComponent = arrComponents[i]) {
550                 (*multikeyPaths)[i].insert(*arrComponent);
551             }
552         }
553     }
554 }
555 
556 }  // namespace mongo
557