1
2 /**
3 * Copyright (C) 2018-present MongoDB, Inc.
4 *
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the Server Side Public License, version 1,
7 * as published by MongoDB, Inc.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * Server Side Public License for more details.
13 *
14 * You should have received a copy of the Server Side Public License
15 * along with this program. If not, see
16 * <http://www.mongodb.com/licensing/server-side-public-license>.
17 *
18 * As a special exception, the copyright holders give permission to link the
19 * code of portions of this program with the OpenSSL library under certain
20 * conditions as described in each individual source file and distribute
21 * linked combinations including the program with the OpenSSL library. You
22 * must comply with the Server Side Public License in all respects for
23 * all of the code used other than as permitted herein. If you modify file(s)
24 * with this exception, you may extend this exception to your version of the
25 * file(s), but you are not obligated to do so. If you do not wish to do so,
26 * delete this exception statement from your version. If you delete this
27 * exception statement from all source files in the program, then also delete
28 * it in the license file.
29 */
30
31 #include "mongo/db/index/btree_key_generator.h"
32
33 #include <boost/optional.hpp>
34
35 #include "mongo/bson/bsonobjbuilder.h"
36 #include "mongo/db/bson/dotted_path_support.h"
37 #include "mongo/db/field_ref.h"
38 #include "mongo/db/query/collation/collation_index_key.h"
39 #include "mongo/db/query/collation/collator_interface.h"
40 #include "mongo/stdx/memory.h"
41 #include "mongo/util/assert_util.h"
42 #include "mongo/util/mongoutils/str.h"
43
44 namespace mongo {
45
46 using IndexVersion = IndexDescriptor::IndexVersion;
47
48 namespace dps = ::mongo::dotted_path_support;
49
50 namespace {
51
52 const BSONObj nullObj = BSON("" << BSONNULL);
53 const BSONElement nullElt = nullObj.firstElement();
54 const BSONObj undefinedObj = BSON("" << BSONUndefined);
55 const BSONElement undefinedElt = undefinedObj.firstElement();
56
57 } // namespace
58
BtreeKeyGenerator(std::vector<const char * > fieldNames,std::vector<BSONElement> fixed,bool isSparse)59 BtreeKeyGenerator::BtreeKeyGenerator(std::vector<const char*> fieldNames,
60 std::vector<BSONElement> fixed,
61 bool isSparse)
62 : _fieldNames(fieldNames), _isSparse(isSparse), _fixed(fixed) {
63 BSONObjBuilder nullKeyBuilder;
64 for (size_t i = 0; i < fieldNames.size(); ++i) {
65 nullKeyBuilder.appendNull("");
66 }
67 _nullKey = nullKeyBuilder.obj();
68
69 _isIdIndex = fieldNames.size() == 1 && std::string("_id") == fieldNames[0];
70 }
71
make(IndexVersion indexVersion,std::vector<const char * > fieldNames,std::vector<BSONElement> fixed,bool isSparse,const CollatorInterface * collator)72 std::unique_ptr<BtreeKeyGenerator> BtreeKeyGenerator::make(IndexVersion indexVersion,
73 std::vector<const char*> fieldNames,
74 std::vector<BSONElement> fixed,
75 bool isSparse,
76 const CollatorInterface* collator) {
77 switch (indexVersion) {
78 case IndexVersion::kV0:
79 return stdx::make_unique<BtreeKeyGeneratorV0>(fieldNames, fixed, isSparse);
80 case IndexVersion::kV1:
81 case IndexVersion::kV2:
82 return stdx::make_unique<BtreeKeyGeneratorV1>(fieldNames, fixed, isSparse, collator);
83 }
84 return nullptr;
85 }
86
getKeys(const BSONObj & obj,BSONObjSet * keys,MultikeyPaths * multikeyPaths) const87 void BtreeKeyGenerator::getKeys(const BSONObj& obj,
88 BSONObjSet* keys,
89 MultikeyPaths* multikeyPaths) const {
90 // '_fieldNames' and '_fixed' are passed by value so that they can be mutated as part of the
91 // getKeys call. :|
92 getKeysImpl(_fieldNames, _fixed, obj, keys, multikeyPaths);
93 if (keys->empty() && !_isSparse) {
94 keys->insert(_nullKey);
95 }
96 }
97
assertParallelArrays(const char * first,const char * second)98 static void assertParallelArrays(const char* first, const char* second) {
99 std::stringstream ss;
100 ss << "cannot index parallel arrays [" << first << "] [" << second << "]";
101 uasserted(ErrorCodes::CannotIndexParallelArrays, ss.str());
102 }
103
BtreeKeyGeneratorV0(std::vector<const char * > fieldNames,std::vector<BSONElement> fixed,bool isSparse)104 BtreeKeyGeneratorV0::BtreeKeyGeneratorV0(std::vector<const char*> fieldNames,
105 std::vector<BSONElement> fixed,
106 bool isSparse)
107 : BtreeKeyGenerator(fieldNames, fixed, isSparse) {}
108
getKeysImpl(std::vector<const char * > fieldNames,std::vector<BSONElement> fixed,const BSONObj & obj,BSONObjSet * keys,MultikeyPaths * multikeyPaths) const109 void BtreeKeyGeneratorV0::getKeysImpl(std::vector<const char*> fieldNames,
110 std::vector<BSONElement> fixed,
111 const BSONObj& obj,
112 BSONObjSet* keys,
113 MultikeyPaths* multikeyPaths) const {
114 if (_isIdIndex) {
115 // we special case for speed
116 BSONElement e = obj["_id"];
117 if (e.eoo()) {
118 keys->insert(_nullKey);
119 } else {
120 int size = e.size() + 5 /* bson over head*/ - 3 /* remove _id string */;
121 BSONObjBuilder b(size);
122 b.appendAs(e, "");
123 keys->insert(b.obj());
124 invariant(keys->begin()->objsize() == size);
125 }
126 return;
127 }
128
129 BSONElement arrElt;
130 unsigned arrIdx = ~0;
131 unsigned numNotFound = 0;
132
133 for (unsigned i = 0; i < fieldNames.size(); ++i) {
134 if (*fieldNames[i] == '\0')
135 continue;
136
137 BSONElement e = dps::extractElementAtPathOrArrayAlongPath(obj, fieldNames[i]);
138
139 if (e.eoo()) {
140 e = nullElt; // no matching field
141 numNotFound++;
142 }
143
144 if (e.type() != Array)
145 fieldNames[i] = ""; // no matching field or non-array match
146
147 if (*fieldNames[i] == '\0')
148 // no need for further object expansion (though array expansion still possible)
149 fixed[i] = e;
150
151 if (e.type() == Array && arrElt.eoo()) {
152 // we only expand arrays on a single path -- track the path here
153 arrIdx = i;
154 arrElt = e;
155 }
156
157 // enforce single array path here
158 if (e.type() == Array && e.rawdata() != arrElt.rawdata()) {
159 assertParallelArrays(e.fieldName(), arrElt.fieldName());
160 }
161 }
162
163 bool allFound = true; // have we found elements for all field names in the key spec?
164 for (std::vector<const char*>::const_iterator i = fieldNames.begin(); i != fieldNames.end();
165 ++i) {
166 if (**i != '\0') {
167 allFound = false;
168 break;
169 }
170 }
171
172 if (_isSparse && numNotFound == _fieldNames.size()) {
173 // we didn't find any fields
174 // so we're not going to index this document
175 return;
176 }
177
178 bool insertArrayNull = false;
179
180 if (allFound) {
181 if (arrElt.eoo()) {
182 // no terminal array element to expand
183 BSONObjBuilder b(_sizeTracker);
184 for (std::vector<BSONElement>::iterator i = fixed.begin(); i != fixed.end(); ++i)
185 b.appendAs(*i, "");
186 keys->insert(b.obj());
187 } else {
188 // terminal array element to expand, so generate all keys
189 BSONObjIterator i(arrElt.embeddedObject());
190 if (i.more()) {
191 while (i.more()) {
192 BSONObjBuilder b(_sizeTracker);
193 for (unsigned j = 0; j < fixed.size(); ++j) {
194 if (j == arrIdx)
195 b.appendAs(i.next(), "");
196 else
197 b.appendAs(fixed[j], "");
198 }
199 keys->insert(b.obj());
200 }
201 } else if (fixed.size() > 1) {
202 insertArrayNull = true;
203 }
204 }
205 } else {
206 // nonterminal array element to expand, so recurse
207 verify(!arrElt.eoo());
208 BSONObjIterator i(arrElt.embeddedObject());
209 if (i.more()) {
210 while (i.more()) {
211 BSONElement e = i.next();
212 if (e.type() == Object) {
213 getKeysImpl(fieldNames, fixed, e.embeddedObject(), keys, multikeyPaths);
214 }
215 }
216 } else {
217 insertArrayNull = true;
218 }
219 }
220
221 if (insertArrayNull) {
222 // x : [] - need to insert undefined
223 BSONObjBuilder b(_sizeTracker);
224 for (unsigned j = 0; j < fixed.size(); ++j) {
225 if (j == arrIdx) {
226 b.appendUndefined("");
227 } else {
228 BSONElement e = fixed[j];
229 if (e.eoo())
230 b.appendNull("");
231 else
232 b.appendAs(e, "");
233 }
234 }
235 keys->insert(b.obj());
236 }
237 }
238
BtreeKeyGeneratorV1(std::vector<const char * > fieldNames,std::vector<BSONElement> fixed,bool isSparse,const CollatorInterface * collator)239 BtreeKeyGeneratorV1::BtreeKeyGeneratorV1(std::vector<const char*> fieldNames,
240 std::vector<BSONElement> fixed,
241 bool isSparse,
242 const CollatorInterface* collator)
243 : BtreeKeyGenerator(fieldNames, fixed, isSparse),
244 _emptyPositionalInfo(fieldNames.size()),
245 _collator(collator) {
246 for (const char* fieldName : fieldNames) {
247 size_t pathLength = FieldRef{fieldName}.numParts();
248 invariant(pathLength > 0);
249 _pathLengths.push_back(pathLength);
250 }
251 }
252
extractNextElement(const BSONObj & obj,const PositionalPathInfo & positionalInfo,const char ** field,bool * arrayNestedArray) const253 BSONElement BtreeKeyGeneratorV1::extractNextElement(const BSONObj& obj,
254 const PositionalPathInfo& positionalInfo,
255 const char** field,
256 bool* arrayNestedArray) const {
257 std::string firstField = mongoutils::str::before(*field, '.');
258 bool haveObjField = !obj.getField(firstField).eoo();
259 BSONElement arrField = positionalInfo.positionallyIndexedElt;
260
261 // An index component field name cannot exist in both a document
262 // array and one of that array's children.
263 uassert(16746,
264 mongoutils::str::stream()
265 << "Ambiguous field name found in array (do not use numeric field names in "
266 "embedded elements in an array), field: '"
267 << arrField.fieldName()
268 << "' for array: "
269 << positionalInfo.arrayObj,
270 !haveObjField || !positionalInfo.hasPositionallyIndexedElt());
271
272 *arrayNestedArray = false;
273 if (haveObjField) {
274 return dps::extractElementAtPathOrArrayAlongPath(obj, *field);
275 } else if (positionalInfo.hasPositionallyIndexedElt()) {
276 if (arrField.type() == Array) {
277 *arrayNestedArray = true;
278 }
279 *field = positionalInfo.remainingPath;
280 return positionalInfo.dottedElt;
281 }
282 return BSONElement();
283 }
284
_getKeysArrEltFixed(std::vector<const char * > * fieldNames,std::vector<BSONElement> * fixed,const BSONElement & arrEntry,BSONObjSet * keys,unsigned numNotFound,const BSONElement & arrObjElt,const std::set<size_t> & arrIdxs,bool mayExpandArrayUnembedded,const std::vector<PositionalPathInfo> & positionalInfo,MultikeyPaths * multikeyPaths) const285 void BtreeKeyGeneratorV1::_getKeysArrEltFixed(std::vector<const char*>* fieldNames,
286 std::vector<BSONElement>* fixed,
287 const BSONElement& arrEntry,
288 BSONObjSet* keys,
289 unsigned numNotFound,
290 const BSONElement& arrObjElt,
291 const std::set<size_t>& arrIdxs,
292 bool mayExpandArrayUnembedded,
293 const std::vector<PositionalPathInfo>& positionalInfo,
294 MultikeyPaths* multikeyPaths) const {
295 // Set up any terminal array values.
296 for (const auto idx : arrIdxs) {
297 if (*(*fieldNames)[idx] == '\0') {
298 (*fixed)[idx] = mayExpandArrayUnembedded ? arrEntry : arrObjElt;
299 }
300 }
301
302 // Recurse.
303 getKeysImplWithArray(*fieldNames,
304 *fixed,
305 arrEntry.type() == Object ? arrEntry.embeddedObject() : BSONObj(),
306 keys,
307 numNotFound,
308 positionalInfo,
309 multikeyPaths);
310 }
311
getKeysImpl(std::vector<const char * > fieldNames,std::vector<BSONElement> fixed,const BSONObj & obj,BSONObjSet * keys,MultikeyPaths * multikeyPaths) const312 void BtreeKeyGeneratorV1::getKeysImpl(std::vector<const char*> fieldNames,
313 std::vector<BSONElement> fixed,
314 const BSONObj& obj,
315 BSONObjSet* keys,
316 MultikeyPaths* multikeyPaths) const {
317 if (_isIdIndex) {
318 // we special case for speed
319 BSONElement e = obj["_id"];
320 if (e.eoo()) {
321 keys->insert(_nullKey);
322 } else if (_collator) {
323 BSONObjBuilder b;
324 CollationIndexKey::collationAwareIndexKeyAppend(e, _collator, &b);
325
326 // Insert a copy so its buffer size fits the object size.
327 keys->insert(b.obj().copy());
328 } else {
329 int size = e.size() + 5 /* bson over head*/ - 3 /* remove _id string */;
330 BSONObjBuilder b(size);
331 b.appendAs(e, "");
332 keys->insert(b.obj());
333 invariant(keys->begin()->objsize() == size);
334 }
335
336 // The {_id: 1} index can never be multikey because the _id field isn't allowed to be an
337 // array value. We therefore always set 'multikeyPaths' as [ [ ] ].
338 if (multikeyPaths) {
339 multikeyPaths->resize(1);
340 }
341 return;
342 }
343
344 if (multikeyPaths) {
345 invariant(multikeyPaths->empty());
346 multikeyPaths->resize(fieldNames.size());
347 }
348 getKeysImplWithArray(
349 std::move(fieldNames), std::move(fixed), obj, keys, 0, _emptyPositionalInfo, multikeyPaths);
350 }
351
getKeysImplWithArray(std::vector<const char * > fieldNames,std::vector<BSONElement> fixed,const BSONObj & obj,BSONObjSet * keys,unsigned numNotFound,const std::vector<PositionalPathInfo> & positionalInfo,MultikeyPaths * multikeyPaths) const352 void BtreeKeyGeneratorV1::getKeysImplWithArray(
353 std::vector<const char*> fieldNames,
354 std::vector<BSONElement> fixed,
355 const BSONObj& obj,
356 BSONObjSet* keys,
357 unsigned numNotFound,
358 const std::vector<PositionalPathInfo>& positionalInfo,
359 MultikeyPaths* multikeyPaths) const {
360 BSONElement arrElt;
361
362 // A set containing the position of any indexed fields in the key pattern that traverse through
363 // the 'arrElt' array value.
364 std::set<size_t> arrIdxs;
365
366 // A vector with size equal to the number of elements in the index key pattern. Each element in
367 // the vector, if initialized, refers to the component within the indexed field that traverses
368 // through the 'arrElt' array value. We say that this component within the indexed field
369 // corresponds to a path that causes the index to be multikey if the 'arrElt' array value
370 // contains multiple elements.
371 //
372 // For example, consider the index {'a.b': 1, 'a.c'} and the document
373 // {a: [{b: 1, c: 'x'}, {b: 2, c: 'y'}]}. The path "a" causes the index to be multikey, so we'd
374 // have a std::vector<boost::optional<size_t>>{{0U}, {0U}}.
375 //
376 // Furthermore, due to how positional key patterns are specified, it's possible for an indexed
377 // field to cause the index to be multikey at a different component than another indexed field
378 // that also traverses through the 'arrElt' array value. It's then also possible for an indexed
379 // field not to cause the index to be multikey, even if it traverses through the 'arrElt' array
380 // value, because only a particular element would be indexed.
381 //
382 // For example, consider the index {'a.b': 1, 'a.b.0'} and the document {a: {b: [1, 2]}}. The
383 // path "a.b" causes the index to be multikey, but the key pattern "a.b.0" only indexes the
384 // first element of the array, so we'd have a
385 // std::vector<boost::optional<size_t>>{{1U}, boost::none}.
386 std::vector<boost::optional<size_t>> arrComponents(fieldNames.size());
387
388 bool mayExpandArrayUnembedded = true;
389 for (size_t i = 0; i < fieldNames.size(); ++i) {
390 if (*fieldNames[i] == '\0') {
391 continue;
392 }
393
394 bool arrayNestedArray;
395 // Extract element matching fieldName[ i ] from object xor array.
396 BSONElement e =
397 extractNextElement(obj, positionalInfo[i], &fieldNames[i], &arrayNestedArray);
398
399 if (e.eoo()) {
400 // if field not present, set to null
401 fixed[i] = nullElt;
402 // done expanding this field name
403 fieldNames[i] = "";
404 numNotFound++;
405 } else if (e.type() == Array) {
406 arrIdxs.insert(i);
407 if (arrElt.eoo()) {
408 // we only expand arrays on a single path -- track the path here
409 arrElt = e;
410 } else if (e.rawdata() != arrElt.rawdata()) {
411 // enforce single array path here
412 assertParallelArrays(e.fieldName(), arrElt.fieldName());
413 }
414 if (arrayNestedArray) {
415 mayExpandArrayUnembedded = false;
416 }
417 } else {
418 // not an array - no need for further expansion
419 fixed[i] = e;
420 }
421 }
422
423 if (arrElt.eoo()) {
424 // No array, so generate a single key.
425 if (_isSparse && numNotFound == fieldNames.size()) {
426 return;
427 }
428 BSONObjBuilder b(_sizeTracker);
429 for (std::vector<BSONElement>::iterator i = fixed.begin(); i != fixed.end(); ++i) {
430 CollationIndexKey::collationAwareIndexKeyAppend(*i, _collator, &b);
431 }
432 keys->insert(b.obj());
433 } else if (arrElt.embeddedObject().firstElement().eoo()) {
434 // We've encountered an empty array.
435 if (multikeyPaths && mayExpandArrayUnembedded) {
436 // Any indexed path which traverses through the empty array must be recorded as an array
437 // component.
438 for (auto i : arrIdxs) {
439 // We need to determine which component of the indexed field causes the index to be
440 // multikey as a result of the empty array. Indexed empty arrays are considered
441 // multikey and may occur mid-path. For instance, the indexed path "a.b.c" has
442 // multikey components {0, 1} given the document {a: [{b: []}, {b: 1}]}.
443 size_t fullPathLength = _pathLengths[i];
444 size_t suffixPathLength = FieldRef{fieldNames[i]}.numParts();
445 invariant(suffixPathLength < fullPathLength);
446 arrComponents[i] = fullPathLength - suffixPathLength - 1;
447 }
448 }
449
450 // For an empty array, set matching fields to undefined.
451 _getKeysArrEltFixed(&fieldNames,
452 &fixed,
453 undefinedElt,
454 keys,
455 numNotFound,
456 arrElt,
457 arrIdxs,
458 true,
459 _emptyPositionalInfo,
460 multikeyPaths);
461 } else {
462 BSONObj arrObj = arrElt.embeddedObject();
463
464 // For positional key patterns, e.g. {'a.1.b': 1}, we lookup the indexed array element
465 // and then traverse the remainder of the field path up front. This prevents us from
466 // having to look up the indexed element again on each recursive call (i.e. once per
467 // array element).
468 std::vector<PositionalPathInfo> subPositionalInfo(fixed.size());
469 for (size_t i = 0; i < fieldNames.size(); ++i) {
470 const bool fieldIsArray = arrIdxs.find(i) != arrIdxs.end();
471
472 if (*fieldNames[i] == '\0') {
473 // We've reached the end of the path.
474 if (multikeyPaths && fieldIsArray && mayExpandArrayUnembedded) {
475 // The 'arrElt' array value isn't expanded into multiple elements when the last
476 // component of the indexed field is positional and 'arrElt' contains nested
477 // array values. In all other cases, the 'arrElt' array value may be expanded
478 // into multiple element and can therefore cause the index to be multikey.
479 arrComponents[i] = _pathLengths[i] - 1;
480 }
481 continue;
482 }
483
484 // The earlier call to dps::extractElementAtPathOrArrayAlongPath(..., fieldNames[i])
485 // modified fieldNames[i] to refer to the suffix of the path immediately following the
486 // 'arrElt' array value. If we haven't reached the end of this indexed field yet, then
487 // we must have traversed through 'arrElt'.
488 invariant(fieldIsArray);
489
490 StringData part = fieldNames[i];
491 part = part.substr(0, part.find('.'));
492 subPositionalInfo[i].positionallyIndexedElt = arrObj[part];
493 if (subPositionalInfo[i].positionallyIndexedElt.eoo()) {
494 // We aren't indexing a particular element of the 'arrElt' array value, so it may be
495 // expanded into multiple elements. It can therefore cause the index to be multikey.
496 if (multikeyPaths) {
497 // We need to determine which component of the indexed field causes the index to
498 // be multikey as a result of the 'arrElt' array value. Since
499 //
500 // NumComponents("<pathPrefix>") + NumComponents("<pathSuffix>")
501 // = NumComponents("<pathPrefix>.<pathSuffix>"),
502 //
503 // we can compute the number of components in a prefix of the indexed field by
504 // subtracting the number of components in the suffix 'fieldNames[i]' from the
505 // number of components in the indexed field '_fieldNames[i]'.
506 //
507 // For example, consider the indexed field "a.b.c" and the suffix "c". The path
508 // "a.b.c" has 3 components and the suffix "c" has 1 component. Subtracting the
509 // latter from the former yields the number of components in the prefix "a.b",
510 // i.e. 2.
511 size_t fullPathLength = _pathLengths[i];
512 size_t suffixPathLength = FieldRef{fieldNames[i]}.numParts();
513 invariant(suffixPathLength < fullPathLength);
514 arrComponents[i] = fullPathLength - suffixPathLength - 1;
515 }
516 continue;
517 }
518
519 // We're indexing an array element by its position. Traverse the remainder of the
520 // field path now.
521 //
522 // Indexing an array element by its position selects a particular element of the
523 // 'arrElt' array value when generating keys. It therefore cannot cause the index to be
524 // multikey.
525 subPositionalInfo[i].arrayObj = arrObj;
526 subPositionalInfo[i].remainingPath = fieldNames[i];
527 subPositionalInfo[i].dottedElt = dps::extractElementAtPathOrArrayAlongPath(
528 arrObj, subPositionalInfo[i].remainingPath);
529 }
530
531 // Generate a key for each element of the indexed array.
532 for (const auto arrObjElem : arrObj) {
533 _getKeysArrEltFixed(&fieldNames,
534 &fixed,
535 arrObjElem,
536 keys,
537 numNotFound,
538 arrElt,
539 arrIdxs,
540 mayExpandArrayUnembedded,
541 subPositionalInfo,
542 multikeyPaths);
543 }
544 }
545
546 // Record multikey path components.
547 if (multikeyPaths) {
548 for (size_t i = 0; i < arrComponents.size(); ++i) {
549 if (auto arrComponent = arrComponents[i]) {
550 (*multikeyPaths)[i].insert(*arrComponent);
551 }
552 }
553 }
554 }
555
556 } // namespace mongo
557