1 
2 /**
3  *    Copyright (C) 2018-present MongoDB, Inc.
4  *
5  *    This program is free software: you can redistribute it and/or modify
6  *    it under the terms of the Server Side Public License, version 1,
7  *    as published by MongoDB, Inc.
8  *
9  *    This program is distributed in the hope that it will be useful,
10  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *    Server Side Public License for more details.
13  *
14  *    You should have received a copy of the Server Side Public License
15  *    along with this program. If not, see
16  *    <http://www.mongodb.com/licensing/server-side-public-license>.
17  *
18  *    As a special exception, the copyright holders give permission to link the
19  *    code of portions of this program with the OpenSSL library under certain
20  *    conditions as described in each individual source file and distribute
21  *    linked combinations including the program with the OpenSSL library. You
22  *    must comply with the Server Side Public License in all respects for
23  *    all of the code used other than as permitted herein. If you modify file(s)
24  *    with this exception, you may extend this exception to your version of the
25  *    file(s), but you are not obligated to do so. If you do not wish to do so,
26  *    delete this exception statement from your version. If you delete this
27  *    exception statement from all source files in the program, then also delete
28  *    it in the license file.
29  */
30 
31 #define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kSharding
32 
33 #include "mongo/platform/basic.h"
34 
35 #include "mongo/db/s/metadata_manager.h"
36 
37 #include "mongo/base/string_data.h"
38 #include "mongo/bson/simple_bsonobj_comparator.h"
39 #include "mongo/bson/util/builder.h"
40 #include "mongo/db/bson/dotted_path_support.h"
41 #include "mongo/db/cursor_manager.h"
42 #include "mongo/db/db_raii.h"
43 #include "mongo/db/query/internal_plans.h"
44 #include "mongo/db/range_arithmetic.h"
45 #include "mongo/db/s/collection_sharding_state.h"
46 #include "mongo/db/s/sharding_state.h"
47 #include "mongo/stdx/memory.h"
48 #include "mongo/util/assert_util.h"
49 #include "mongo/util/fail_point_service.h"
50 #include "mongo/util/log.h"
51 #include "mongo/util/time_support.h"
52 
53 // MetadataManager maintains pointers to CollectionMetadata objects in a member list named
54 // _metadata.  Each CollectionMetadata contains an immutable _chunksMap of chunks assigned to this
55 // shard, along with details related to its own lifecycle in a member _tracker.
56 //
57 // The current chunk mapping, used by queries starting up, is at _metadata.back().  Each query,
58 // when it starts up, requests and holds a ScopedCollectionMetadata object, and destroys it on
59 // termination. Each ScopedCollectionMetadata keeps a shared_ptr to its CollectionMetadata chunk
60 // mapping, and to the MetadataManager itself.  CollectionMetadata mappings also keep a record of
61 // chunk ranges that may be deleted when it is determined that the range can no longer be in use.
62 //
63 // ScopedCollectionMetadata's destructor decrements the CollectionMetadata's usageCounter.
64 // Whenever a usageCounter drops to zero, we check whether any now-unused CollectionMetadata
65 // elements can be popped off the front of _metadata.  We need to keep the unused elements in the
66 // middle (as seen below) because they may schedule deletions of chunks depended on by older
67 // mappings.
68 //
69 // New chunk mappings are pushed onto the back of _metadata. Subsequently started queries use the
70 // new mapping while still-running queries continue using the older "snapshot" mappings.  We treat
71 // _metadata.back()'s usage count differently from the snapshots because it can't reliably be
72 // compared to zero; a new query may increment it at any time.
73 //
74 // (Note that the collection may be dropped or become unsharded, and even get made and sharded
75 // again, between construction and destruction of a ScopedCollectionMetadata).
76 //
77 // MetadataManager also contains a CollectionRangeDeleter _rangesToClean that queues orphan ranges
78 // being deleted in a background thread, and a mapping _receivingChunks of the ranges being migrated
79 // in, to avoid deleting them.  Each range deletion is paired with a notification object triggered
80 // when the deletion is completed or abandoned.
81 //
82 //                                        ____________________________
83 //  (s): std::shared_ptr<>       Clients:| ScopedCollectionMetadata   |
84 //   _________________________        +----(s) manager   metadata (s)------------------+
85 //  | CollectionShardingState |       |  |____________________________|  |             |
86 //  |  _metadataManager (s)   |       +-------(s) manager  metadata (s)--------------+ |
87 //  |____________________|____|       |     |____________________________|   |       | |
88 //   ____________________v________    +------------(s) manager  metadata (s)-----+   | |
89 //  | MetadataManager             |   |         |____________________________|   |   | |
90 //  |                             |<--+                                          |   | |
91 //  |                             |        ___________________________  (1 use)  |   | |
92 //  | getActiveMetadata():    /---------->| CollectionMetadata        |<---------+   | |
93 //  |     back(): [(s),------/    |       |  _________________________|_             | |
94 //  |              (s),-------------------->| CollectionMetadata        | (0 uses)   | |
95 //  |  _metadata:  (s)]------\    |       | |  _________________________|_           | |
96 //  |                         \-------------->| CollectionMetadata        |          | |
97 //  |  _receivingChunks           |       | | |                           | (2 uses) | |
98 //  |  _rangesToClean:            |       | | |  _tracker:                |<---------+ |
99 //  |  _________________________  |       | | |  _______________________  |<-----------+
100 //  | | CollectionRangeDeleter  | |       | | | | Tracker               | |
101 //  | |                         | |       | | | |                       | |
102 //  | |  _orphans [range,notif, | |       | | | | usageCounter          | |
103 //  | |            range,notif, | |       | | | | orphans [range,notif, | |
104 //  | |                 ...   ] | |       | | | |          range,notif, | |
105 //  | |                         | |       | | | |              ...    ] | |
106 //  | |_________________________| |       |_| | |_______________________| |
107 //  |_____________________________|         | |  _chunksMap               |
108 //                                          |_|  _chunkVersion            |
109 //                                            |  ...                      |
110 //                                            |___________________________|
111 //
112 //  Note that _metadata as shown here has its front() at the bottom, back() at the top. As usual,
113 //  new entries are pushed onto the back, popped off the front.
114 
115 namespace mongo {
116 namespace {
117 
118 using TaskExecutor = executor::TaskExecutor;
119 using CallbackArgs = TaskExecutor::CallbackArgs;
120 
121 MONGO_FP_DECLARE(suspendRangeDeletion);
122 
123 /**
124  * Deletes ranges, in background, until done, normally using a task executor attached to the
125  * ShardingState.
126  *
127  * Each time it completes cleaning up a range, it wakes up clients waiting on completion of that
128  * range, which may then verify that their range has no more deletions scheduled, and proceed.
129  */
scheduleCleanup(executor::TaskExecutor * executor,NamespaceString nss,OID epoch,Date_t when)130 void scheduleCleanup(executor::TaskExecutor* executor,
131                      NamespaceString nss,
132                      OID epoch,
133                      Date_t when) {
134     LOG(1) << "Scheduling cleanup on " << nss.ns() << " at " << when;
135     auto swCallbackHandle = executor->scheduleWorkAt(
136         when, [ executor, nss = std::move(nss), epoch = std::move(epoch) ](auto&) {
137             Client::initThreadIfNotAlready("Collection Range Deleter");
138             auto uniqueOpCtx = Client::getCurrent()->makeOperationContext();
139             auto opCtx = uniqueOpCtx.get();
140 
141             MONGO_FAIL_POINT_PAUSE_WHILE_SET(suspendRangeDeletion);
142 
143             auto next = CollectionRangeDeleter::cleanUpNextRange(opCtx, nss, epoch);
144             if (next) {
145                 scheduleCleanup(executor, std::move(nss), std::move(epoch), *next);
146             }
147         });
148 
149     if (!swCallbackHandle.isOK()) {
150         log() << "Failed to schedule the orphan data cleanup task"
151               << causedBy(redact(swCallbackHandle.getStatus()));
152     }
153 }
154 
logRangeDeletionWaitingOnOpenCursors(const OperationContext * opCtx,const Collection * collection,const NamespaceString & nss,const ChunkRange & range)155 void logRangeDeletionWaitingOnOpenCursors(const OperationContext* opCtx,
156                                           const Collection* collection,
157                                           const NamespaceString& nss,
158                                           const ChunkRange& range) {
159     invariant(opCtx->lockState()->isCollectionLockedForMode(nss.toString(), MODE_IS));
160     std::vector<CursorId> cursorIds;
161     // If the collection exists, gather a list of all cursors related to the collection.
162     if (collection) {
163         auto cursorIdsFromCollectionCursorManager =
164             collection->getCursorManager()->getCursorIdsForNamespace(nss);
165         cursorIds.insert(cursorIds.end(),
166                          cursorIdsFromCollectionCursorManager.begin(),
167                          cursorIdsFromCollectionCursorManager.end());
168 
169         // Aggregation cursors are registered on the global cursor manager. A cursor on
170         // the global cursor manager can involve any number of collections, but is registered with
171         // the namespace of the aggregate command. This works well for this purpose since any other
172         // namespaces would come through a $lookup or $graphLookup which cannot read from sharded
173         // collections and so could not be contributing to the delay of a range deletion. The
174         // namespace of the aggregate command can be sharded, so we do want to include those
175         // aggregation cursors in this message.
176         auto cursorIdsFromGlobalCursorManager =
177             CursorManager::getGlobalCursorManager()->getCursorIdsForNamespace(nss);
178         cursorIds.insert(cursorIds.end(),
179                          cursorIdsFromGlobalCursorManager.begin(),
180                          cursorIdsFromGlobalCursorManager.end());
181     }
182 
183     // Join cursorIds as a comma-separated list.
184     std::string cursorIdList =
185         cursorIds.empty() ? "" : std::accumulate(std::next(cursorIds.begin()),
186                                                  cursorIds.end(),
187                                                  std::to_string(cursorIds[0]),
188                                                  [](std::string a, CursorId b) {
189                                                      return std::move(a) + ',' + std::to_string(b);
190                                                  });
191 
192     log() << "Deletion of " << nss.ns() << " range " << redact(range.toString())
193           << " will be scheduled after all possibly dependent queries finish. "
194              "All open cursors for namespace: ["
195           << cursorIdList << "]";
196 }
197 
198 }  // namespace
199 
MetadataManager(ServiceContext * serviceContext,NamespaceString nss,TaskExecutor * executor)200 MetadataManager::MetadataManager(ServiceContext* serviceContext,
201                                  NamespaceString nss,
202                                  TaskExecutor* executor)
203     : _serviceContext(serviceContext),
204       _nss(std::move(nss)),
205       _executor(executor),
206       _receivingChunks(SimpleBSONObjComparator::kInstance.makeBSONObjIndexedMap<BSONObj>()) {}
207 
~MetadataManager()208 MetadataManager::~MetadataManager() {
209     stdx::lock_guard<stdx::mutex> lg(_managerLock);
210     _clearAllCleanups(lg);
211     _metadata.clear();
212 }
213 
_clearAllCleanups(WithLock lock)214 void MetadataManager::_clearAllCleanups(WithLock lock) {
215     _clearAllCleanups(
216         lock,
217         {ErrorCodes::InterruptedDueToReplStateChange,
218          str::stream() << "Range deletions in " << _nss.ns()
219                        << " abandoned because collection was dropped or became unsharded"});
220 }
221 
_clearAllCleanups(WithLock,Status status)222 void MetadataManager::_clearAllCleanups(WithLock, Status status) {
223     for (auto& tracker : _metadata) {
224         std::ignore = _rangesToClean.add(std::move(tracker->orphans));
225     }
226     _rangesToClean.clear(status);
227 }
228 
getActiveMetadata(std::shared_ptr<MetadataManager> self)229 ScopedCollectionMetadata MetadataManager::getActiveMetadata(std::shared_ptr<MetadataManager> self) {
230     stdx::lock_guard<stdx::mutex> lg(_managerLock);
231     if (!_metadata.empty()) {
232         return ScopedCollectionMetadata(lg, std::move(self), _metadata.back());
233     }
234 
235     return ScopedCollectionMetadata();
236 }
237 
numberOfMetadataSnapshots() const238 size_t MetadataManager::numberOfMetadataSnapshots() const {
239     stdx::lock_guard<stdx::mutex> lg(_managerLock);
240     if (_metadata.empty())
241         return 0;
242 
243     return _metadata.size() - 1;
244 }
245 
numberOfEmptyMetadataSnapshots() const246 int MetadataManager::numberOfEmptyMetadataSnapshots() const {
247     stdx::lock_guard<stdx::mutex> lg(_managerLock);
248 
249     int emptyMetadataSnapshots = 0;
250     for (const auto& collMetadataTracker : _metadata) {
251         if (!collMetadataTracker->metadata)
252             emptyMetadataSnapshots++;
253     }
254 
255     return emptyMetadataSnapshots;
256 }
257 
refreshActiveMetadata(std::unique_ptr<CollectionMetadata> remoteMetadata)258 void MetadataManager::refreshActiveMetadata(std::unique_ptr<CollectionMetadata> remoteMetadata) {
259     stdx::lock_guard<stdx::mutex> lg(_managerLock);
260 
261     // Collection was never sharded in the first place. This check is necessary in order to avoid
262     // extraneous logging in the not-a-shard case, because all call sites always try to get the
263     // collection sharding information regardless of whether the node is sharded or not.
264     if (!remoteMetadata && _metadata.empty()) {
265         invariant(_receivingChunks.empty());
266         invariant(_rangesToClean.isEmpty());
267         return;
268     }
269 
270     // Collection is becoming unsharded
271     if (!remoteMetadata) {
272         log() << "Marking collection " << _nss.ns() << " with "
273               << redact(_metadata.back()->metadata->toStringBasic()) << " as unsharded";
274 
275         _receivingChunks.clear();
276         _clearAllCleanups(lg);
277         _metadata.clear();
278         return;
279     }
280 
281     // Collection is becoming sharded
282     if (_metadata.empty()) {
283         log() << "Marking collection " << _nss.ns() << " as sharded with "
284               << remoteMetadata->toStringBasic();
285 
286         invariant(_receivingChunks.empty());
287         _setActiveMetadata(lg, std::move(*remoteMetadata));
288         invariant(_rangesToClean.isEmpty());
289         return;
290     }
291 
292     auto* const activeMetadata = &_metadata.back()->metadata.get();
293 
294     // If the metadata being installed has a different epoch from ours, this means the collection
295     // was dropped and recreated, so we must entirely reset the metadata state
296     if (activeMetadata->getCollVersion().epoch() != remoteMetadata->getCollVersion().epoch()) {
297         log() << "Overwriting metadata for collection " << _nss.ns() << " from "
298               << activeMetadata->toStringBasic() << " to " << remoteMetadata->toStringBasic()
299               << " due to epoch change";
300 
301         _receivingChunks.clear();
302         _setActiveMetadata(lg, std::move(*remoteMetadata));
303         _clearAllCleanups(lg);
304         return;
305     }
306 
307     // We already have newer version
308     if (activeMetadata->getCollVersion() >= remoteMetadata->getCollVersion()) {
309         LOG(1) << "Ignoring update of active metadata " << activeMetadata->toStringBasic()
310                << " with an older " << remoteMetadata->toStringBasic();
311         return;
312     }
313 
314     log() << "Updating collection metadata for " << _nss.ns() << " from "
315           << activeMetadata->toStringBasic() << " to " << remoteMetadata->toStringBasic();
316 
317     // Resolve any receiving chunks, which might have completed by now
318     for (auto it = _receivingChunks.begin(); it != _receivingChunks.end();) {
319         const ChunkRange receivingRange(it->first, it->second);
320 
321         if (!remoteMetadata->rangeOverlapsChunk(receivingRange)) {
322             ++it;
323             continue;
324         }
325 
326         // The remote metadata contains a chunk we were earlier in the process of receiving, so we
327         // deem it successfully received
328         LOG(2) << "Verified chunk " << redact(receivingRange.toString()) << " for collection "
329                << _nss.ns() << " has been migrated to this shard earlier";
330 
331         _receivingChunks.erase(it);
332         it = _receivingChunks.begin();
333     }
334 
335     _setActiveMetadata(lg, std::move(*remoteMetadata));
336 }
337 
_setActiveMetadata(WithLock wl,CollectionMetadata newMetadata)338 void MetadataManager::_setActiveMetadata(WithLock wl, CollectionMetadata newMetadata) {
339     _metadata.emplace_back(std::make_shared<CollectionMetadataTracker>(std::move(newMetadata)));
340     _retireExpiredMetadata(wl);
341 }
342 
_retireExpiredMetadata(WithLock lock)343 void MetadataManager::_retireExpiredMetadata(WithLock lock) {
344     // Remove entries and schedule orphans for deletion only from the front of _metadata. We cannot
345     // remove an entry from the middle of _metadata because a previous entry (whose usageCount is
346     // not 0) could have a query that is actually still accessing those documents.
347     while (_metadata.size() > 1 && !_metadata.front()->usageCounter) {
348         if (!_metadata.front()->orphans.empty()) {
349             LOG(0) << "Queries possibly dependent on " << _nss.ns()
350                    << " range(s) finished; scheduling ranges for deletion";
351 
352             _pushListToClean(lock, std::move(_metadata.front()->orphans));
353         }
354 
355         _metadata.pop_front();
356     }
357 
358     // To avoid memory build up of ChunkManager objects, we can clear the CollectionMetadata object
359     // in an entry when its usageCount is 0 as long as it is not the last item in _metadata (which
360     // is the active metadata). If _metadata is empty, decrementing iter will be out of bounds, so
361     // we must check that the size is > 1 as well.
362     if (_metadata.size() > 1) {
363         auto iter = _metadata.begin();
364         while (iter != (--_metadata.end())) {
365             if ((*iter)->usageCounter == 0) {
366                 (*iter)->metadata = boost::none;
367             }
368             ++iter;
369         }
370     }
371 }
372 
toBSONPending(BSONArrayBuilder & bb) const373 void MetadataManager::toBSONPending(BSONArrayBuilder& bb) const {
374     stdx::lock_guard<stdx::mutex> lg(_managerLock);
375 
376     for (auto it = _receivingChunks.begin(); it != _receivingChunks.end(); ++it) {
377         BSONArrayBuilder pendingBB(bb.subarrayStart());
378         pendingBB.append(it->first);
379         pendingBB.append(it->second);
380         pendingBB.done();
381     }
382 }
383 
append(BSONObjBuilder * builder) const384 void MetadataManager::append(BSONObjBuilder* builder) const {
385     stdx::lock_guard<stdx::mutex> lg(_managerLock);
386 
387     _rangesToClean.append(builder);
388 
389     BSONArrayBuilder pcArr(builder->subarrayStart("pendingChunks"));
390     for (const auto& entry : _receivingChunks) {
391         BSONObjBuilder obj;
392         ChunkRange r = ChunkRange(entry.first, entry.second);
393         r.append(&obj);
394         pcArr.append(obj.done());
395     }
396     pcArr.done();
397 
398     if (_metadata.empty()) {
399         return;
400     }
401 
402     BSONArrayBuilder amrArr(builder->subarrayStart("activeMetadataRanges"));
403     for (const auto& entry : _metadata.back()->metadata->getChunks()) {
404         BSONObjBuilder obj;
405         ChunkRange r = ChunkRange(entry.first, entry.second);
406         r.append(&obj);
407         amrArr.append(obj.done());
408     }
409     amrArr.done();
410 }
411 
_pushRangeToClean(WithLock lock,ChunkRange const & range,Date_t when)412 auto MetadataManager::_pushRangeToClean(WithLock lock, ChunkRange const& range, Date_t when)
413     -> CleanupNotification {
414     std::list<Deletion> ranges;
415     ranges.emplace_back(ChunkRange(range.getMin().getOwned(), range.getMax().getOwned()), when);
416     auto& notifn = ranges.back().notification;
417     _pushListToClean(lock, std::move(ranges));
418     return notifn;
419 }
420 
_pushListToClean(WithLock,std::list<Deletion> ranges)421 void MetadataManager::_pushListToClean(WithLock, std::list<Deletion> ranges) {
422     auto when = _rangesToClean.add(std::move(ranges));
423     if (when) {
424         scheduleCleanup(
425             _executor, _nss, _metadata.back()->metadata->getCollVersion().epoch(), *when);
426     }
427     invariant(ranges.empty());
428 }
429 
beginReceive(ChunkRange const & range)430 auto MetadataManager::beginReceive(ChunkRange const& range) -> CleanupNotification {
431     stdx::lock_guard<stdx::mutex> lg(_managerLock);
432     invariant(!_metadata.empty());
433 
434     if (_overlapsInUseChunk(lg, range)) {
435         return Status{ErrorCodes::RangeOverlapConflict,
436                       "Documents in target range may still be in use on the destination shard."};
437     }
438 
439     _receivingChunks.emplace(range.getMin().getOwned(), range.getMax().getOwned());
440 
441     log() << "Scheduling deletion of any documents in " << _nss.ns() << " range "
442           << redact(range.toString()) << " before migrating in a chunk covering the range";
443 
444     return _pushRangeToClean(lg, range, Date_t{});
445 }
446 
forgetReceive(ChunkRange const & range)447 void MetadataManager::forgetReceive(ChunkRange const& range) {
448     stdx::lock_guard<stdx::mutex> lg(_managerLock);
449     invariant(!_metadata.empty());
450 
451     // This is potentially a partially received chunk, which needs to be cleaned up. We know none
452     // of these documents are in use, so they can go straight to the deletion queue.
453     log() << "Abandoning in-migration of " << _nss.ns() << " range " << range
454           << "; scheduling deletion of any documents already copied";
455 
456     invariant(!_overlapsInUseChunk(lg, range));
457 
458     auto it = _receivingChunks.find(range.getMin());
459     invariant(it != _receivingChunks.end());
460     _receivingChunks.erase(it);
461 
462     _pushRangeToClean(lg, range, Date_t{}).abandon();
463 }
464 
cleanUpRange(OperationContext * opCtx,const Collection * collection,ChunkRange const & range,Date_t whenToDelete)465 auto MetadataManager::cleanUpRange(OperationContext* opCtx,
466                                    const Collection* collection,
467                                    ChunkRange const& range,
468                                    Date_t whenToDelete) -> CleanupNotification {
469     stdx::lock_guard<stdx::mutex> lg(_managerLock);
470     invariant(!_metadata.empty());
471 
472     auto* const activeMetadata = _metadata.back().get();
473     auto* const overlapMetadata = _findNewestOverlappingMetadata(lg, range);
474 
475     if (overlapMetadata == activeMetadata) {
476         return Status{ErrorCodes::RangeOverlapConflict,
477                       str::stream() << "Requested deletion range overlaps a live shard chunk"};
478     }
479 
480     if (rangeMapOverlaps(_receivingChunks, range.getMin(), range.getMax())) {
481         return Status{ErrorCodes::RangeOverlapConflict,
482                       str::stream() << "Requested deletion range overlaps a chunk being"
483                                        " migrated in"};
484     }
485 
486     if (!overlapMetadata) {
487         // No running queries can depend on it, so queue it for deletion immediately.
488         const auto whenStr = (whenToDelete == Date_t{}) ? "immediate"_sd : "deferred"_sd;
489         log() << "Scheduling " << whenStr << " deletion of " << _nss.ns() << " range "
490               << redact(range.toString());
491         return _pushRangeToClean(lg, range, whenToDelete);
492     }
493 
494     logRangeDeletionWaitingOnOpenCursors(opCtx, collection, _nss, range);
495 
496     // Put it on the oldest metadata permissible; the current one might live a long time.
497     auto& orphans = overlapMetadata->orphans;
498     orphans.emplace_back(ChunkRange(range.getMin().getOwned(), range.getMax().getOwned()),
499                          whenToDelete);
500     return orphans.back().notification;
501 }
502 
overlappingMetadata(std::shared_ptr<MetadataManager> const & self,ChunkRange const & range)503 std::vector<ScopedCollectionMetadata> MetadataManager::overlappingMetadata(
504     std::shared_ptr<MetadataManager> const& self, ChunkRange const& range) {
505     stdx::lock_guard<stdx::mutex> lg(_managerLock);
506     invariant(!_metadata.empty());
507 
508     std::vector<ScopedCollectionMetadata> result;
509     result.reserve(_metadata.size());
510 
511     // Start with the active metadata
512     auto it = _metadata.rbegin();
513     if ((*it)->metadata->rangeOverlapsChunk(range)) {
514         // We ignore the refcount of the active mapping; effectively, we assume it is in use.
515         result.push_back(ScopedCollectionMetadata(lg, self, (*it)));
516     }
517 
518     // Continue to snapshots
519     ++it;
520     for (; it != _metadata.rend(); ++it) {
521         auto& tracker = *it;
522 
523         // We want all the overlapping snapshot mappings still possibly in use by a query.
524         if (tracker->usageCounter > 0 && tracker->metadata->rangeOverlapsChunk(range)) {
525             result.push_back(ScopedCollectionMetadata(lg, self, tracker));
526         }
527     }
528 
529     return result;
530 }
531 
numberOfRangesToCleanStillInUse() const532 size_t MetadataManager::numberOfRangesToCleanStillInUse() const {
533     stdx::lock_guard<stdx::mutex> lg(_managerLock);
534     size_t count = 0;
535     for (auto& tracker : _metadata) {
536         count += tracker->orphans.size();
537     }
538     return count;
539 }
540 
numberOfRangesToClean() const541 size_t MetadataManager::numberOfRangesToClean() const {
542     stdx::lock_guard<stdx::mutex> lg(_managerLock);
543     return _rangesToClean.size();
544 }
545 
trackOrphanedDataCleanup(ChunkRange const & range) const546 auto MetadataManager::trackOrphanedDataCleanup(ChunkRange const& range) const
547     -> boost::optional<CleanupNotification> {
548     stdx::lock_guard<stdx::mutex> lg(_managerLock);
549     auto overlaps = _overlapsInUseCleanups(lg, range);
550     if (overlaps) {
551         return overlaps;
552     }
553 
554     return _rangesToClean.overlaps(range);
555 }
556 
_findNewestOverlappingMetadata(WithLock,ChunkRange const & range)557 auto MetadataManager::_findNewestOverlappingMetadata(WithLock, ChunkRange const& range)
558     -> CollectionMetadataTracker* {
559     invariant(!_metadata.empty());
560 
561     auto it = _metadata.rbegin();
562     if ((*it)->metadata && (*it)->metadata->rangeOverlapsChunk(range)) {
563         return (*it).get();
564     }
565 
566     ++it;
567     for (; it != _metadata.rend(); ++it) {
568         auto& tracker = *it;
569         if (tracker->usageCounter && tracker->metadata &&
570             tracker->metadata->rangeOverlapsChunk(range)) {
571             return tracker.get();
572         }
573     }
574 
575     return nullptr;
576 }
577 
_overlapsInUseChunk(WithLock lk,ChunkRange const & range)578 bool MetadataManager::_overlapsInUseChunk(WithLock lk, ChunkRange const& range) {
579     auto* cm = _findNewestOverlappingMetadata(lk, range);
580     return (cm != nullptr);
581 }
582 
_overlapsInUseCleanups(WithLock,ChunkRange const & range) const583 auto MetadataManager::_overlapsInUseCleanups(WithLock, ChunkRange const& range) const
584     -> boost::optional<CleanupNotification> {
585     invariant(!_metadata.empty());
586 
587     for (auto it = _metadata.rbegin(); it != _metadata.rend(); ++it) {
588         const auto& orphans = (*it)->orphans;
589         for (auto itOrphans = orphans.rbegin(); itOrphans != orphans.rend(); ++itOrphans) {
590             const auto& orphan = *itOrphans;
591             if (orphan.range.overlapWith(range)) {
592                 return orphan.notification;
593             }
594         }
595     }
596 
597     return boost::none;
598 }
599 
getNextOrphanRange(BSONObj const & from) const600 boost::optional<ChunkRange> MetadataManager::getNextOrphanRange(BSONObj const& from) const {
601     stdx::lock_guard<stdx::mutex> lg(_managerLock);
602     invariant(!_metadata.empty());
603     return _metadata.back()->metadata->getNextOrphanRange(_receivingChunks, from);
604 }
605 
606 ScopedCollectionMetadata::ScopedCollectionMetadata() = default;
607 
ScopedCollectionMetadata(WithLock,std::shared_ptr<MetadataManager> metadataManager,std::shared_ptr<MetadataManager::CollectionMetadataTracker> metadataTracker)608 ScopedCollectionMetadata::ScopedCollectionMetadata(
609     WithLock,
610     std::shared_ptr<MetadataManager> metadataManager,
611     std::shared_ptr<MetadataManager::CollectionMetadataTracker> metadataTracker)
612     : _metadataManager(std::move(metadataManager)), _metadataTracker(std::move(metadataTracker)) {
613     invariant(_metadataManager);
614     invariant(_metadataTracker);
615     ++_metadataTracker->usageCounter;
616 }
617 
ScopedCollectionMetadata(ScopedCollectionMetadata && other)618 ScopedCollectionMetadata::ScopedCollectionMetadata(ScopedCollectionMetadata&& other) {
619     *this = std::move(other);
620 }
621 
operator =(ScopedCollectionMetadata && other)622 ScopedCollectionMetadata& ScopedCollectionMetadata::operator=(ScopedCollectionMetadata&& other) {
623     if (this != &other) {
624         _clear();
625 
626         _metadataManager = std::move(other._metadataManager);
627         _metadataTracker = std::move(other._metadataTracker);
628 
629         other._metadataManager = nullptr;
630         other._metadataTracker = nullptr;
631     }
632     return *this;
633 }
634 
getMetadata() const635 CollectionMetadata* ScopedCollectionMetadata::getMetadata() const {
636     return _metadataTracker && _metadataTracker->metadata ? &_metadataTracker->metadata.get()
637                                                           : nullptr;
638 }
639 
extractDocumentKey(BSONObj const & doc) const640 BSONObj ScopedCollectionMetadata::extractDocumentKey(BSONObj const& doc) const {
641     BSONObj key;
642     if (*this) {  // is sharded
643         auto const& pattern = _metadataTracker->metadata->getChunkManager()->getShardKeyPattern();
644         key = dotted_path_support::extractElementsBasedOnTemplate(doc, pattern.toBSON());
645         if (pattern.hasId()) {
646             return key;
647         }
648         // else, try to append an _id field from the document.
649     }
650 
651     if (auto id = doc["_id"]) {
652         return key.isEmpty() ? id.wrap() : BSONObjBuilder(std::move(key)).append(id).obj();
653     }
654 
655     // For legacy documents that lack an _id, use the document itself as its key.
656     return doc;
657 }
658 
_clear()659 void ScopedCollectionMetadata::_clear() {
660     if (!_metadataManager) {
661         return;
662     }
663 
664     stdx::lock_guard<stdx::mutex> managerLock(_metadataManager->_managerLock);
665     invariant(_metadataTracker->usageCounter != 0);
666     if (--_metadataTracker->usageCounter == 0) {
667         // MetadataManager doesn't care which usageCounter went to zero. It just retires all that
668         // are older than the oldest metadata still in use by queries (some start out at zero, some
669         // go to zero but can't be expired yet).
670         //
671         // Note that new instances of ScopedCollectionMetadata may get attached to _metadata.back(),
672         // so its usage count can increase from zero, unlike other reference counts.
673         _metadataManager->_retireExpiredMetadata(managerLock);
674     }
675 
676     _metadataManager.reset();
677     _metadataTracker.reset();
678 }
679 
680 }  // namespace mongo
681