/**
 * Copyright (C) 2018-present MongoDB, Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the Server Side Public License, version 1,
 * as published by MongoDB, Inc.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * Server Side Public License for more details.
 *
 * You should have received a copy of the Server Side Public License
 * along with this program. If not, see
 * <http://www.mongodb.com/licensing/server-side-public-license>.
 *
 * As a special exception, the copyright holders give permission to link the
 * code of portions of this program with the OpenSSL library under certain
 * conditions as described in each individual source file and distribute
 * linked combinations including the program with the OpenSSL library. You
 * must comply with the Server Side Public License in all respects for
 * all of the code used other than as permitted herein. If you modify file(s)
 * with this exception, you may extend this exception to your version of the
 * file(s), but you are not obligated to do so. If you do not wish to do so,
 * delete this exception statement from your version. If you delete this
 * exception statement from all source files in the program, then also delete
 * it in the license file.
 */

#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kSharding

#include "mongo/platform/basic.h"

#include "mongo/db/s/metadata_manager.h"

#include "mongo/base/string_data.h"
#include "mongo/bson/simple_bsonobj_comparator.h"
#include "mongo/bson/util/builder.h"
#include "mongo/db/bson/dotted_path_support.h"
#include "mongo/db/cursor_manager.h"
#include "mongo/db/db_raii.h"
#include "mongo/db/query/internal_plans.h"
#include "mongo/db/range_arithmetic.h"
#include "mongo/db/s/collection_sharding_state.h"
#include "mongo/db/s/sharding_state.h"
#include "mongo/stdx/memory.h"
#include "mongo/util/assert_util.h"
#include "mongo/util/fail_point_service.h"
#include "mongo/util/log.h"
#include "mongo/util/time_support.h"

// MetadataManager maintains pointers to CollectionMetadata objects in a member list named
// _metadata. Each CollectionMetadata contains an immutable _chunksMap of chunks assigned to this
// shard, along with details related to its own lifecycle in a member _tracker.
//
// The current chunk mapping, used by queries starting up, is at _metadata.back(). Each query,
// when it starts up, requests and holds a ScopedCollectionMetadata object, and destroys it on
// termination. Each ScopedCollectionMetadata keeps a shared_ptr to its CollectionMetadata chunk
// mapping, and to the MetadataManager itself. CollectionMetadata mappings also keep a record of
// chunk ranges that may be deleted when it is determined that the range can no longer be in use.
//
// ScopedCollectionMetadata's destructor decrements the CollectionMetadata's usageCounter.
// Whenever a usageCounter drops to zero, we check whether any now-unused CollectionMetadata
// elements can be popped off the front of _metadata. We need to keep the unused elements in the
// middle (as seen below) because they may schedule deletions of chunks depended on by older
// mappings.
//
// New chunk mappings are pushed onto the back of _metadata. Subsequently started queries use the
// new mapping while still-running queries continue using the older "snapshot" mappings. We treat
// _metadata.back()'s usage count differently from the snapshots because it can't reliably be
// compared to zero; a new query may increment it at any time.
//
// (Note that the collection may be dropped or become unsharded, and even get made and sharded
// again, between construction and destruction of a ScopedCollectionMetadata).
//
// MetadataManager also contains a CollectionRangeDeleter _rangesToClean that queues orphan ranges
// being deleted in a background thread, and a mapping _receivingChunks of the ranges being migrated
// in, to avoid deleting them. Each range deletion is paired with a notification object triggered
// when the deletion is completed or abandoned.
//
//                                           ____________________________
//  (s): std::shared_ptr<>          Clients: | ScopedCollectionMetadata |
//   _________________________          +----(s) manager  metadata (s)-----------------+
//  | CollectionShardingState |         |    |____________________________|            |
//  | _metadataManager (s)    |         +------(s) manager  metadata (s)------------+  |
//  |____________________|____|         |    |____________________________|         |  |
//   ____________________v________      +--------(s) manager  metadata (s)------+   |  |
//  | MetadataManager             |     |    |____________________________|     |   |  |
//  |                             |<----+                                       |   |  |
//  |                             |       ___________________________  (1 use)  |   |  |
//  | getActiveMetadata():   /----------->| CollectionMetadata       |<---------+   |  |
//  |     back(): [(s),-----/     |       |  ________________________|_            |  |
//  |              (s),-------------------->| CollectionMetadata      | (0 uses)   |  |
//  |  _metadata:  (s)]-----\     |       | |  _______________________|_           |  |
//  |                        \--------------->| CollectionMetadata     |           |  |
//  |  _receivingChunks           |       | | |                        | (2 uses)  |  |
//  |  _rangesToClean:            |       | | |  _tracker:             |<----------+  |
//  |   _________________________ |       | | |   _____________________|<-------------+
//  |  | CollectionRangeDeleter | |       | | |  | Tracker             ||
//  |  |                        | |       | | |  |                     ||
//  |  | _orphans [range,notif, | |       | | |  | usageCounter        ||
//  |  |           range,notif, | |       | | |  | orphans [range,notif,|
//  |  |                  ... ] | |       | | |  |          range,notif,|
//  |  |                        | |       | | |  |               ... ] ||
//  |  |________________________| |       |_| |  |_____________________||
//  |_____________________________|         | |  _chunksMap             |
//                                          |_|  _chunkVersion          |
//                                              | ...                   |
//                                              |_______________________|
//
// Note that _metadata as shown here has its front() at the bottom, back() at the top. As usual,
// new entries are pushed onto the back, popped off the front.
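//
// Illustrative sketch (comments only; not compiled) of how callers typically drive this class.
// The variables `manager`, `newMetadata`, `opCtx`, `coll` and `orphanRange` are hypothetical:
//
//     // A query pins the active chunk mapping for its lifetime:
//     ScopedCollectionMetadata scoped = manager->getActiveMetadata(manager);
//
//     // A metadata refresh installs a newer mapping; running queries keep their snapshots:
//     manager->refreshActiveMetadata(std::move(newMetadata));
//
//     // After a chunk moves away, its documents are queued for deletion once no surviving
//     // snapshot still covers the range:
//     auto notification = manager->cleanUpRange(opCtx, coll, orphanRange, Date_t{});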

namespace mongo {
namespace {

using TaskExecutor = executor::TaskExecutor;
using CallbackArgs = TaskExecutor::CallbackArgs;

MONGO_FP_DECLARE(suspendRangeDeletion);

/**
 * Deletes ranges, in background, until done, normally using a task executor attached to the
 * ShardingState.
 *
 * Each time it completes cleaning up a range, it wakes up clients waiting on completion of that
 * range, which may then verify that their range has no more deletions scheduled, and proceed.
 */
void scheduleCleanup(executor::TaskExecutor* executor,
                     NamespaceString nss,
                     OID epoch,
                     Date_t when) {
    LOG(1) << "Scheduling cleanup on " << nss.ns() << " at " << when;
    auto swCallbackHandle = executor->scheduleWorkAt(
        when, [ executor, nss = std::move(nss), epoch = std::move(epoch) ](auto&) {
            Client::initThreadIfNotAlready("Collection Range Deleter");
            auto uniqueOpCtx = Client::getCurrent()->makeOperationContext();
            auto opCtx = uniqueOpCtx.get();

            MONGO_FAIL_POINT_PAUSE_WHILE_SET(suspendRangeDeletion);

            auto next = CollectionRangeDeleter::cleanUpNextRange(opCtx, nss, epoch);
            if (next) {
                scheduleCleanup(executor, std::move(nss), std::move(epoch), *next);
            }
        });

    if (!swCallbackHandle.isOK()) {
        log() << "Failed to schedule the orphan data cleanup task"
              << causedBy(redact(swCallbackHandle.getStatus()));
    }
}

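// Logs the ids of every cursor currently open against 'nss' (from both the collection's cursor
// manager and the global cursor manager), since any of them may be pinning the metadata snapshot
// that is delaying deletion of 'range'. The caller must hold the collection lock in MODE_IS.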
void logRangeDeletionWaitingOnOpenCursors(const OperationContext* opCtx,
                                          const Collection* collection,
                                          const NamespaceString& nss,
                                          const ChunkRange& range) {
    invariant(opCtx->lockState()->isCollectionLockedForMode(nss.toString(), MODE_IS));
    std::vector<CursorId> cursorIds;
    // If the collection exists, gather a list of all cursors related to the collection.
    if (collection) {
        auto cursorIdsFromCollectionCursorManager =
            collection->getCursorManager()->getCursorIdsForNamespace(nss);
        cursorIds.insert(cursorIds.end(),
                         cursorIdsFromCollectionCursorManager.begin(),
                         cursorIdsFromCollectionCursorManager.end());

        // Aggregation cursors are registered on the global cursor manager. A cursor on
        // the global cursor manager can involve any number of collections, but is registered with
        // the namespace of the aggregate command. This works well for this purpose since any other
        // namespaces would come through a $lookup or $graphLookup which cannot read from sharded
        // collections and so could not be contributing to the delay of a range deletion. The
        // namespace of the aggregate command can be sharded, so we do want to include those
        // aggregation cursors in this message.
        auto cursorIdsFromGlobalCursorManager =
            CursorManager::getGlobalCursorManager()->getCursorIdsForNamespace(nss);
        cursorIds.insert(cursorIds.end(),
                         cursorIdsFromGlobalCursorManager.begin(),
                         cursorIdsFromGlobalCursorManager.end());
    }

    // Join cursorIds as a comma-separated list.
    std::string cursorIdList =
        cursorIds.empty() ? "" : std::accumulate(std::next(cursorIds.begin()),
                                                 cursorIds.end(),
                                                 std::to_string(cursorIds[0]),
                                                 [](std::string a, CursorId b) {
                                                     return std::move(a) + ',' + std::to_string(b);
                                                 });

    log() << "Deletion of " << nss.ns() << " range " << redact(range.toString())
          << " will be scheduled after all possibly dependent queries finish. "
             "All open cursors for namespace: ["
          << cursorIdList << "]";
}

} // namespace

MetadataManager::MetadataManager(ServiceContext* serviceContext,
                                 NamespaceString nss,
                                 TaskExecutor* executor)
    : _serviceContext(serviceContext),
      _nss(std::move(nss)),
      _executor(executor),
      _receivingChunks(SimpleBSONObjComparator::kInstance.makeBSONObjIndexedMap<BSONObj>()) {}

MetadataManager::~MetadataManager() {
    stdx::lock_guard<stdx::mutex> lg(_managerLock);
    _clearAllCleanups(lg);
    _metadata.clear();
}

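// Moves every orphan range still attached to a metadata tracker over to _rangesToClean, then
// cancels all pending deletions with the given status.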
void MetadataManager::_clearAllCleanups(WithLock lock) {
    _clearAllCleanups(
        lock,
        {ErrorCodes::InterruptedDueToReplStateChange,
         str::stream() << "Range deletions in " << _nss.ns()
                       << " abandoned because collection was dropped or became unsharded"});
}

void MetadataManager::_clearAllCleanups(WithLock, Status status) {
    for (auto& tracker : _metadata) {
        std::ignore = _rangesToClean.add(std::move(tracker->orphans));
    }
    _rangesToClean.clear(status);
}

ScopedCollectionMetadata MetadataManager::getActiveMetadata(std::shared_ptr<MetadataManager> self) {
    stdx::lock_guard<stdx::mutex> lg(_managerLock);
    if (!_metadata.empty()) {
        return ScopedCollectionMetadata(lg, std::move(self), _metadata.back());
    }

    return ScopedCollectionMetadata();
}

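// The entry at _metadata.back() is the active mapping; everything in front of it is a snapshot
// kept alive only for queries that started against an older chunk mapping.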
size_t MetadataManager::numberOfMetadataSnapshots() const {
    stdx::lock_guard<stdx::mutex> lg(_managerLock);
    if (_metadata.empty())
        return 0;

    return _metadata.size() - 1;
}

int MetadataManager::numberOfEmptyMetadataSnapshots() const {
    stdx::lock_guard<stdx::mutex> lg(_managerLock);

    int emptyMetadataSnapshots = 0;
    for (const auto& collMetadataTracker : _metadata) {
        if (!collMetadataTracker->metadata)
            emptyMetadataSnapshots++;
    }

    return emptyMetadataSnapshots;
}

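// Installs a refreshed chunk mapping for this collection, or tears down all sharding state when
// 'remoteMetadata' is null (the collection was dropped or became unsharded). Stale or duplicate
// updates are ignored, and an epoch change causes the metadata and pending cleanups to be reset.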
void MetadataManager::refreshActiveMetadata(std::unique_ptr<CollectionMetadata> remoteMetadata) {
    stdx::lock_guard<stdx::mutex> lg(_managerLock);

    // Collection was never sharded in the first place. This check is necessary in order to avoid
    // extraneous logging in the not-a-shard case, because all call sites always try to get the
    // collection sharding information regardless of whether the node is sharded or not.
    if (!remoteMetadata && _metadata.empty()) {
        invariant(_receivingChunks.empty());
        invariant(_rangesToClean.isEmpty());
        return;
    }

    // Collection is becoming unsharded
    if (!remoteMetadata) {
        log() << "Marking collection " << _nss.ns() << " with "
              << redact(_metadata.back()->metadata->toStringBasic()) << " as unsharded";

        _receivingChunks.clear();
        _clearAllCleanups(lg);
        _metadata.clear();
        return;
    }

    // Collection is becoming sharded
    if (_metadata.empty()) {
        log() << "Marking collection " << _nss.ns() << " as sharded with "
              << remoteMetadata->toStringBasic();

        invariant(_receivingChunks.empty());
        _setActiveMetadata(lg, std::move(*remoteMetadata));
        invariant(_rangesToClean.isEmpty());
        return;
    }

    auto* const activeMetadata = &_metadata.back()->metadata.get();

    // If the metadata being installed has a different epoch from ours, this means the collection
    // was dropped and recreated, so we must entirely reset the metadata state
    if (activeMetadata->getCollVersion().epoch() != remoteMetadata->getCollVersion().epoch()) {
        log() << "Overwriting metadata for collection " << _nss.ns() << " from "
              << activeMetadata->toStringBasic() << " to " << remoteMetadata->toStringBasic()
              << " due to epoch change";

        _receivingChunks.clear();
        _setActiveMetadata(lg, std::move(*remoteMetadata));
        _clearAllCleanups(lg);
        return;
    }

    // We already have a newer version
    if (activeMetadata->getCollVersion() >= remoteMetadata->getCollVersion()) {
        LOG(1) << "Ignoring update of active metadata " << activeMetadata->toStringBasic()
               << " with an older " << remoteMetadata->toStringBasic();
        return;
    }

    log() << "Updating collection metadata for " << _nss.ns() << " from "
          << activeMetadata->toStringBasic() << " to " << remoteMetadata->toStringBasic();

    // Resolve any receiving chunks, which might have completed by now
    for (auto it = _receivingChunks.begin(); it != _receivingChunks.end();) {
        const ChunkRange receivingRange(it->first, it->second);

        if (!remoteMetadata->rangeOverlapsChunk(receivingRange)) {
            ++it;
            continue;
        }

        // The remote metadata contains a chunk we were earlier in the process of receiving, so we
        // deem it successfully received
        LOG(2) << "Verified chunk " << redact(receivingRange.toString()) << " for collection "
               << _nss.ns() << " has been migrated to this shard earlier";

        _receivingChunks.erase(it);
        it = _receivingChunks.begin();
    }

    _setActiveMetadata(lg, std::move(*remoteMetadata));
}

void MetadataManager::_setActiveMetadata(WithLock wl, CollectionMetadata newMetadata) {
    _metadata.emplace_back(std::make_shared<CollectionMetadataTracker>(std::move(newMetadata)));
    _retireExpiredMetadata(wl);
}

void MetadataManager::_retireExpiredMetadata(WithLock lock) {
    // Remove entries and schedule orphans for deletion only from the front of _metadata. We cannot
    // remove an entry from the middle of _metadata because a previous entry (whose usageCount is
    // not 0) could have a query that is actually still accessing those documents.
    while (_metadata.size() > 1 && !_metadata.front()->usageCounter) {
        if (!_metadata.front()->orphans.empty()) {
            LOG(0) << "Queries possibly dependent on " << _nss.ns()
                   << " range(s) finished; scheduling ranges for deletion";

            _pushListToClean(lock, std::move(_metadata.front()->orphans));
        }

        _metadata.pop_front();
    }

    // To avoid memory build up of ChunkManager objects, we can clear the CollectionMetadata object
    // in an entry when its usageCount is 0 as long as it is not the last item in _metadata (which
    // is the active metadata). If _metadata is empty, decrementing iter will be out of bounds, so
    // we must check that the size is > 1 as well.
    if (_metadata.size() > 1) {
        auto iter = _metadata.begin();
        while (iter != (--_metadata.end())) {
            if ((*iter)->usageCounter == 0) {
                (*iter)->metadata = boost::none;
            }
            ++iter;
        }
    }
}

void MetadataManager::toBSONPending(BSONArrayBuilder& bb) const {
    stdx::lock_guard<stdx::mutex> lg(_managerLock);

    for (auto it = _receivingChunks.begin(); it != _receivingChunks.end(); ++it) {
        BSONArrayBuilder pendingBB(bb.subarrayStart());
        pendingBB.append(it->first);
        pendingBB.append(it->second);
        pendingBB.done();
    }
}

void MetadataManager::append(BSONObjBuilder* builder) const {
    stdx::lock_guard<stdx::mutex> lg(_managerLock);

    _rangesToClean.append(builder);

    BSONArrayBuilder pcArr(builder->subarrayStart("pendingChunks"));
    for (const auto& entry : _receivingChunks) {
        BSONObjBuilder obj;
        ChunkRange r = ChunkRange(entry.first, entry.second);
        r.append(&obj);
        pcArr.append(obj.done());
    }
    pcArr.done();

    if (_metadata.empty()) {
        return;
    }

    BSONArrayBuilder amrArr(builder->subarrayStart("activeMetadataRanges"));
    for (const auto& entry : _metadata.back()->metadata->getChunks()) {
        BSONObjBuilder obj;
        ChunkRange r = ChunkRange(entry.first, entry.second);
        r.append(&obj);
        amrArr.append(obj.done());
    }
    amrArr.done();
}

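// Wraps a single range in a one-element deletion list, queues it for cleanup, and returns the
// notification the caller can wait on to learn when the deletion completes or is abandoned.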
auto MetadataManager::_pushRangeToClean(WithLock lock, ChunkRange const& range, Date_t when)
    -> CleanupNotification {
    std::list<Deletion> ranges;
    ranges.emplace_back(ChunkRange(range.getMin().getOwned(), range.getMax().getOwned()), when);
    auto& notifn = ranges.back().notification;
    _pushListToClean(lock, std::move(ranges));
    return notifn;
}

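// Hands the ranges over to the collection range deleter. A cleanup task is scheduled only when
// add() returns a start time; the invariant verifies that add() consumed the entire list.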
void MetadataManager::_pushListToClean(WithLock, std::list<Deletion> ranges) {
    auto when = _rangesToClean.add(std::move(ranges));
    if (when) {
        scheduleCleanup(
            _executor, _nss, _metadata.back()->metadata->getCollVersion().epoch(), *when);
    }
    invariant(ranges.empty());
}

auto MetadataManager::beginReceive(ChunkRange const& range) -> CleanupNotification {
    stdx::lock_guard<stdx::mutex> lg(_managerLock);
    invariant(!_metadata.empty());

    if (_overlapsInUseChunk(lg, range)) {
        return Status{ErrorCodes::RangeOverlapConflict,
                      "Documents in target range may still be in use on the destination shard."};
    }

    _receivingChunks.emplace(range.getMin().getOwned(), range.getMax().getOwned());

    log() << "Scheduling deletion of any documents in " << _nss.ns() << " range "
          << redact(range.toString()) << " before migrating in a chunk covering the range";

    return _pushRangeToClean(lg, range, Date_t{});
}

void MetadataManager::forgetReceive(ChunkRange const& range) {
    stdx::lock_guard<stdx::mutex> lg(_managerLock);
    invariant(!_metadata.empty());

    // This is potentially a partially received chunk, which needs to be cleaned up. We know none
    // of these documents are in use, so they can go straight to the deletion queue.
    log() << "Abandoning in-migration of " << _nss.ns() << " range " << range
          << "; scheduling deletion of any documents already copied";

    invariant(!_overlapsInUseChunk(lg, range));

    auto it = _receivingChunks.find(range.getMin());
    invariant(it != _receivingChunks.end());
    _receivingChunks.erase(it);

    _pushRangeToClean(lg, range, Date_t{}).abandon();
}

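// Decides what to do with a range that was migrated off this shard: refuses if the range still
// overlaps a chunk we own or one being migrated in; queues it for (possibly deferred) deletion if
// no running query can depend on it; otherwise attaches it to the newest metadata snapshot that
// still covers it, so deletion waits until the dependent queries finish.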
auto MetadataManager::cleanUpRange(OperationContext* opCtx,
                                   const Collection* collection,
                                   ChunkRange const& range,
                                   Date_t whenToDelete) -> CleanupNotification {
    stdx::lock_guard<stdx::mutex> lg(_managerLock);
    invariant(!_metadata.empty());

    auto* const activeMetadata = _metadata.back().get();
    auto* const overlapMetadata = _findNewestOverlappingMetadata(lg, range);

    if (overlapMetadata == activeMetadata) {
        return Status{ErrorCodes::RangeOverlapConflict,
                      str::stream() << "Requested deletion range overlaps a live shard chunk"};
    }

    if (rangeMapOverlaps(_receivingChunks, range.getMin(), range.getMax())) {
        return Status{ErrorCodes::RangeOverlapConflict,
                      str::stream() << "Requested deletion range overlaps a chunk being"
                                       " migrated in"};
    }

    if (!overlapMetadata) {
        // No running queries can depend on it, so queue it for deletion immediately.
        const auto whenStr = (whenToDelete == Date_t{}) ? "immediate"_sd : "deferred"_sd;
        log() << "Scheduling " << whenStr << " deletion of " << _nss.ns() << " range "
              << redact(range.toString());
        return _pushRangeToClean(lg, range, whenToDelete);
    }

    logRangeDeletionWaitingOnOpenCursors(opCtx, collection, _nss, range);

    // Put it on the oldest metadata permissible; the current one might live a long time.
    auto& orphans = overlapMetadata->orphans;
    orphans.emplace_back(ChunkRange(range.getMin().getOwned(), range.getMax().getOwned()),
                         whenToDelete);
    return orphans.back().notification;
}

std::vector<ScopedCollectionMetadata> MetadataManager::overlappingMetadata(
    std::shared_ptr<MetadataManager> const& self, ChunkRange const& range) {
    stdx::lock_guard<stdx::mutex> lg(_managerLock);
    invariant(!_metadata.empty());

    std::vector<ScopedCollectionMetadata> result;
    result.reserve(_metadata.size());

    // Start with the active metadata
    auto it = _metadata.rbegin();
    if ((*it)->metadata->rangeOverlapsChunk(range)) {
        // We ignore the refcount of the active mapping; effectively, we assume it is in use.
        result.push_back(ScopedCollectionMetadata(lg, self, (*it)));
    }

    // Continue to snapshots
    ++it;
    for (; it != _metadata.rend(); ++it) {
        auto& tracker = *it;

        // We want all the overlapping snapshot mappings still possibly in use by a query.
        if (tracker->usageCounter > 0 && tracker->metadata->rangeOverlapsChunk(range)) {
            result.push_back(ScopedCollectionMetadata(lg, self, tracker));
        }
    }

    return result;
}

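// Counts orphan ranges that are still attached to metadata trackers (i.e. waiting on dependent
// queries), as opposed to ranges already handed to the range deleter.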
size_t MetadataManager::numberOfRangesToCleanStillInUse() const {
    stdx::lock_guard<stdx::mutex> lg(_managerLock);
    size_t count = 0;
    for (auto& tracker : _metadata) {
        count += tracker->orphans.size();
    }
    return count;
}

size_t MetadataManager::numberOfRangesToClean() const {
    stdx::lock_guard<stdx::mutex> lg(_managerLock);
    return _rangesToClean.size();
}

auto MetadataManager::trackOrphanedDataCleanup(ChunkRange const& range) const
    -> boost::optional<CleanupNotification> {
    stdx::lock_guard<stdx::mutex> lg(_managerLock);
    auto overlaps = _overlapsInUseCleanups(lg, range);
    if (overlaps) {
        return overlaps;
    }

    return _rangesToClean.overlaps(range);
}

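// Returns the most recent metadata mapping that overlaps 'range' and may still be in use: the
// active mapping is treated as always in use, while older snapshots count only if their
// usageCounter is nonzero. Returns nullptr when no such mapping exists.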
auto MetadataManager::_findNewestOverlappingMetadata(WithLock, ChunkRange const& range)
    -> CollectionMetadataTracker* {
    invariant(!_metadata.empty());

    auto it = _metadata.rbegin();
    if ((*it)->metadata && (*it)->metadata->rangeOverlapsChunk(range)) {
        return (*it).get();
    }

    ++it;
    for (; it != _metadata.rend(); ++it) {
        auto& tracker = *it;
        if (tracker->usageCounter && tracker->metadata &&
            tracker->metadata->rangeOverlapsChunk(range)) {
            return tracker.get();
        }
    }

    return nullptr;
}

bool MetadataManager::_overlapsInUseChunk(WithLock lk, ChunkRange const& range) {
    auto* cm = _findNewestOverlappingMetadata(lk, range);
    return (cm != nullptr);
}

auto MetadataManager::_overlapsInUseCleanups(WithLock, ChunkRange const& range) const
    -> boost::optional<CleanupNotification> {
    invariant(!_metadata.empty());

    for (auto it = _metadata.rbegin(); it != _metadata.rend(); ++it) {
        const auto& orphans = (*it)->orphans;
        for (auto itOrphans = orphans.rbegin(); itOrphans != orphans.rend(); ++itOrphans) {
            const auto& orphan = *itOrphans;
            if (orphan.range.overlapWith(range)) {
                return orphan.notification;
            }
        }
    }

    return boost::none;
}

boost::optional<ChunkRange> MetadataManager::getNextOrphanRange(BSONObj const& from) const {
    stdx::lock_guard<stdx::mutex> lg(_managerLock);
    invariant(!_metadata.empty());
    return _metadata.back()->metadata->getNextOrphanRange(_receivingChunks, from);
}

ScopedCollectionMetadata::ScopedCollectionMetadata() = default;

ScopedCollectionMetadata::ScopedCollectionMetadata(
    WithLock,
    std::shared_ptr<MetadataManager> metadataManager,
    std::shared_ptr<MetadataManager::CollectionMetadataTracker> metadataTracker)
    : _metadataManager(std::move(metadataManager)), _metadataTracker(std::move(metadataTracker)) {
    invariant(_metadataManager);
    invariant(_metadataTracker);
    ++_metadataTracker->usageCounter;
}

ScopedCollectionMetadata::ScopedCollectionMetadata(ScopedCollectionMetadata&& other) {
    *this = std::move(other);
}

ScopedCollectionMetadata& ScopedCollectionMetadata::operator=(ScopedCollectionMetadata&& other) {
    if (this != &other) {
        _clear();

        _metadataManager = std::move(other._metadataManager);
        _metadataTracker = std::move(other._metadataTracker);

        other._metadataManager = nullptr;
        other._metadataTracker = nullptr;
    }
    return *this;
}

CollectionMetadata* ScopedCollectionMetadata::getMetadata() const {
    return _metadataTracker && _metadataTracker->metadata ? &_metadataTracker->metadata.get()
                                                          : nullptr;
}

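// Extracts the key that identifies 'doc': the shard key fields when the collection is sharded,
// with _id appended unless the shard key already contains it; documents lacking an _id are
// identified by the entire document.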
BSONObj ScopedCollectionMetadata::extractDocumentKey(BSONObj const& doc) const {
    BSONObj key;
    if (*this) {  // is sharded
        auto const& pattern = _metadataTracker->metadata->getChunkManager()->getShardKeyPattern();
        key = dotted_path_support::extractElementsBasedOnTemplate(doc, pattern.toBSON());
        if (pattern.hasId()) {
            return key;
        }
        // else, try to append an _id field from the document.
    }

    if (auto id = doc["_id"]) {
        return key.isEmpty() ? id.wrap() : BSONObjBuilder(std::move(key)).append(id).obj();
    }

    // For legacy documents that lack an _id, use the document itself as its key.
    return doc;
}

void ScopedCollectionMetadata::_clear() {
    if (!_metadataManager) {
        return;
    }

    stdx::lock_guard<stdx::mutex> managerLock(_metadataManager->_managerLock);
    invariant(_metadataTracker->usageCounter != 0);
    if (--_metadataTracker->usageCounter == 0) {
        // MetadataManager doesn't care which usageCounter went to zero. It just retires all that
        // are older than the oldest metadata still in use by queries (some start out at zero, some
        // go to zero but can't be expired yet).
        //
        // Note that new instances of ScopedCollectionMetadata may get attached to _metadata.back(),
        // so its usage count can increase from zero, unlike other reference counts.
        _metadataManager->_retireExpiredMetadata(managerLock);
    }

    _metadataManager.reset();
    _metadataTracker.reset();
}

} // namespace mongo