/**
 *    Copyright (C) 2018-present MongoDB, Inc.
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the Server Side Public License, version 1,
 *    as published by MongoDB, Inc.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    Server Side Public License for more details.
 *
 *    You should have received a copy of the Server Side Public License
 *    along with this program. If not, see
 *    <http://www.mongodb.com/licensing/server-side-public-license>.
 *
 *    As a special exception, the copyright holders give permission to link the
 *    code of portions of this program with the OpenSSL library under certain
 *    conditions as described in each individual source file and distribute
 *    linked combinations including the program with the OpenSSL library. You
 *    must comply with the Server Side Public License in all respects for
 *    all of the code used other than as permitted herein. If you modify file(s)
 *    with this exception, you may extend this exception to your version of the
 *    file(s), but you are not obligated to do so. If you do not wish to do so,
 *    delete this exception statement from your version. If you delete this
 *    exception statement from all source files in the program, then also delete
 *    it in the license file.
 */

#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kDefault

#include "mongo/platform/basic.h"

#include "mongo/db/concurrency/lock_manager.h"

#include <third_party/murmurhash3/MurmurHash3.h>

#include "mongo/base/data_type_endian.h"
#include "mongo/base/data_view.h"
#include "mongo/base/static_assert.h"
#include "mongo/bson/bsonobjbuilder.h"
#include "mongo/config.h"
#include "mongo/db/concurrency/d_concurrency.h"
#include "mongo/db/concurrency/locker.h"
#include "mongo/util/assert_util.h"
#include "mongo/util/log.h"
#include "mongo/util/stringutils.h"
#include "mongo/util/timer.h"

namespace mongo {
namespace {

/**
 * Map of conflicts. 'LockConflictsTable[newMode] & modeMask(existingMode)' being non-zero
 * means that a new request with mode 'newMode' conflicts with an existing request holding
 * mode 'existingMode'.
 */
static const int LockConflictsTable[] = {
    // MODE_NONE
    0,

    // MODE_IS
    (1 << MODE_X),

    // MODE_IX
    (1 << MODE_S) | (1 << MODE_X),

    // MODE_S
    (1 << MODE_IX) | (1 << MODE_X),

    // MODE_X
    (1 << MODE_S) | (1 << MODE_X) | (1 << MODE_IS) | (1 << MODE_IX),
};
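
// Illustrative compile-time checks of the table above: a new MODE_IX request is
// compatible with already granted intent modes, but conflicts with a granted MODE_S.
MONGO_STATIC_ASSERT((LockConflictsTable[MODE_IX] & (1 << MODE_IS)) == 0);
MONGO_STATIC_ASSERT((LockConflictsTable[MODE_IX] & (1 << MODE_S)) != 0);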

// Mask of the two intent lock modes
const uint64_t intentModes = (1 << MODE_IS) | (1 << MODE_IX);

// Ensure we do not add new modes without updating the conflicts table
MONGO_STATIC_ASSERT((sizeof(LockConflictsTable) / sizeof(LockConflictsTable[0])) == LockModesCount);


/**
 * Maps the mode id to a string.
 */
static const char* LockModeNames[] = {"NONE", "IS", "IX", "S", "X"};

static const char* LegacyLockModeNames[] = {"", "r", "w", "R", "W"};

// Ensure we do not add new modes without updating the names array
MONGO_STATIC_ASSERT((sizeof(LockModeNames) / sizeof(LockModeNames[0])) == LockModesCount);
MONGO_STATIC_ASSERT((sizeof(LegacyLockModeNames) / sizeof(LegacyLockModeNames[0])) ==
                    LockModesCount);

// Helper functions for the lock modes
bool conflicts(LockMode newMode, uint32_t existingModesMask) {
    return (LockConflictsTable[newMode] & existingModesMask) != 0;
}

uint32_t modeMask(LockMode mode) {
    return 1 << mode;
}

uint64_t hashStringData(StringData str) {
    char hash[16];
    MurmurHash3_x64_128(str.rawData(), str.size(), 0, hash);
    return static_cast<size_t>(ConstDataView(hash).read<LittleEndian<std::uint64_t>>());
}

/**
 * Maps the resource id to a human-readable string.
 */
static const char* ResourceTypeNames[] = {
    "Invalid", "Global", "MMAPV1Journal", "Database", "Collection", "Metadata", "Mutex"};

// Ensure we do not add new types without updating the names array
MONGO_STATIC_ASSERT((sizeof(ResourceTypeNames) / sizeof(ResourceTypeNames[0])) ==
                    ResourceTypesCount);


/**
 * Maps the LockRequest status to a human-readable string.
 */
static const char* LockRequestStatusNames[] = {
    "new", "granted", "waiting", "converting",
};

// Ensure we do not add new status types without updating the names array
MONGO_STATIC_ASSERT((sizeof(LockRequestStatusNames) / sizeof(LockRequestStatusNames[0])) ==
                    LockRequest::StatusCount);

}  // namespace

/**
 * There is one of these objects for each resource that has a lock request. Empty objects (i.e.
 * LockHead with no requests) are allowed to exist on the lock manager's hash table.
 *
 * The memory and lifetime are controlled entirely by the LockManager class.
 *
 * Not thread-safe and should only be accessed under the LockManager's bucket lock. Must be locked
 * before locking a partition, not after.
 */
struct LockHead {

    /**
     * Used for initialization of a LockHead, which might have been retrieved from the cache.
     * Initialization is done through this method, rather than a constructor, in order to keep
     * the LockHead structure a POD.
     */
    void initNew(ResourceId resId) {
        resourceId = resId;

        grantedList.reset();
        memset(grantedCounts, 0, sizeof(grantedCounts));
        grantedModes = 0;

        conflictList.reset();
        memset(conflictCounts, 0, sizeof(conflictCounts));
        conflictModes = 0;

        conversionsCount = 0;
        compatibleFirstCount = 0;
    }

    /**
     * True iff there may be partitions with granted requests for this resource.
     */
    bool partitioned() const {
        return !partitions.empty();
    }

    /**
     * Locates the request corresponding to the particular locker or returns nullptr. Must be
     * called with the bucket holding this lock head locked.
     */
    LockRequest* findRequest(LockerId lockerId) const {
        // Check the granted queue first
        for (LockRequest* it = grantedList._front; it != nullptr; it = it->next) {
            if (it->locker->getId() == lockerId) {
                return it;
            }
        }

        // Check the conflict queue second
        for (LockRequest* it = conflictList._front; it != nullptr; it = it->next) {
            if (it->locker->getId() == lockerId) {
                return it;
            }
        }

        return nullptr;
    }

    /**
     * Finishes creation of the request and puts it on the lock head's conflict or granted
     * queue. Returns LOCK_WAITING in the conflict case and LOCK_OK otherwise.
     */
    LockResult newRequest(LockRequest* request) {
        invariant(!request->partitionedLock);
        request->lock = this;

        // We cannot set request->partitioned to false, as this might be a migration, in which case
        // access to that field is not protected. The 'partitioned' member instead indicates if a
        // request was initially partitioned.

        // New lock request. Queue after all granted modes and after any already requested
        // conflicting modes
        if (conflicts(request->mode, grantedModes) ||
            (!compatibleFirstCount && conflicts(request->mode, conflictModes))) {
            request->status = LockRequest::STATUS_WAITING;

            // Put it on the conflict queue. Conflicts are granted front to back.
            if (request->enqueueAtFront) {
                conflictList.push_front(request);
            } else {
                conflictList.push_back(request);
            }

            incConflictModeCount(request->mode);

            return LOCK_WAITING;
        }

        // No conflict, new request
        request->status = LockRequest::STATUS_GRANTED;

        grantedList.push_back(request);
        incGrantedModeCount(request->mode);

        if (request->compatibleFirst) {
            compatibleFirstCount++;
        }

        return LOCK_OK;
    }
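
    // Worked example of the decision above: with MODE_S already granted and no
    // compatible-first requests present, a new MODE_IX request conflicts
    // (LockConflictsTable[MODE_IX] includes MODE_S) and goes on the conflict queue as
    // STATUS_WAITING, while a new MODE_IS request is granted immediately.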

    /**
     * Lock each partitioned LockHead in turn, and move any (granted) intent mode requests for
     * lock->resourceId to lock, which must itself already be locked.
     */
    void migratePartitionedLockHeads();

    // Methods to maintain the granted queue
    void incGrantedModeCount(LockMode mode) {
        invariant(grantedCounts[mode] >= 0);
        if (++grantedCounts[mode] == 1) {
            invariant((grantedModes & modeMask(mode)) == 0);
            grantedModes |= modeMask(mode);
        }
    }

    void decGrantedModeCount(LockMode mode) {
        invariant(grantedCounts[mode] >= 1);
        if (--grantedCounts[mode] == 0) {
            invariant((grantedModes & modeMask(mode)) == modeMask(mode));
            grantedModes &= ~modeMask(mode);
        }
    }

    // Methods to maintain the conflict queue
    void incConflictModeCount(LockMode mode) {
        invariant(conflictCounts[mode] >= 0);
        if (++conflictCounts[mode] == 1) {
            invariant((conflictModes & modeMask(mode)) == 0);
            conflictModes |= modeMask(mode);
        }
    }

    void decConflictModeCount(LockMode mode) {
        invariant(conflictCounts[mode] >= 1);
        if (--conflictCounts[mode] == 0) {
            invariant((conflictModes & modeMask(mode)) == modeMask(mode));
            conflictModes &= ~modeMask(mode);
        }
    }

    // Id of the resource which is protected by this lock. Initialized at construction time and
    // does not change.
    ResourceId resourceId;

    //
    // Granted queue
    //

    // Doubly-linked list of requests, which have been granted. Newly granted requests go to
    // the end of the queue. Conversion requests are granted from the beginning forward.
    LockRequestList grantedList;

    // Counts the granted and conversion requests for each of the supported lock modes. These
    // counts should exactly match the aggregated modes on the granted list.
    uint32_t grantedCounts[LockModesCount];

    // Bit-mask of the granted + converting modes on the granted queue. Maintained in lock-step
    // with the grantedCounts array.
    uint32_t grantedModes;

    //
    // Conflict queue
    //

    // Doubly-linked list of requests, which have not been granted yet because they conflict
    // with the set of granted modes. Requests are queued at the end of the queue and are
    // granted from the beginning forward, which gives these locks FIFO ordering. Exceptions
    // are high-priority locks, such as the MMAP V1 flush lock.
    LockRequestList conflictList;

    // Counts the conflicting requests for each of the lock modes. These counts should exactly
    // match the aggregated modes on the conflicts list.
    uint32_t conflictCounts[LockModesCount];

    // Bit-mask of the conflict modes on the conflict queue. Maintained in lock-step with the
    // conflictCounts array.
    uint32_t conflictModes;

    // References partitions that may have PartitionedLockHeads for this LockHead.
    // Non-empty implies the lock has no conflicts and only has intent modes as grantedModes.
    // TODO: Remove this vector and make LockHead a POD
    std::vector<LockManager::Partition*> partitions;

    //
    // Conversion
    //

    // Counts the number of requests on the granted queue, which have requested any kind of
    // conflicting conversion and are blocked (i.e. all requests which are currently
    // STATUS_CONVERTING). This is an optimization for unlocking in that we do not need to
    // check the granted queue for requests in STATUS_CONVERTING if this count is zero. This
    // saves cycles in the regular case and only burdens the less-frequent lock upgrade case.
    uint32_t conversionsCount;

    // Counts the number of requests on the granted queue, which have requested that the policy
    // be switched to compatible-first. As long as this value is > 0, the policy will stay
    // compatible-first.
    uint32_t compatibleFirstCount;
};

/**
 * The PartitionedLockHead allows optimizing the case where requests overwhelmingly use
 * the intent lock modes MODE_IS and MODE_IX, which are compatible with each other.
 * Having to use a single LockHead causes contention where none would be needed.
 * So, each Locker is associated with a specific partition containing a mapping
 * of resourceId to PartitionedLockHead.
 *
 * As long as all lock requests for a resource have an intent mode, as opposed to a conflicting
 * mode, its LockHead may reference PartitionedLockHeads. A partitioned LockHead will not have
 * any conflicts. The total set of granted requests (with intent mode) is the union of
 * its grantedList and all grantedLists in PartitionedLockHeads.
 *
 * The existence of a PartitionedLockHead for a resource implies that its LockHead is
 * partitioned. If a conflicting request is made on a LockHead, all requests from
 * PartitionedLockHeads are migrated to that LockHead and the LockHead is no longer partitioned.
 *
 * Not thread-safe, must be accessed under its partition lock.
 * May not lock a LockManager bucket while holding a partition lock.
 */
struct PartitionedLockHead {

    void initNew(ResourceId resId) {
        grantedList.reset();
    }

    void newRequest(LockRequest* request) {
        invariant(request->partitioned);
        invariant(!request->lock);
        request->partitionedLock = this;
        request->status = LockRequest::STATUS_GRANTED;

        grantedList.push_back(request);
    }

    // Doubly-linked list of requests, which have been granted. Newly granted requests go to the
    // end of the queue. The PartitionedLockHead never contains anything but granted requests with
    // intent modes.
    LockRequestList grantedList;
};
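
// Example flow, for illustration: two lockers each acquire MODE_IS on the same
// resource. Each request is granted in its own locker's partition, so neither
// touches the shared LockHead. When a third locker later requests MODE_X, the
// LockHead sees that it is partitioned and calls migratePartitionedLockHeads()
// below, draining both partitions into its grantedList before the X request is
// evaluated against the now-complete set of granted modes.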

void LockHead::migratePartitionedLockHeads() {
    invariant(partitioned());

    // There can't be non-intent modes or conflicts when the lock is partitioned
    invariant(!(grantedModes & ~intentModes) && !conflictModes);

    // Migration time: lock each partition in turn and transfer its requests, if any
    while (partitioned()) {
        LockManager::Partition* partition = partitions.back();
        stdx::lock_guard<SimpleMutex> scopedLock(partition->mutex);

        LockManager::Partition::Map::iterator it = partition->data.find(resourceId);
        if (it != partition->data.end()) {
            PartitionedLockHead* partitionedLock = it->second;

            while (!partitionedLock->grantedList.empty()) {
                LockRequest* request = partitionedLock->grantedList._front;
                partitionedLock->grantedList.remove(request);
                request->partitionedLock = nullptr;
                // Ordering is important here, as the next/prev fields are shared.
                // Note that newRequest() will preserve the recursiveCount in this case
                LockResult res = newRequest(request);
                invariant(res == LOCK_OK);  // Lock must still be granted
            }
            partition->data.erase(it);
            delete partitionedLock;
        }
        // Don't pop_back too early, as otherwise the lock would be considered not partitioned in
        // newRequest().
        partitions.pop_back();
    }
}

//
// LockManager
//

// Have more buckets than CPUs to reduce contention on locks and caches
const unsigned LockManager::_numLockBuckets(128);

// Balance scalability of intent locks against potential added cost of conflicting locks.
// The exact value doesn't appear very important, but should be a power of two
const unsigned LockManager::_numPartitions = 32;
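
// Since both counts are powers of two, the modulo computations in _getBucket() and
// _getPartition() below reduce to cheap bit masks (e.g. x % 128 == x & 127).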

LockManager::LockManager() {
    _lockBuckets = new LockBucket[_numLockBuckets];
    _partitions = new Partition[_numPartitions];
}

LockManager::~LockManager() {
    cleanupUnusedLocks();

    for (unsigned i = 0; i < _numLockBuckets; i++) {
        // TODO: dump more information about the non-empty bucket to see what locks were leaked
        invariant(_lockBuckets[i].data.empty());
    }

    delete[] _lockBuckets;
    delete[] _partitions;
}

LockResult LockManager::lock(ResourceId resId, LockRequest* request, LockMode mode) {
    // Sanity check that requests are not being reused without proper cleanup
    invariant(request->status == LockRequest::STATUS_NEW);
    invariant(request->recursiveCount == 1);

    request->partitioned = (mode == MODE_IX || mode == MODE_IS);
    request->mode = mode;

    // For intent modes, try the PartitionedLockHead
    if (request->partitioned) {
        Partition* partition = _getPartition(request);
        stdx::lock_guard<SimpleMutex> scopedLock(partition->mutex);

        // Fast path for intent locks
        PartitionedLockHead* partitionedLock = partition->find(resId);

        if (partitionedLock) {
            partitionedLock->newRequest(request);
            return LOCK_OK;
        }
        // Unsuccessful: there was no PartitionedLockHead yet, so fall through to the regular
        // LockHead below, no longer holding the partition lock. It is OK for requests with
        // intent modes to be on both a PartitionedLockHead and a regular LockHead, so the race
        // here is benign.
    }

    // Use regular LockHead, maybe start partitioning
    LockBucket* bucket = _getBucket(resId);
    stdx::lock_guard<SimpleMutex> scopedLock(bucket->mutex);

    LockHead* lock = bucket->findOrInsert(resId);

    // Start a partitioned lock if possible
    if (request->partitioned && !(lock->grantedModes & (~intentModes)) && !lock->conflictModes) {
        Partition* partition = _getPartition(request);
        stdx::lock_guard<SimpleMutex> scopedLock(partition->mutex);
        PartitionedLockHead* partitionedLock = partition->findOrInsert(resId);
        invariant(partitionedLock);
        lock->partitions.push_back(partition);
        partitionedLock->newRequest(request);
        return LOCK_OK;
    }

    // For the first lock with a non-intent mode, migrate requests from partitioned lock heads
    if (lock->partitioned()) {
        lock->migratePartitionedLockHeads();
    }

    request->partitioned = false;
    return lock->newRequest(request);
}
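
// A minimal usage sketch (hypothetical caller, for illustration only; the real callers
// live in the Locker implementations):
//
//     LockRequest request;
//     request.initNew(locker, &notifier);  // locker/notifier provided by the caller
//     if (lockManager.lock(resId, &request, MODE_IX) == LOCK_WAITING) {
//         // wait for 'notifier' to be signalled with LOCK_OK, then proceed
//     }
//     ...
//     lockManager.unlock(&request);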

LockResult LockManager::convert(ResourceId resId, LockRequest* request, LockMode newMode) {
    // If we are here, we already hold the lock in some mode. In order to keep it simple, we do
    // not allow requesting a conversion while a lock is already waiting or pending conversion.
    invariant(request->status == LockRequest::STATUS_GRANTED);
    invariant(request->recursiveCount > 0);

    request->recursiveCount++;

    // Fast path for acquiring the same lock multiple times in a mode that is already covered by
    // the currently held mode. It is safe to do this without locking, because 1) all calls for
    // the same lock request must be done on the same thread and 2) if there are lock requests
    // hanging off a given LockHead, then this lock will never disappear.
    if ((LockConflictsTable[request->mode] | LockConflictsTable[newMode]) ==
        LockConflictsTable[request->mode]) {
        return LOCK_OK;
    }
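
    // Worked example of the coverage test: holding MODE_X and requesting MODE_IX,
    // LockConflictsTable[MODE_X] already includes everything in LockConflictsTable[MODE_IX],
    // so the union equals LockConflictsTable[MODE_X] and the request is satisfied without
    // touching the bucket.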

    // TODO: For the time being we do not need conversions between unrelated lock modes (i.e.,
    // modes which both add and remove to the conflicts set), so these are not implemented yet
    // (e.g., S -> IX).
    invariant((LockConflictsTable[request->mode] | LockConflictsTable[newMode]) ==
              LockConflictsTable[newMode]);

    LockBucket* bucket = _getBucket(resId);
    stdx::lock_guard<SimpleMutex> scopedLock(bucket->mutex);

    LockBucket::Map::iterator it = bucket->data.find(resId);
    invariant(it != bucket->data.end());

    LockHead* const lock = it->second;

    if (lock->partitioned()) {
        lock->migratePartitionedLockHeads();
    }

    // Construct granted mask without our current mode, so that it is not counted as
    // conflicting
    uint32_t grantedModesWithoutCurrentRequest = 0;

    // We start the counting at 1 below, because LockModesCount also includes MODE_NONE
    // at position 0, which can never be acquired/granted.
    for (uint32_t i = 1; i < LockModesCount; i++) {
        const uint32_t currentRequestHolds = (request->mode == static_cast<LockMode>(i) ? 1 : 0);

        if (lock->grantedCounts[i] > currentRequestHolds) {
            grantedModesWithoutCurrentRequest |= modeMask(static_cast<LockMode>(i));
        }
    }

    // This check favours conversion requests over pending requests. For example:
    //
    // T1 requests lock L in IS
    // T2 requests lock L in X
    // T1 then upgrades L from IS -> S
    //
    // Because the check does not look into the conflict modes bitmap, it will grant L to
    // T1 in S mode, instead of blocking, which would otherwise cause deadlock.
    if (conflicts(newMode, grantedModesWithoutCurrentRequest)) {
        request->status = LockRequest::STATUS_CONVERTING;
        request->convertMode = newMode;

        lock->conversionsCount++;
        lock->incGrantedModeCount(request->convertMode);

        return LOCK_WAITING;
    } else {  // No conflict, existing request
        lock->incGrantedModeCount(newMode);
        lock->decGrantedModeCount(request->mode);
        request->mode = newMode;

        return LOCK_OK;
    }
}

bool LockManager::unlock(LockRequest* request) {
    // Fast path for decrementing multiple references of the same lock. It is safe to do this
    // without locking, because 1) all calls for the same lock request must be done on the same
    // thread and 2) if there are lock requests hanging off a given LockHead, then this lock
    // will never disappear.
    invariant(request->recursiveCount > 0);
    request->recursiveCount--;
    if ((request->status == LockRequest::STATUS_GRANTED) && (request->recursiveCount > 0)) {
        return false;
    }

    if (request->partitioned) {
        // Unlocking a lock that was acquired as partitioned. The lock request may since have
        // moved to the lock head, but there is no safe way to find out without synchronizing
        // through the partition mutex. Migrations are expected to be rare.
        invariant(request->status == LockRequest::STATUS_GRANTED ||
                  request->status == LockRequest::STATUS_CONVERTING);
        Partition* partition = _getPartition(request);
        stdx::lock_guard<SimpleMutex> scopedLock(partition->mutex);
        // Fast path: still partitioned.
        if (request->partitionedLock) {
            request->partitionedLock->grantedList.remove(request);
            return true;
        }

        // Not partitioned anymore; fall through to the regular case.
    }
    invariant(request->lock);

    LockHead* lock = request->lock;
    LockBucket* bucket = _getBucket(lock->resourceId);
    stdx::lock_guard<SimpleMutex> scopedLock(bucket->mutex);

    if (request->status == LockRequest::STATUS_GRANTED) {
        // This releases a currently held lock and is the most common path, so it should be
        // as efficient as possible. The fast path for decrementing multiple references did
        // already ensure request->recursiveCount == 0.

        // Remove from the granted list
        lock->grantedList.remove(request);
        lock->decGrantedModeCount(request->mode);

        if (request->compatibleFirst) {
            invariant(lock->compatibleFirstCount > 0);
            lock->compatibleFirstCount--;
            invariant(lock->compatibleFirstCount == 0 || !lock->grantedList.empty());
        }

        _onLockModeChanged(lock, lock->grantedCounts[request->mode] == 0);
    } else if (request->status == LockRequest::STATUS_WAITING) {
        // This cancels a pending lock request
        invariant(request->recursiveCount == 0);

        lock->conflictList.remove(request);
        lock->decConflictModeCount(request->mode);

        _onLockModeChanged(lock, true);
    } else if (request->status == LockRequest::STATUS_CONVERTING) {
        // This cancels a pending convert request
        invariant(request->recursiveCount > 0);
        invariant(lock->conversionsCount > 0);

        // Lock only goes from GRANTED to CONVERTING, so cancelling the conversion request
        // brings it back to the previous granted mode.
        request->status = LockRequest::STATUS_GRANTED;

        lock->conversionsCount--;
        lock->decGrantedModeCount(request->convertMode);

        // Save the convert mode before clearing it, so that the check below examines the mode
        // whose count was just decremented (reading request->convertMode after the reset would
        // always examine MODE_NONE).
        const LockMode cancelledConvertMode = request->convertMode;
        request->convertMode = MODE_NONE;

        _onLockModeChanged(lock, lock->grantedCounts[cancelledConvertMode] == 0);
    } else {
        // Invalid request status
        invariant(false);
    }

    return (request->recursiveCount == 0);
}

void LockManager::downgrade(LockRequest* request, LockMode newMode) {
    invariant(request->lock);
    invariant(request->status == LockRequest::STATUS_GRANTED);
    invariant(request->recursiveCount > 0);

    // The conflict set of the newMode should be a subset of the conflict set of the old mode.
    // Can't downgrade from S -> IX for example.
    invariant((LockConflictsTable[request->mode] | LockConflictsTable[newMode]) ==
              LockConflictsTable[request->mode]);

    LockHead* lock = request->lock;

    LockBucket* bucket = _getBucket(lock->resourceId);
    stdx::lock_guard<SimpleMutex> scopedLock(bucket->mutex);

    lock->incGrantedModeCount(newMode);
    lock->decGrantedModeCount(request->mode);
    request->mode = newMode;

    _onLockModeChanged(lock, true);
}

void LockManager::cleanupUnusedLocks() {
    for (unsigned i = 0; i < _numLockBuckets; i++) {
        LockBucket* bucket = &_lockBuckets[i];
        stdx::lock_guard<SimpleMutex> scopedLock(bucket->mutex);
        _cleanupUnusedLocksInBucket(bucket);
    }
}

void LockManager::_cleanupUnusedLocksInBucket(LockBucket* bucket) {
    LockBucket::Map::iterator it = bucket->data.begin();
    size_t deletedLockHeads = 0;
    while (it != bucket->data.end()) {
        LockHead* lock = it->second;

        if (lock->partitioned()) {
            lock->migratePartitionedLockHeads();
        }

        if (lock->grantedModes == 0) {
            invariant(lock->grantedModes == 0);
            invariant(lock->grantedList._front == nullptr);
            invariant(lock->grantedList._back == nullptr);
            invariant(lock->conflictModes == 0);
            invariant(lock->conflictList._front == nullptr);
            invariant(lock->conflictList._back == nullptr);
            invariant(lock->conversionsCount == 0);
            invariant(lock->compatibleFirstCount == 0);

            bucket->data.erase(it++);
            deletedLockHeads++;
            delete lock;
        } else {
            it++;
        }
    }
}

void LockManager::_onLockModeChanged(LockHead* lock, bool checkConflictQueue) {
    // Unblock any converting requests (because conversions are still counted as granted and
    // are on the granted queue).
    for (LockRequest* iter = lock->grantedList._front;
         (iter != nullptr) && (lock->conversionsCount > 0);
         iter = iter->next) {
        // Conversion requests are going in a separate queue
        if (iter->status == LockRequest::STATUS_CONVERTING) {
            invariant(iter->convertMode != 0);

            // Construct granted mask without our current mode, so that it is not accounted as
            // a conflict
            uint32_t grantedModesWithoutCurrentRequest = 0;

            // We start the counting at 1 below, because LockModesCount also includes
            // MODE_NONE at position 0, which can never be acquired/granted.
            for (uint32_t i = 1; i < LockModesCount; i++) {
                const uint32_t currentRequestHolds =
                    (iter->mode == static_cast<LockMode>(i) ? 1 : 0);

                const uint32_t currentRequestWaits =
                    (iter->convertMode == static_cast<LockMode>(i) ? 1 : 0);

                // We cannot both hold and wait on the same lock mode
                invariant(currentRequestHolds + currentRequestWaits <= 1);

                if (lock->grantedCounts[i] > (currentRequestHolds + currentRequestWaits)) {
                    grantedModesWithoutCurrentRequest |= modeMask(static_cast<LockMode>(i));
                }
            }

            if (!conflicts(iter->convertMode, grantedModesWithoutCurrentRequest)) {
                lock->conversionsCount--;
                lock->decGrantedModeCount(iter->mode);
                iter->status = LockRequest::STATUS_GRANTED;
                iter->mode = iter->convertMode;
                iter->convertMode = MODE_NONE;

                iter->notify->notify(lock->resourceId, LOCK_OK);
            }
        }
    }

    // Grant any conflicting requests, which might now be unblocked. Note that the loop below
    // slightly violates fairness in that it will grant *all* compatible requests in the queue,
    // even though there might be conflicting ones interspersed between them. For example, assume
    // that an X lock was just freed and the conflict queue looks like this:
    //
    //      IS -> IS -> X -> X -> S -> IS
    //
    // In strict FIFO, we should grant the first two IS modes and then stop when we reach the first
    // X mode (the third request on the queue). However, the loop below would actually grant all IS
    // + S modes and once they all drain it will grant X. The reason for this behaviour is
    // increasing system throughput in the scenario where mutually compatible requests are
    // interspersed with conflicting ones. For example, this would be a worst-case scenario for
    // strict FIFO, because it would make the execution sequential:
    //
    //      S -> X -> S -> X -> S -> X

    LockRequest* iterNext = nullptr;

    bool newlyCompatibleFirst = false;  // Set on enabling compatibleFirst mode.
    for (LockRequest* iter = lock->conflictList._front; (iter != nullptr) && checkConflictQueue;
         iter = iterNext) {
        invariant(iter->status == LockRequest::STATUS_WAITING);

        // Store the actual next pointer, because we muck with the iter below and move it to
        // the granted queue.
        iterNext = iter->next;

        if (conflicts(iter->mode, lock->grantedModes)) {
            // If iter doesn't have a previous pointer, this means that it is at the front of the
            // queue. If we continue scanning the queue beyond this point, we will starve it by
            // granting more and more requests. However, if we newly transition to compatibleFirst
            // mode, grant any waiting compatible requests.
            if (!iter->prev && !newlyCompatibleFirst) {
                break;
            }
            continue;
        }

        iter->status = LockRequest::STATUS_GRANTED;

        // Remove from the conflicts list
        lock->conflictList.remove(iter);
        lock->decConflictModeCount(iter->mode);

        // Add to the granted list
        lock->grantedList.push_back(iter);
        lock->incGrantedModeCount(iter->mode);

        if (iter->compatibleFirst) {
            newlyCompatibleFirst |= (lock->compatibleFirstCount++ == 0);
        }

        iter->notify->notify(lock->resourceId, LOCK_OK);

        // Small optimization - nothing is compatible with a newly granted MODE_X, so no point in
        // looking further in the conflict queue. Conflicting MODE_X requests are skipped above.
        if (iter->mode == MODE_X) {
            break;
        }
    }

    // This is a convenient place to check that the state of the two request queues is in sync
    // with the bitmask on the modes.
    invariant((lock->grantedModes == 0) ^ (lock->grantedList._front != nullptr));
    invariant((lock->conflictModes == 0) ^ (lock->conflictList._front != nullptr));
}

LockManager::LockBucket* LockManager::_getBucket(ResourceId resId) const {
    return &_lockBuckets[resId % _numLockBuckets];
}

LockManager::Partition* LockManager::_getPartition(LockRequest* request) const {
    return &_partitions[request->locker->getId() % _numPartitions];
}

void LockManager::dump() const {
    log() << "Dumping LockManager @ " << static_cast<const void*>(this) << '\n';

    for (unsigned i = 0; i < _numLockBuckets; i++) {
        LockBucket* bucket = &_lockBuckets[i];
        stdx::lock_guard<SimpleMutex> scopedLock(bucket->mutex);

        if (!bucket->data.empty()) {
            _dumpBucket(bucket);
        }
    }
}

void LockManager::_dumpBucketToBSON(const std::map<LockerId, BSONObj>& lockToClientMap,
                                    const LockBucket* bucket,
                                    BSONObjBuilder* result) {
    for (auto& bucketEntry : bucket->data) {
        const LockHead* lock = bucketEntry.second;

        if (lock->grantedList.empty()) {
            // If there are no granted requests, this lock is empty, so no need to print it
            continue;
        }

        result->append("resourceId", lock->resourceId.toString());

        BSONArrayBuilder grantedLocks;
        for (const LockRequest* iter = lock->grantedList._front; iter != nullptr;
             iter = iter->next) {
            _buildBucketBSON(iter, lockToClientMap, bucket, &grantedLocks);
        }
        result->append("granted", grantedLocks.arr());

        BSONArrayBuilder pendingLocks;
        for (const LockRequest* iter = lock->conflictList._front; iter != nullptr;
             iter = iter->next) {
            _buildBucketBSON(iter, lockToClientMap, bucket, &pendingLocks);
        }
        result->append("pending", pendingLocks.arr());
    }
}

void LockManager::_buildBucketBSON(const LockRequest* iter,
                                   const std::map<LockerId, BSONObj>& lockToClientMap,
                                   const LockBucket* bucket,
                                   BSONArrayBuilder* locks) {
    BSONObjBuilder info;
    info.append("mode", modeName(iter->mode));
    info.append("convertMode", modeName(iter->convertMode));
    info.append("enqueueAtFront", iter->enqueueAtFront);
    info.append("compatibleFirst", iter->compatibleFirst);

    LockerId lockerId = iter->locker->getId();
    std::map<LockerId, BSONObj>::const_iterator it = lockToClientMap.find(lockerId);
    if (it != lockToClientMap.end()) {
        info.appendElements(it->second);
    }
    locks->append(info.obj());
}

void LockManager::getLockInfoBSON(const std::map<LockerId, BSONObj>& lockToClientMap,
                                  BSONObjBuilder* result) {
    BSONArrayBuilder lockInfo;
    for (unsigned i = 0; i < _numLockBuckets; i++) {
        LockBucket* bucket = &_lockBuckets[i];
        stdx::lock_guard<SimpleMutex> scopedLock(bucket->mutex);

        _cleanupUnusedLocksInBucket(bucket);
        if (!bucket->data.empty()) {
            BSONObjBuilder b;
            _dumpBucketToBSON(lockToClientMap, bucket, &b);
            lockInfo.append(b.obj());
        }
    }
    result->append("lockInfo", lockInfo.arr());
}

void LockManager::_dumpBucket(const LockBucket* bucket) const {
    for (LockBucket::Map::const_iterator it = bucket->data.begin(); it != bucket->data.end();
         it++) {
        const LockHead* lock = it->second;

        if (lock->grantedList.empty()) {
            // If there are no granted requests, this lock is empty, so no need to print it
            continue;
        }

        StringBuilder sb;
        sb << "Lock @ " << lock << ": " << lock->resourceId.toString() << '\n';

        sb << "GRANTED:\n";
        for (const LockRequest* iter = lock->grantedList._front; iter != nullptr;
             iter = iter->next) {
            std::stringstream threadId;
            threadId << iter->locker->getThreadId() << " | " << std::showbase << std::hex
                     << iter->locker->getThreadId();
            sb << '\t' << "LockRequest " << iter->locker->getId() << " @ " << iter->locker << ": "
               << "Mode = " << modeName(iter->mode) << "; "
               << "Thread = " << threadId.str() << "; "
               << "ConvertMode = " << modeName(iter->convertMode) << "; "
               << "EnqueueAtFront = " << iter->enqueueAtFront << "; "
               << "CompatibleFirst = " << iter->compatibleFirst << "; " << '\n';
        }

        sb << "PENDING:\n";
        for (const LockRequest* iter = lock->conflictList._front; iter != nullptr;
             iter = iter->next) {
            std::stringstream threadId;
            threadId << iter->locker->getThreadId() << " | " << std::showbase << std::hex
                     << iter->locker->getThreadId();
            sb << '\t' << "LockRequest " << iter->locker->getId() << " @ " << iter->locker << ": "
               << "Mode = " << modeName(iter->mode) << "; "
               << "Thread = " << threadId.str() << "; "
               << "ConvertMode = " << modeName(iter->convertMode) << "; "
               << "EnqueueAtFront = " << iter->enqueueAtFront << "; "
               << "CompatibleFirst = " << iter->compatibleFirst << "; " << '\n';
        }

        sb << "-----------------------------------------------------------\n";
        log() << sb.str();
    }
}

PartitionedLockHead* LockManager::Partition::find(ResourceId resId) {
    Map::iterator it = data.find(resId);
    return it == data.end() ? nullptr : it->second;
}

PartitionedLockHead* LockManager::Partition::findOrInsert(ResourceId resId) {
    PartitionedLockHead* lock;
    Map::iterator it = data.find(resId);
    if (it == data.end()) {
        lock = new PartitionedLockHead();
        lock->initNew(resId);

        data.insert(Map::value_type(resId, lock));
    } else {
        lock = it->second;
    }
    return lock;
}

LockHead* LockManager::LockBucket::findOrInsert(ResourceId resId) {
    LockHead* lock;
    Map::iterator it = data.find(resId);
    if (it == data.end()) {
        lock = new LockHead();
        lock->initNew(resId);

        data.insert(Map::value_type(resId, lock));
    } else {
        lock = it->second;
    }
    return lock;
}

//
// DeadlockDetector
//

DeadlockDetector::DeadlockDetector(const LockManager& lockMgr, const Locker* initialLocker)
    : _lockMgr(lockMgr), _initialLockerId(initialLocker->getId()), _foundCycle(false) {
    const ResourceId resId = initialLocker->getWaitingResource();

    // If there is no resource being waited on, there is nothing to do
    if (resId.isValid()) {
        _queue.push_front(UnprocessedNode(_initialLockerId, resId));
    }
}

bool DeadlockDetector::next() {
    if (_queue.empty())
        return false;

    UnprocessedNode front = _queue.front();
    _queue.pop_front();

    _processNextNode(front);

    return !_queue.empty();
}

bool DeadlockDetector::hasCycle() const {
    invariant(_queue.empty());

    return _foundCycle;
}
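
// Typical driving loop, sketched for illustration only (the real caller lives in the
// lock wait handling of the Locker code):
//
//     DeadlockDetector wfg(lockMgr, blockedLocker);
//     while (wfg.next()) {
//         // keep expanding the wait-for graph until the queue drains
//     }
//     if (wfg.hasCycle()) {
//         // a deadlock involving 'blockedLocker' was detected
//     }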

std::string DeadlockDetector::toString() const {
    StringBuilder sb;

    for (WaitForGraph::const_iterator it = _graph.begin(); it != _graph.end(); it++) {
        sb << "Locker " << it->first << " waits for resource " << it->second.resId.toString()
           << " held by [";

        const ConflictingOwnersList owners = it->second.owners;
        for (ConflictingOwnersList::const_iterator itW = owners.begin(); itW != owners.end();
             itW++) {
            sb << *itW << ", ";
        }

        sb << "]\n";
    }

    return sb.str();
}

void DeadlockDetector::_processNextNode(const UnprocessedNode& node) {
    // Locate the request
    LockManager::LockBucket* bucket = _lockMgr._getBucket(node.resId);
    stdx::lock_guard<SimpleMutex> scopedLock(bucket->mutex);

    LockManager::LockBucket::Map::const_iterator iter = bucket->data.find(node.resId);
    if (iter == bucket->data.end()) {
        return;
    }

    const LockHead* lock = iter->second;

    LockRequest* request = lock->findRequest(node.lockerId);

    // It is possible that a request which was thought to be waiting suddenly became
    // granted, so check that before proceeding
    if (!request || (request->status == LockRequest::STATUS_GRANTED)) {
        return;
    }

    std::pair<WaitForGraph::iterator, bool> val =
        _graph.insert(WaitForGraphPair(node.lockerId, Edges(node.resId)));
    if (!val.second) {
        // We already saw this locker id, which means we have a cycle.
        if (!_foundCycle) {
            _foundCycle = (node.lockerId == _initialLockerId);
        }

        return;
    }

    Edges& edges = val.first->second;

    bool seen = false;
    for (LockRequest* it = lock->grantedList._back; it != nullptr; it = it->prev) {
        // We can't conflict with ourselves
        if (it == request) {
            seen = true;
            continue;
        }

        // If we are a regular conflicting request, both granted and conversion modes need to
        // be checked for conflict, since conversions will be granted first.
        if (request->status == LockRequest::STATUS_WAITING) {
            if (conflicts(request->mode, modeMask(it->mode)) ||
                conflicts(request->mode, modeMask(it->convertMode))) {
                const LockerId lockerId = it->locker->getId();
                const ResourceId waitResId = it->locker->getWaitingResource();

                if (waitResId.isValid()) {
                    _queue.push_front(UnprocessedNode(lockerId, waitResId));
                    edges.owners.push_back(lockerId);
                }
            }

            continue;
        }

        // If we are a conversion request, only requests that precede us need to be
        // accounted for.
        invariant(request->status == LockRequest::STATUS_CONVERTING);

        if (conflicts(request->convertMode, modeMask(it->mode)) ||
            (seen && conflicts(request->convertMode, modeMask(it->convertMode)))) {
            const LockerId lockerId = it->locker->getId();
            const ResourceId waitResId = it->locker->getWaitingResource();

            if (waitResId.isValid()) {
                _queue.push_front(UnprocessedNode(lockerId, waitResId));
                edges.owners.push_back(lockerId);
            }
        }
    }

    // All conflicting waits, which would be granted before us
    for (LockRequest* it = request->prev;
         (request->status == LockRequest::STATUS_WAITING) && (it != nullptr);
         it = it->prev) {
        // We started from the previous element, so we should never see ourselves
        invariant(it != request);

        if (conflicts(request->mode, modeMask(it->mode))) {
            const LockerId lockerId = it->locker->getId();
            const ResourceId waitResId = it->locker->getWaitingResource();

            if (waitResId.isValid()) {
                _queue.push_front(UnprocessedNode(lockerId, waitResId));
                edges.owners.push_back(lockerId);
            }
        }
    }
}


//
// ResourceId
//

uint64_t ResourceId::fullHash(ResourceType type, uint64_t hashId) {
    return (static_cast<uint64_t>(type) << (64 - resourceTypeBits)) +
        (hashId & (std::numeric_limits<uint64_t>::max() >> resourceTypeBits));
}
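
// Layout example: the resource type occupies the top 'resourceTypeBits' bits and the
// (truncated) 64-bit hash of the namespace string occupies the remaining low bits, so
// ResourceIds of different resource types can never collide with one another.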

ResourceId::ResourceId(ResourceType type, StringData ns)
    : _fullHash(fullHash(type, hashStringData(ns))) {
#ifdef MONGO_CONFIG_DEBUG_BUILD
    _nsCopy = ns.toString();
#endif
}

ResourceId::ResourceId(ResourceType type, const std::string& ns)
    : _fullHash(fullHash(type, hashStringData(ns))) {
#ifdef MONGO_CONFIG_DEBUG_BUILD
    _nsCopy = ns;
#endif
}

ResourceId::ResourceId(ResourceType type, uint64_t hashId) : _fullHash(fullHash(type, hashId)) {}

std::string ResourceId::toString() const {
    StringBuilder ss;
    ss << "{" << _fullHash << ": " << resourceTypeName(getType()) << ", " << getHashId();
    if (getType() == RESOURCE_MUTEX) {
        ss << ", " << Lock::ResourceMutex::getName(*this);
    }

#ifdef MONGO_CONFIG_DEBUG_BUILD
    ss << ", " << _nsCopy;
#endif

    ss << "}";

    return ss.str();
}


//
// LockRequest
//

void LockRequest::initNew(Locker* locker, LockGrantNotification* notify) {
    this->locker = locker;
    this->notify = notify;

    enqueueAtFront = false;
    compatibleFirst = false;
    recursiveCount = 1;

    lock = nullptr;
    partitionedLock = nullptr;
    prev = nullptr;
    next = nullptr;
    status = STATUS_NEW;
    partitioned = false;
    mode = MODE_NONE;
    convertMode = MODE_NONE;
}


//
// Helper calls
//

const char* modeName(LockMode mode) {
    return LockModeNames[mode];
}

const char* legacyModeName(LockMode mode) {
    return LegacyLockModeNames[mode];
}

bool isModeCovered(LockMode mode, LockMode coveringMode) {
    return (LockConflictsTable[coveringMode] | LockConflictsTable[mode]) ==
        LockConflictsTable[coveringMode];
}
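
// Illustrative compile-time checks: MODE_IS is covered by MODE_S (MODE_S conflicts with a
// superset of what MODE_IS conflicts with), while MODE_IX is not covered by MODE_S, since
// it adds MODE_S itself to the conflict set.
MONGO_STATIC_ASSERT((LockConflictsTable[MODE_S] | LockConflictsTable[MODE_IS]) ==
                    LockConflictsTable[MODE_S]);
MONGO_STATIC_ASSERT((LockConflictsTable[MODE_S] | LockConflictsTable[MODE_IX]) !=
                    LockConflictsTable[MODE_S]);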

const char* resourceTypeName(ResourceType resourceType) {
    return ResourceTypeNames[resourceType];
}

const char* lockRequestStatusName(LockRequest::Status status) {
    return LockRequestStatusNames[status];
}

}  // namespace mongo