/**
 * Copyright (C) 2018-present MongoDB, Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the Server Side Public License, version 1,
 * as published by MongoDB, Inc.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * Server Side Public License for more details.
 *
 * You should have received a copy of the Server Side Public License
 * along with this program. If not, see
 * <http://www.mongodb.com/licensing/server-side-public-license>.
 *
 * As a special exception, the copyright holders give permission to link the
 * code of portions of this program with the OpenSSL library under certain
 * conditions as described in each individual source file and distribute
 * linked combinations including the program with the OpenSSL library. You
 * must comply with the Server Side Public License in all respects for
 * all of the code used other than as permitted herein. If you modify file(s)
 * with this exception, you may extend this exception to your version of the
 * file(s), but you are not obligated to do so. If you do not wish to do so,
 * delete this exception statement from your version. If you delete this
 * exception statement from all source files in the program, then also delete
 * it in the license file.
 */
30
31 #define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kDefault
32
33 #include "mongo/platform/basic.h"
34
35 #include "mongo/db/concurrency/lock_manager.h"
36
37 #include <third_party/murmurhash3/MurmurHash3.h>
38
39 #include "mongo/base/data_type_endian.h"
40 #include "mongo/base/data_view.h"
41 #include "mongo/base/static_assert.h"
42 #include "mongo/bson/bsonobjbuilder.h"
43 #include "mongo/config.h"
44 #include "mongo/db/concurrency/d_concurrency.h"
45 #include "mongo/db/concurrency/locker.h"
46 #include "mongo/util/assert_util.h"
47 #include "mongo/util/log.h"
48 #include "mongo/util/stringutils.h"
49 #include "mongo/util/timer.h"
50
51 namespace mongo {
52 namespace {
53
54 /**
55 * Map of conflicts. 'LockConflictsTable[newMode] & existingMode != 0' means that a new request
56 * with the given 'newMode' conflicts with an existing request with mode 'existingMode'.
57 */
58 static const int LockConflictsTable[] = {
59 // MODE_NONE
60 0,
61
62 // MODE_IS
63 (1 << MODE_X),
64
65 // MODE_IX
66 (1 << MODE_S) | (1 << MODE_X),
67
68 // MODE_S
69 (1 << MODE_IX) | (1 << MODE_X),
70
71 // MODE_X
72 (1 << MODE_S) | (1 << MODE_X) | (1 << MODE_IS) | (1 << MODE_IX),
73 };
74
75 // Mask of modes
76 const uint64_t intentModes = (1 << MODE_IS) | (1 << MODE_IX);
77
78 // Ensure we do not add new modes without updating the conflicts table
79 MONGO_STATIC_ASSERT((sizeof(LockConflictsTable) / sizeof(LockConflictsTable[0])) == LockModesCount);
80
81
82 /**
83 * Maps the mode id to a string.
84 */
85 static const char* LockModeNames[] = {"NONE", "IS", "IX", "S", "X"};
86
87 static const char* LegacyLockModeNames[] = {"", "r", "w", "R", "W"};
88
89 // Ensure we do not add new modes without updating the names array
90 MONGO_STATIC_ASSERT((sizeof(LockModeNames) / sizeof(LockModeNames[0])) == LockModesCount);
91 MONGO_STATIC_ASSERT((sizeof(LegacyLockModeNames) / sizeof(LegacyLockModeNames[0])) ==
92 LockModesCount);
93
94 // Helper functions for the lock modes
conflicts(LockMode newMode,uint32_t existingModesMask)95 bool conflicts(LockMode newMode, uint32_t existingModesMask) {
96 return (LockConflictsTable[newMode] & existingModesMask) != 0;
97 }
98
modeMask(LockMode mode)99 uint32_t modeMask(LockMode mode) {
100 return 1 << mode;
101 }
102
hashStringData(StringData str)103 uint64_t hashStringData(StringData str) {
104 char hash[16];
105 MurmurHash3_x64_128(str.rawData(), str.size(), 0, hash);
106 return static_cast<size_t>(ConstDataView(hash).read<LittleEndian<std::uint64_t>>());
107 }
108
109 /**
110 * Maps the resource id to a human-readable string.
111 */
112 static const char* ResourceTypeNames[] = {
113 "Invalid", "Global", "MMAPV1Journal", "Database", "Collection", "Metadata", "Mutex"};
114
115 // Ensure we do not add new types without updating the names array
116 MONGO_STATIC_ASSERT((sizeof(ResourceTypeNames) / sizeof(ResourceTypeNames[0])) ==
117 ResourceTypesCount);
118
119
120 /**
121 * Maps the LockRequest status to a human-readable string.
122 */
123 static const char* LockRequestStatusNames[] = {
124 "new", "granted", "waiting", "converting",
125 };
126
127 // Ensure we do not add new status types without updating the names array
128 MONGO_STATIC_ASSERT((sizeof(LockRequestStatusNames) / sizeof(LockRequestStatusNames[0])) ==
129 LockRequest::StatusCount);
130
131 } // namespace
132
/**
 * There is one of these objects for each resource that has a lock request. Empty objects (i.e.
 * LockHead with no requests) are allowed to exist on the lock manager's hash table.
 *
 * The memory and lifetime is controlled entirely by the LockManager class.
 *
 * Not thread-safe and should only be accessed under the LockManager's bucket lock. Must be locked
 * before locking a partition, not after.
 */
struct LockHead {

    /**
     * Used for initialization of a LockHead, which might have been retrieved from cache and also in
     * order to keep the LockHead structure a POD.
     *
     * NOTE(review): 'partitions' is not cleared here — presumably it is always empty when a
     * LockHead is (re)initialized; confirm against the cache/recycle path.
     */
    void initNew(ResourceId resId) {
        resourceId = resId;

        grantedList.reset();
        memset(grantedCounts, 0, sizeof(grantedCounts));
        grantedModes = 0;

        conflictList.reset();
        memset(conflictCounts, 0, sizeof(conflictCounts));
        conflictModes = 0;

        conversionsCount = 0;
        compatibleFirstCount = 0;
    }

    /**
     * True iff there may be partitions with granted requests for this resource.
     */
    bool partitioned() const {
        return !partitions.empty();
    }

    /**
     * Locates the request corresponding to the particular locker or returns nullptr. Must be called
     * with the bucket holding this lock head locked.
     */
    LockRequest* findRequest(LockerId lockerId) const {
        // Check the granted queue first
        for (LockRequest* it = grantedList._front; it != nullptr; it = it->next) {
            if (it->locker->getId() == lockerId) {
                return it;
            }
        }

        // Check the conflict queue second
        for (LockRequest* it = conflictList._front; it != nullptr; it = it->next) {
            if (it->locker->getId() == lockerId) {
                return it;
            }
        }

        return nullptr;
    }

    /**
     * Finish creation of request and put it on the lockhead's conflict or granted queues. Returns
     * LOCK_WAITING for conflict case and LOCK_OK otherwise.
     */
    LockResult newRequest(LockRequest* request) {
        invariant(!request->partitionedLock);
        request->lock = this;

        // We cannot set request->partitioned to false, as this might be a migration, in which case
        // access to that field is not protected. The 'partitioned' member instead indicates if a
        // request was initially partitioned.

        // New lock request. Queue after all granted modes and after any already requested
        // conflicting modes. While 'compatibleFirstCount' is non-zero, only conflicts with
        // *granted* modes block the request (compatible-first policy).
        if (conflicts(request->mode, grantedModes) ||
            (!compatibleFirstCount && conflicts(request->mode, conflictModes))) {
            request->status = LockRequest::STATUS_WAITING;

            // Put it on the conflict queue. Conflicts are granted front to back.
            if (request->enqueueAtFront) {
                conflictList.push_front(request);
            } else {
                conflictList.push_back(request);
            }

            incConflictModeCount(request->mode);

            return LOCK_WAITING;
        }

        // No conflict, new request
        request->status = LockRequest::STATUS_GRANTED;

        grantedList.push_back(request);
        incGrantedModeCount(request->mode);

        if (request->compatibleFirst) {
            compatibleFirstCount++;
        }

        return LOCK_OK;
    }

    /**
     * Lock each partitioned LockHead in turn, and move any (granted) intent mode requests for
     * lock->resourceId to lock, which must itself already be locked.
     */
    void migratePartitionedLockHeads();

    // Methods to maintain the granted queue. The mode bit in 'grantedModes' is kept in
    // lock-step with the per-mode counter: set on 0 -> 1, cleared on 1 -> 0.
    void incGrantedModeCount(LockMode mode) {
        invariant(grantedCounts[mode] >= 0);
        if (++grantedCounts[mode] == 1) {
            invariant((grantedModes & modeMask(mode)) == 0);
            grantedModes |= modeMask(mode);
        }
    }

    void decGrantedModeCount(LockMode mode) {
        invariant(grantedCounts[mode] >= 1);
        if (--grantedCounts[mode] == 0) {
            invariant((grantedModes & modeMask(mode)) == modeMask(mode));
            grantedModes &= ~modeMask(mode);
        }
    }

    // Methods to maintain the conflict queue; same counter/bit-mask discipline as above.
    void incConflictModeCount(LockMode mode) {
        invariant(conflictCounts[mode] >= 0);
        if (++conflictCounts[mode] == 1) {
            invariant((conflictModes & modeMask(mode)) == 0);
            conflictModes |= modeMask(mode);
        }
    }

    void decConflictModeCount(LockMode mode) {
        invariant(conflictCounts[mode] >= 1);
        if (--conflictCounts[mode] == 0) {
            invariant((conflictModes & modeMask(mode)) == modeMask(mode));
            conflictModes &= ~modeMask(mode);
        }
    }

    // Id of the resource which is protected by this lock. Initialized at construction time and does
    // not change.
    ResourceId resourceId;

    //
    // Granted queue
    //

    // Doubly-linked list of requests, which have been granted. Newly granted requests go to
    // the end of the queue. Conversion requests are granted from the beginning forward.
    LockRequestList grantedList;

    // Counts the grants and coversion counts for each of the supported lock modes. These
    // counts should exactly match the aggregated modes on the granted list.
    uint32_t grantedCounts[LockModesCount];

    // Bit-mask of the granted + converting modes on the granted queue. Maintained in lock-step
    // with the grantedCounts array.
    uint32_t grantedModes;

    //
    // Conflict queue
    //

    // Doubly-linked list of requests, which have not been granted yet because they conflict
    // with the set of granted modes. Requests are queued at the end of the queue and are
    // granted from the beginning forward, which gives these locks FIFO ordering. Exceptions
    // are high-priorty locks, such as the MMAP V1 flush lock.
    LockRequestList conflictList;

    // Counts the conflicting requests for each of the lock modes. These counts should exactly
    // match the aggregated modes on the conflicts list.
    uint32_t conflictCounts[LockModesCount];

    // Bit-mask of the conflict modes on the conflict queue. Maintained in lock-step with the
    // conflictCounts array.
    uint32_t conflictModes;

    // References partitions that may have PartitionedLockHeads for this LockHead.
    // Non-empty implies the lock has no conflicts and only has intent modes as grantedModes.
    // TODO: Remove this vector and make LockHead a POD
    std::vector<LockManager::Partition*> partitions;

    //
    // Conversion
    //

    // Counts the number of requests on the granted queue, which have requested any kind of
    // conflicting conversion and are blocked (i.e. all requests which are currently
    // STATUS_CONVERTING). This is an optimization for unlocking in that we do not need to
    // check the granted queue for requests in STATUS_CONVERTING if this count is zero. This
    // saves cycles in the regular case and only burdens the less-frequent lock upgrade case.
    uint32_t conversionsCount;

    // Counts the number of requests on the granted queue, which have requested that the policy
    // be switched to compatible-first. As long as this value is > 0, the policy will stay
    // compatible-first.
    uint32_t compatibleFirstCount;
};
334
/**
 * The PartitionedLockHead allows optimizing the case where requests overwhelmingly use
 * the intent lock modes MODE_IS and MODE_IX, which are compatible with each other.
 * Having to use a single LockHead causes contention where none would be needed.
 * So, each Locker is associated with a specific partition containing a mapping
 * of resourceId to PartitionedLockHead.
 *
 * As long as all lock requests for a resource have an intent mode, as opposed to a conflicting
 * mode, its LockHead may reference ParitionedLockHeads. A partitioned LockHead will not have
 * any conflicts. The total set of granted requests (with intent mode) is the union of
 * its grantedList and all grantedLists in PartitionedLockHeads.
 *
 * The existence of a PartitionedLockHead for a resource implies that its LockHead is
 * partitioned. If a conflicting request is made on a LockHead, all requests from
 * PartitionedLockHeads are migrated to that LockHead and the LockHead no longer partitioned.
 *
 * Not thread-safe, must be accessed under its partition lock.
 * May not lock a LockManager bucket while holding a partition lock.
 */
struct PartitionedLockHead {

    // Initializes (or re-initializes) this object. 'resId' is currently unused — the resource
    // id is tracked by the partition's map key, not stored here; the parameter mirrors
    // LockHead::initNew for symmetry.
    void initNew(ResourceId resId) {
        grantedList.reset();
    }

    // Grants 'request' immediately. Requests placed here can never conflict, because a
    // PartitionedLockHead only ever holds mutually compatible intent-mode requests.
    void newRequest(LockRequest* request) {
        invariant(request->partitioned);
        invariant(!request->lock);
        request->partitionedLock = this;
        request->status = LockRequest::STATUS_GRANTED;

        grantedList.push_back(request);
    }

    // Doubly-linked list of requests, which have been granted. Newly granted requests go to the end
    // of the queue. The PartitionedLockHead never contains anything but granted requests with
    // intent modes.
    LockRequestList grantedList;
};
374
void LockHead::migratePartitionedLockHeads() {
    invariant(partitioned());

    // There can't be non-intent modes or conflicts when the lock is partitioned
    invariant(!(grantedModes & ~intentModes) && !conflictModes);

    // Migration time: lock each partition in turn and transfer its requests, if any.
    // Note the lock ordering: this LockHead's bucket is already locked by the caller, and the
    // partition lock is taken second — the documented bucket-before-partition order.
    while (partitioned()) {
        LockManager::Partition* partition = partitions.back();
        stdx::lock_guard<SimpleMutex> scopedLock(partition->mutex);

        LockManager::Partition::Map::iterator it = partition->data.find(resourceId);
        if (it != partition->data.end()) {
            PartitionedLockHead* partitionedLock = it->second;

            while (!partitionedLock->grantedList.empty()) {
                LockRequest* request = partitionedLock->grantedList._front;
                partitionedLock->grantedList.remove(request);
                request->partitionedLock = nullptr;
                // Ordering is important here, as the next/prev fields are shared.
                // Note that newRequest() will preserve the recursiveCount in this case
                LockResult res = newRequest(request);
                invariant(res == LOCK_OK);  // Lock must still be granted
            }
            partition->data.erase(it);
            delete partitionedLock;
        }
        // Don't pop-back too early as otherwise the lock will be considered not partitioned in
        // newRequest().
        partitions.pop_back();
    }
}
407
//
// LockManager
//

// Have more buckets than CPUs to reduce contention on lock and caches
const unsigned LockManager::_numLockBuckets(128);

// Balance scalability of intent locks against potential added cost of conflicting locks.
// The exact value doesn't appear very important, but should be power of two
const unsigned LockManager::_numPartitions = 32;
418
// Allocates the fixed-size bucket and partition arrays. Both are raw owning arrays,
// released with delete[] in the destructor.
LockManager::LockManager() {
    _lockBuckets = new LockBucket[_numLockBuckets];
    _partitions = new Partition[_numPartitions];
}
423
// Tears down the lock manager. All locks must have been released by this point — a
// non-empty bucket after cleanup means a caller leaked a lock request.
LockManager::~LockManager() {
    cleanupUnusedLocks();

    for (unsigned i = 0; i < _numLockBuckets; i++) {
        // TODO: dump more information about the non-empty bucket to see what locks were leaked
        invariant(_lockBuckets[i].data.empty());
    }

    delete[] _lockBuckets;
    delete[] _partitions;
}
435
// Acquires 'resId' in 'mode' on behalf of 'request'. Returns LOCK_OK if granted immediately,
// LOCK_WAITING if queued behind conflicting requests. Intent modes (IS/IX) attempt a
// per-locker partitioned fast path before falling back to the shared LockHead.
LockResult LockManager::lock(ResourceId resId, LockRequest* request, LockMode mode) {
    // Sanity check that requests are not being reused without proper cleanup
    invariant(request->status == LockRequest::STATUS_NEW);
    invariant(request->recursiveCount == 1);

    request->partitioned = (mode == MODE_IX || mode == MODE_IS);
    request->mode = mode;

    // For intent modes, try the PartitionedLockHead
    if (request->partitioned) {
        Partition* partition = _getPartition(request);
        stdx::lock_guard<SimpleMutex> scopedLock(partition->mutex);

        // Fast path for intent locks
        PartitionedLockHead* partitionedLock = partition->find(resId);

        if (partitionedLock) {
            partitionedLock->newRequest(request);
            return LOCK_OK;
        }
        // Unsuccessful: there was no PartitionedLockHead yet, so use regular LockHead.
        // Must not hold any locks. It is OK for requests with intent modes to be on
        // both a PartitionedLockHead and a regular LockHead, so the race here is benign.
    }

    // Use regular LockHead, maybe start partitioning
    LockBucket* bucket = _getBucket(resId);
    stdx::lock_guard<SimpleMutex> scopedLock(bucket->mutex);

    LockHead* lock = bucket->findOrInsert(resId);

    // Start a partitioned lock if possible: the request is intent and the LockHead holds only
    // intent-mode grants with no conflicts. The partition lock is taken while the bucket lock
    // is held, honouring the bucket-before-partition lock order.
    if (request->partitioned && !(lock->grantedModes & (~intentModes)) && !lock->conflictModes) {
        Partition* partition = _getPartition(request);
        stdx::lock_guard<SimpleMutex> scopedLock(partition->mutex);
        PartitionedLockHead* partitionedLock = partition->findOrInsert(resId);
        invariant(partitionedLock);
        lock->partitions.push_back(partition);
        partitionedLock->newRequest(request);
        return LOCK_OK;
    }

    // For the first lock with a non-intent mode, migrate requests from partitioned lock heads
    if (lock->partitioned()) {
        lock->migratePartitionedLockHeads();
    }

    request->partitioned = false;
    return lock->newRequest(request);
}
486
// Converts an already-granted 'request' on 'resId' to 'newMode'. Returns LOCK_OK if the
// conversion is immediately compatible, LOCK_WAITING if the request transitions to
// STATUS_CONVERTING until conflicting grants drain. Only "upgrades" (conversions whose
// conflict set is a superset of the current mode's) are supported.
LockResult LockManager::convert(ResourceId resId, LockRequest* request, LockMode newMode) {
    // If we are here, we already hold the lock in some mode. In order to keep it simple, we do
    // not allow requesting a conversion while a lock is already waiting or pending conversion.
    invariant(request->status == LockRequest::STATUS_GRANTED);
    invariant(request->recursiveCount > 0);

    request->recursiveCount++;

    // Fast path for acquiring the same lock multiple times in modes, which are already covered
    // by the current mode. It is safe to do this without locking, because 1) all calls for the
    // same lock request must be done on the same thread and 2) if there are lock requests
    // hanging off a given LockHead, then this lock will never disappear.
    if ((LockConflictsTable[request->mode] | LockConflictsTable[newMode]) ==
        LockConflictsTable[request->mode]) {
        return LOCK_OK;
    }

    // TODO: For the time being we do not need conversions between unrelated lock modes (i.e.,
    // modes which both add and remove to the conflicts set), so these are not implemented yet
    // (e.g., S -> IX).
    invariant((LockConflictsTable[request->mode] | LockConflictsTable[newMode]) ==
              LockConflictsTable[newMode]);

    LockBucket* bucket = _getBucket(resId);
    stdx::lock_guard<SimpleMutex> scopedLock(bucket->mutex);

    LockBucket::Map::iterator it = bucket->data.find(resId);
    invariant(it != bucket->data.end());

    LockHead* const lock = it->second;

    if (lock->partitioned()) {
        lock->migratePartitionedLockHeads();
    }

    // Construct granted mask without our current mode, so that it is not counted as
    // conflicting
    uint32_t grantedModesWithoutCurrentRequest = 0;

    // We start the counting at 1 below, because LockModesCount also includes MODE_NONE
    // at position 0, which can never be acquired/granted.
    for (uint32_t i = 1; i < LockModesCount; i++) {
        const uint32_t currentRequestHolds = (request->mode == static_cast<LockMode>(i) ? 1 : 0);

        if (lock->grantedCounts[i] > currentRequestHolds) {
            grantedModesWithoutCurrentRequest |= modeMask(static_cast<LockMode>(i));
        }
    }

    // This check favours conversion requests over pending requests. For example:
    //
    // T1 requests lock L in IS
    // T2 requests lock L in X
    // T1 then upgrades L from IS -> S
    //
    // Because the check does not look into the conflict modes bitmap, it will grant L to
    // T1 in S mode, instead of block, which would otherwise cause deadlock.
    if (conflicts(newMode, grantedModesWithoutCurrentRequest)) {
        request->status = LockRequest::STATUS_CONVERTING;
        request->convertMode = newMode;

        // A converting request counts as granted in both its old and new modes until the
        // conversion is resolved (see _onLockModeChanged / unlock).
        lock->conversionsCount++;
        lock->incGrantedModeCount(request->convertMode);

        return LOCK_WAITING;
    } else {  // No conflict, existing request
        lock->incGrantedModeCount(newMode);
        lock->decGrantedModeCount(request->mode);
        request->mode = newMode;

        return LOCK_OK;
    }
}
560
unlock(LockRequest * request)561 bool LockManager::unlock(LockRequest* request) {
562 // Fast path for decrementing multiple references of the same lock. It is safe to do this
563 // without locking, because 1) all calls for the same lock request must be done on the same
564 // thread and 2) if there are lock requests hanging of a given LockHead, then this lock
565 // will never disappear.
566 invariant(request->recursiveCount > 0);
567 request->recursiveCount--;
568 if ((request->status == LockRequest::STATUS_GRANTED) && (request->recursiveCount > 0)) {
569 return false;
570 }
571
572 if (request->partitioned) {
573 // Unlocking a lock that was acquired as partitioned. The lock request may since have
574 // moved to the lock head, but there is no safe way to find out without synchronizing
575 // thorough the partition mutex. Migrations are expected to be rare.
576 invariant(request->status == LockRequest::STATUS_GRANTED ||
577 request->status == LockRequest::STATUS_CONVERTING);
578 Partition* partition = _getPartition(request);
579 stdx::lock_guard<SimpleMutex> scopedLock(partition->mutex);
580 // Fast path: still partitioned.
581 if (request->partitionedLock) {
582 request->partitionedLock->grantedList.remove(request);
583 return true;
584 }
585
586 // not partitioned anymore, fall through to regular case
587 }
588 invariant(request->lock);
589
590 LockHead* lock = request->lock;
591 LockBucket* bucket = _getBucket(lock->resourceId);
592 stdx::lock_guard<SimpleMutex> scopedLock(bucket->mutex);
593
594 if (request->status == LockRequest::STATUS_GRANTED) {
595 // This releases a currently held lock and is the most common path, so it should be
596 // as efficient as possible. The fast path for decrementing multiple references did
597 // already ensure request->recursiveCount == 0.
598
599 // Remove from the granted list
600 lock->grantedList.remove(request);
601 lock->decGrantedModeCount(request->mode);
602
603 if (request->compatibleFirst) {
604 invariant(lock->compatibleFirstCount > 0);
605 lock->compatibleFirstCount--;
606 invariant(lock->compatibleFirstCount == 0 || !lock->grantedList.empty());
607 }
608
609 _onLockModeChanged(lock, lock->grantedCounts[request->mode] == 0);
610 } else if (request->status == LockRequest::STATUS_WAITING) {
611 // This cancels a pending lock request
612 invariant(request->recursiveCount == 0);
613
614 lock->conflictList.remove(request);
615 lock->decConflictModeCount(request->mode);
616
617 _onLockModeChanged(lock, true);
618 } else if (request->status == LockRequest::STATUS_CONVERTING) {
619 // This cancels a pending convert request
620 invariant(request->recursiveCount > 0);
621 invariant(lock->conversionsCount > 0);
622
623 // Lock only goes from GRANTED to CONVERTING, so cancelling the conversion request
624 // brings it back to the previous granted mode.
625 request->status = LockRequest::STATUS_GRANTED;
626
627 lock->conversionsCount--;
628 lock->decGrantedModeCount(request->convertMode);
629
630 request->convertMode = MODE_NONE;
631
632 _onLockModeChanged(lock, lock->grantedCounts[request->convertMode] == 0);
633 } else {
634 // Invalid request status
635 invariant(false);
636 }
637
638 return (request->recursiveCount == 0);
639 }
640
downgrade(LockRequest * request,LockMode newMode)641 void LockManager::downgrade(LockRequest* request, LockMode newMode) {
642 invariant(request->lock);
643 invariant(request->status == LockRequest::STATUS_GRANTED);
644 invariant(request->recursiveCount > 0);
645
646 // The conflict set of the newMode should be a subset of the conflict set of the old mode.
647 // Can't downgrade from S -> IX for example.
648 invariant((LockConflictsTable[request->mode] | LockConflictsTable[newMode]) ==
649 LockConflictsTable[request->mode]);
650
651 LockHead* lock = request->lock;
652
653 LockBucket* bucket = _getBucket(lock->resourceId);
654 stdx::lock_guard<SimpleMutex> scopedLock(bucket->mutex);
655
656 lock->incGrantedModeCount(newMode);
657 lock->decGrantedModeCount(request->mode);
658 request->mode = newMode;
659
660 _onLockModeChanged(lock, true);
661 }
662
cleanupUnusedLocks()663 void LockManager::cleanupUnusedLocks() {
664 for (unsigned i = 0; i < _numLockBuckets; i++) {
665 LockBucket* bucket = &_lockBuckets[i];
666 stdx::lock_guard<SimpleMutex> scopedLock(bucket->mutex);
667 _cleanupUnusedLocksInBucket(bucket);
668 }
669 }
670
_cleanupUnusedLocksInBucket(LockBucket * bucket)671 void LockManager::_cleanupUnusedLocksInBucket(LockBucket* bucket) {
672 LockBucket::Map::iterator it = bucket->data.begin();
673 size_t deletedLockHeads = 0;
674 while (it != bucket->data.end()) {
675 LockHead* lock = it->second;
676
677 if (lock->partitioned()) {
678 lock->migratePartitionedLockHeads();
679 }
680
681 if (lock->grantedModes == 0) {
682 invariant(lock->grantedModes == 0);
683 invariant(lock->grantedList._front == nullptr);
684 invariant(lock->grantedList._back == nullptr);
685 invariant(lock->conflictModes == 0);
686 invariant(lock->conflictList._front == nullptr);
687 invariant(lock->conflictList._back == nullptr);
688 invariant(lock->conversionsCount == 0);
689 invariant(lock->compatibleFirstCount == 0);
690
691 bucket->data.erase(it++);
692 deletedLockHeads++;
693 delete lock;
694 } else {
695 it++;
696 }
697 }
698 }
699
// Re-evaluates 'lock' after its set of granted modes changed: first tries to complete any
// pending conversions on the granted queue, then (if 'checkConflictQueue') grants newly
// compatible requests from the conflict queue. Called at every site with the lock's bucket
// mutex held.
void LockManager::_onLockModeChanged(LockHead* lock, bool checkConflictQueue) {
    // Unblock any converting requests (because conversions are still counted as granted and
    // are on the granted queue).
    for (LockRequest* iter = lock->grantedList._front;
         (iter != nullptr) && (lock->conversionsCount > 0);
         iter = iter->next) {
        // Conversion requests are going in a separate queue
        if (iter->status == LockRequest::STATUS_CONVERTING) {
            invariant(iter->convertMode != 0);

            // Construct granted mask without our current mode, so that it is not accounted as
            // a conflict
            uint32_t grantedModesWithoutCurrentRequest = 0;

            // We start the counting at 1 below, because LockModesCount also includes
            // MODE_NONE at position 0, which can never be acquired/granted.
            for (uint32_t i = 1; i < LockModesCount; i++) {
                const uint32_t currentRequestHolds =
                    (iter->mode == static_cast<LockMode>(i) ? 1 : 0);

                const uint32_t currentRequestWaits =
                    (iter->convertMode == static_cast<LockMode>(i) ? 1 : 0);

                // We cannot both hold and wait on the same lock mode
                invariant(currentRequestHolds + currentRequestWaits <= 1);

                if (lock->grantedCounts[i] > (currentRequestHolds + currentRequestWaits)) {
                    grantedModesWithoutCurrentRequest |= modeMask(static_cast<LockMode>(i));
                }
            }

            if (!conflicts(iter->convertMode, grantedModesWithoutCurrentRequest)) {
                // Complete the conversion: drop the old mode's grant (the convert mode was
                // already counted as granted when the conversion was requested).
                lock->conversionsCount--;
                lock->decGrantedModeCount(iter->mode);
                iter->status = LockRequest::STATUS_GRANTED;
                iter->mode = iter->convertMode;
                iter->convertMode = MODE_NONE;

                iter->notify->notify(lock->resourceId, LOCK_OK);
            }
        }
    }

    // Grant any conflicting requests, which might now be unblocked. Note that the loop below
    // slightly violates fairness in that it will grant *all* compatible requests on the line even
    // though there might be conflicting ones interspersed between them. For example, assume that an
    // X lock was just freed and the conflict queue looks like this:
    //
    //      IS -> IS -> X -> X -> S -> IS
    //
    // In strict FIFO, we should grant the first two IS modes and then stop when we reach the first
    // X mode (the third request on the queue). However, the loop below would actually grant all IS
    // + S modes and once they all drain it will grant X. The reason for this behaviour is
    // increasing system throughput in the scenario where mutually compatible requests are
    // interspersed with conflicting ones. For example, this would be a worst-case scenario for
    // strict FIFO, because it would make the execution sequential:
    //
    //      S -> X -> S -> X -> S -> X

    LockRequest* iterNext = nullptr;

    bool newlyCompatibleFirst = false;  // Set on enabling compatibleFirst mode.
    for (LockRequest* iter = lock->conflictList._front; (iter != nullptr) && checkConflictQueue;
         iter = iterNext) {
        invariant(iter->status == LockRequest::STATUS_WAITING);

        // Store the actual next pointer, because we muck with the iter below and move it to
        // the granted queue.
        iterNext = iter->next;

        if (conflicts(iter->mode, lock->grantedModes)) {
            // If iter doesn't have a previous pointer, this means that it is at the front of the
            // queue. If we continue scanning the queue beyond this point, we will starve it by
            // granting more and more requests. However, if we newly transition to compatibleFirst
            // mode, grant any waiting compatible requests.
            if (!iter->prev && !newlyCompatibleFirst) {
                break;
            }
            continue;
        }

        iter->status = LockRequest::STATUS_GRANTED;

        // Remove from the conflicts list
        lock->conflictList.remove(iter);
        lock->decConflictModeCount(iter->mode);

        // Add to the granted list
        lock->grantedList.push_back(iter);
        lock->incGrantedModeCount(iter->mode);

        if (iter->compatibleFirst) {
            newlyCompatibleFirst |= (lock->compatibleFirstCount++ == 0);
        }

        iter->notify->notify(lock->resourceId, LOCK_OK);

        // Small optimization - nothing is compatible with a newly granted MODE_X, so no point in
        // looking further in the conflict queue. Conflicting MODE_X requests are skipped above.
        if (iter->mode == MODE_X) {
            break;
        }
    }

    // This is a convenient place to check that the state of the two request queues is in sync
    // with the bitmask on the modes.
    invariant((lock->grantedModes == 0) ^ (lock->grantedList._front != nullptr));
    invariant((lock->conflictModes == 0) ^ (lock->conflictList._front != nullptr));
}
809
_getBucket(ResourceId resId) const810 LockManager::LockBucket* LockManager::_getBucket(ResourceId resId) const {
811 return &_lockBuckets[resId % _numLockBuckets];
812 }
813
_getPartition(LockRequest * request) const814 LockManager::Partition* LockManager::_getPartition(LockRequest* request) const {
815 return &_partitions[request->locker->getId() % _numPartitions];
816 }
817
dump() const818 void LockManager::dump() const {
819 log() << "Dumping LockManager @ " << static_cast<const void*>(this) << '\n';
820
821 for (unsigned i = 0; i < _numLockBuckets; i++) {
822 LockBucket* bucket = &_lockBuckets[i];
823 stdx::lock_guard<SimpleMutex> scopedLock(bucket->mutex);
824
825 if (!bucket->data.empty()) {
826 _dumpBucket(bucket);
827 }
828 }
829 }
830
_dumpBucketToBSON(const std::map<LockerId,BSONObj> & lockToClientMap,const LockBucket * bucket,BSONObjBuilder * result)831 void LockManager::_dumpBucketToBSON(const std::map<LockerId, BSONObj>& lockToClientMap,
832 const LockBucket* bucket,
833 BSONObjBuilder* result) {
834 for (auto& bucketEntry : bucket->data) {
835 const LockHead* lock = bucketEntry.second;
836
837 if (lock->grantedList.empty()) {
838 // If there are no granted requests, this lock is empty, so no need to print it
839 continue;
840 }
841
842 result->append("resourceId", lock->resourceId.toString());
843
844 BSONArrayBuilder grantedLocks;
845 for (const LockRequest* iter = lock->grantedList._front; iter != nullptr;
846 iter = iter->next) {
847 _buildBucketBSON(iter, lockToClientMap, bucket, &grantedLocks);
848 }
849 result->append("granted", grantedLocks.arr());
850
851 BSONArrayBuilder pendingLocks;
852 for (const LockRequest* iter = lock->conflictList._front; iter != nullptr;
853 iter = iter->next) {
854 _buildBucketBSON(iter, lockToClientMap, bucket, &pendingLocks);
855 }
856 result->append("pending", pendingLocks.arr());
857 }
858 }
859
_buildBucketBSON(const LockRequest * iter,const std::map<LockerId,BSONObj> & lockToClientMap,const LockBucket * bucket,BSONArrayBuilder * locks)860 void LockManager::_buildBucketBSON(const LockRequest* iter,
861 const std::map<LockerId, BSONObj>& lockToClientMap,
862 const LockBucket* bucket,
863 BSONArrayBuilder* locks) {
864 BSONObjBuilder info;
865 info.append("mode", modeName(iter->mode));
866 info.append("convertMode", modeName(iter->convertMode));
867 info.append("enqueueAtFront", iter->enqueueAtFront);
868 info.append("compatibleFirst", iter->compatibleFirst);
869
870 LockerId lockerId = iter->locker->getId();
871 std::map<LockerId, BSONObj>::const_iterator it = lockToClientMap.find(lockerId);
872 if (it != lockToClientMap.end()) {
873 info.appendElements(it->second);
874 }
875 locks->append(info.obj());
876 }
877
getLockInfoBSON(const std::map<LockerId,BSONObj> & lockToClientMap,BSONObjBuilder * result)878 void LockManager::getLockInfoBSON(const std::map<LockerId, BSONObj>& lockToClientMap,
879 BSONObjBuilder* result) {
880 BSONArrayBuilder lockInfo;
881 for (unsigned i = 0; i < _numLockBuckets; i++) {
882 LockBucket* bucket = &_lockBuckets[i];
883 stdx::lock_guard<SimpleMutex> scopedLock(bucket->mutex);
884
885 _cleanupUnusedLocksInBucket(bucket);
886 if (!bucket->data.empty()) {
887 BSONObjBuilder b;
888 _dumpBucketToBSON(lockToClientMap, bucket, &b);
889 lockInfo.append(b.obj());
890 }
891 }
892 result->append("lockInfo", lockInfo.arr());
893 }
894
_dumpBucket(const LockBucket * bucket) const895 void LockManager::_dumpBucket(const LockBucket* bucket) const {
896 for (LockBucket::Map::const_iterator it = bucket->data.begin(); it != bucket->data.end();
897 it++) {
898 const LockHead* lock = it->second;
899
900 if (lock->grantedList.empty()) {
901 // If there are no granted requests, this lock is empty, so no need to print it
902 continue;
903 }
904
905 StringBuilder sb;
906 sb << "Lock @ " << lock << ": " << lock->resourceId.toString() << '\n';
907
908 sb << "GRANTED:\n";
909 for (const LockRequest* iter = lock->grantedList._front; iter != nullptr;
910 iter = iter->next) {
911 std::stringstream threadId;
912 threadId << iter->locker->getThreadId() << " | " << std::showbase << std::hex
913 << iter->locker->getThreadId();
914 sb << '\t' << "LockRequest " << iter->locker->getId() << " @ " << iter->locker << ": "
915 << "Mode = " << modeName(iter->mode) << "; "
916 << "Thread = " << threadId.str() << "; "
917 << "ConvertMode = " << modeName(iter->convertMode) << "; "
918 << "EnqueueAtFront = " << iter->enqueueAtFront << "; "
919 << "CompatibleFirst = " << iter->compatibleFirst << "; " << '\n';
920 }
921
922 sb << "PENDING:\n";
923 for (const LockRequest* iter = lock->conflictList._front; iter != nullptr;
924 iter = iter->next) {
925 std::stringstream threadId;
926 threadId << iter->locker->getThreadId() << " | " << std::showbase << std::hex
927 << iter->locker->getThreadId();
928 sb << '\t' << "LockRequest " << iter->locker->getId() << " @ " << iter->locker << ": "
929 << "Mode = " << modeName(iter->mode) << "; "
930 << "Thread = " << threadId.str() << "; "
931 << "ConvertMode = " << modeName(iter->convertMode) << "; "
932 << "EnqueueAtFront = " << iter->enqueueAtFront << "; "
933 << "CompatibleFirst = " << iter->compatibleFirst << "; " << '\n';
934 }
935
936 sb << "-----------------------------------------------------------\n";
937 log() << sb.str();
938 }
939 }
940
find(ResourceId resId)941 PartitionedLockHead* LockManager::Partition::find(ResourceId resId) {
942 Map::iterator it = data.find(resId);
943 return it == data.end() ? nullptr : it->second;
944 }
945
findOrInsert(ResourceId resId)946 PartitionedLockHead* LockManager::Partition::findOrInsert(ResourceId resId) {
947 PartitionedLockHead* lock;
948 Map::iterator it = data.find(resId);
949 if (it == data.end()) {
950 lock = new PartitionedLockHead();
951 lock->initNew(resId);
952
953 data.insert(Map::value_type(resId, lock));
954 } else {
955 lock = it->second;
956 }
957 return lock;
958 }
959
findOrInsert(ResourceId resId)960 LockHead* LockManager::LockBucket::findOrInsert(ResourceId resId) {
961 LockHead* lock;
962 Map::iterator it = data.find(resId);
963 if (it == data.end()) {
964 lock = new LockHead();
965 lock->initNew(resId);
966
967 data.insert(Map::value_type(resId, lock));
968 } else {
969 lock = it->second;
970 }
971 return lock;
972 }
973
974 //
975 // DeadlockDetector
976 //
977
DeadlockDetector(const LockManager & lockMgr,const Locker * initialLocker)978 DeadlockDetector::DeadlockDetector(const LockManager& lockMgr, const Locker* initialLocker)
979 : _lockMgr(lockMgr), _initialLockerId(initialLocker->getId()), _foundCycle(false) {
980 const ResourceId resId = initialLocker->getWaitingResource();
981
982 // If there is no resource waiting there is nothing to do
983 if (resId.isValid()) {
984 _queue.push_front(UnprocessedNode(_initialLockerId, resId));
985 }
986 }
987
next()988 bool DeadlockDetector::next() {
989 if (_queue.empty())
990 return false;
991
992 UnprocessedNode front = _queue.front();
993 _queue.pop_front();
994
995 _processNextNode(front);
996
997 return !_queue.empty();
998 }
999
hasCycle() const1000 bool DeadlockDetector::hasCycle() const {
1001 invariant(_queue.empty());
1002
1003 return _foundCycle;
1004 }
1005
toString() const1006 std::string DeadlockDetector::toString() const {
1007 StringBuilder sb;
1008
1009 for (WaitForGraph::const_iterator it = _graph.begin(); it != _graph.end(); it++) {
1010 sb << "Locker " << it->first << " waits for resource " << it->second.resId.toString()
1011 << " held by [";
1012
1013 const ConflictingOwnersList owners = it->second.owners;
1014 for (ConflictingOwnersList::const_iterator itW = owners.begin(); itW != owners.end();
1015 itW++) {
1016 sb << *itW << ", ";
1017 }
1018
1019 sb << "]\n";
1020 }
1021
1022 return sb.str();
1023 }
1024
_processNextNode(const UnprocessedNode & node)1025 void DeadlockDetector::_processNextNode(const UnprocessedNode& node) {
1026 // Locate the request
1027 LockManager::LockBucket* bucket = _lockMgr._getBucket(node.resId);
1028 stdx::lock_guard<SimpleMutex> scopedLock(bucket->mutex);
1029
1030 LockManager::LockBucket::Map::const_iterator iter = bucket->data.find(node.resId);
1031 if (iter == bucket->data.end()) {
1032 return;
1033 }
1034
1035 const LockHead* lock = iter->second;
1036
1037 LockRequest* request = lock->findRequest(node.lockerId);
1038
1039 // It is possible that a request which was thought to be waiting suddenly became
1040 // granted, so check that before proceeding
1041 if (!request || (request->status == LockRequest::STATUS_GRANTED)) {
1042 return;
1043 }
1044
1045 std::pair<WaitForGraph::iterator, bool> val =
1046 _graph.insert(WaitForGraphPair(node.lockerId, Edges(node.resId)));
1047 if (!val.second) {
1048 // We already saw this locker id, which means we have a cycle.
1049 if (!_foundCycle) {
1050 _foundCycle = (node.lockerId == _initialLockerId);
1051 }
1052
1053 return;
1054 }
1055
1056 Edges& edges = val.first->second;
1057
1058 bool seen = false;
1059 for (LockRequest* it = lock->grantedList._back; it != nullptr; it = it->prev) {
1060 // We can't conflict with ourselves
1061 if (it == request) {
1062 seen = true;
1063 continue;
1064 }
1065
1066 // If we are a regular conflicting request, both granted and conversion modes need to
1067 // be checked for conflict, since conversions will be granted first.
1068 if (request->status == LockRequest::STATUS_WAITING) {
1069 if (conflicts(request->mode, modeMask(it->mode)) ||
1070 conflicts(request->mode, modeMask(it->convertMode))) {
1071 const LockerId lockerId = it->locker->getId();
1072 const ResourceId waitResId = it->locker->getWaitingResource();
1073
1074 if (waitResId.isValid()) {
1075 _queue.push_front(UnprocessedNode(lockerId, waitResId));
1076 edges.owners.push_back(lockerId);
1077 }
1078 }
1079
1080 continue;
1081 }
1082
1083 // If we are a conversion request, only requests, which are before us need to be
1084 // accounted for.
1085 invariant(request->status == LockRequest::STATUS_CONVERTING);
1086
1087 if (conflicts(request->convertMode, modeMask(it->mode)) ||
1088 (seen && conflicts(request->convertMode, modeMask(it->convertMode)))) {
1089 const LockerId lockerId = it->locker->getId();
1090 const ResourceId waitResId = it->locker->getWaitingResource();
1091
1092 if (waitResId.isValid()) {
1093 _queue.push_front(UnprocessedNode(lockerId, waitResId));
1094 edges.owners.push_back(lockerId);
1095 }
1096 }
1097 }
1098
1099 // All conflicting waits, which would be granted before us
1100 for (LockRequest* it = request->prev;
1101 (request->status == LockRequest::STATUS_WAITING) && (it != nullptr);
1102 it = it->prev) {
1103 // We started from the previous element, so we should never see ourselves
1104 invariant(it != request);
1105
1106 if (conflicts(request->mode, modeMask(it->mode))) {
1107 const LockerId lockerId = it->locker->getId();
1108 const ResourceId waitResId = it->locker->getWaitingResource();
1109
1110 if (waitResId.isValid()) {
1111 _queue.push_front(UnprocessedNode(lockerId, waitResId));
1112 edges.owners.push_back(lockerId);
1113 }
1114 }
1115 }
1116 }
1117
1118
1119 //
1120 // ResourceId
1121 //
1122
fullHash(ResourceType type,uint64_t hashId)1123 uint64_t ResourceId::fullHash(ResourceType type, uint64_t hashId) {
1124 return (static_cast<uint64_t>(type) << (64 - resourceTypeBits)) +
1125 (hashId & (std::numeric_limits<uint64_t>::max() >> resourceTypeBits));
1126 }
1127
ResourceId(ResourceType type,StringData ns)1128 ResourceId::ResourceId(ResourceType type, StringData ns)
1129 : _fullHash(fullHash(type, hashStringData(ns))) {
1130 #ifdef MONGO_CONFIG_DEBUG_BUILD
1131 _nsCopy = ns.toString();
1132 #endif
1133 }
1134
ResourceId(ResourceType type,const std::string & ns)1135 ResourceId::ResourceId(ResourceType type, const std::string& ns)
1136 : _fullHash(fullHash(type, hashStringData(ns))) {
1137 #ifdef MONGO_CONFIG_DEBUG_BUILD
1138 _nsCopy = ns;
1139 #endif
1140 }
1141
ResourceId(ResourceType type,uint64_t hashId)1142 ResourceId::ResourceId(ResourceType type, uint64_t hashId) : _fullHash(fullHash(type, hashId)) {}
1143
toString() const1144 std::string ResourceId::toString() const {
1145 StringBuilder ss;
1146 ss << "{" << _fullHash << ": " << resourceTypeName(getType()) << ", " << getHashId();
1147 if (getType() == RESOURCE_MUTEX) {
1148 ss << ", " << Lock::ResourceMutex::getName(*this);
1149 }
1150
1151 #ifdef MONGO_CONFIG_DEBUG_BUILD
1152 ss << ", " << _nsCopy;
1153 #endif
1154
1155 ss << "}";
1156
1157 return ss.str();
1158 }
1159
1160
1161 //
1162 // LockRequest
1163 //
1164
initNew(Locker * locker,LockGrantNotification * notify)1165 void LockRequest::initNew(Locker* locker, LockGrantNotification* notify) {
1166 this->locker = locker;
1167 this->notify = notify;
1168
1169 enqueueAtFront = false;
1170 compatibleFirst = false;
1171 recursiveCount = 1;
1172
1173 lock = nullptr;
1174 partitionedLock = nullptr;
1175 prev = nullptr;
1176 next = nullptr;
1177 status = STATUS_NEW;
1178 partitioned = false;
1179 mode = MODE_NONE;
1180 convertMode = MODE_NONE;
1181 }
1182
1183
1184 //
1185 // Helper calls
1186 //
1187
modeName(LockMode mode)1188 const char* modeName(LockMode mode) {
1189 return LockModeNames[mode];
1190 }
1191
legacyModeName(LockMode mode)1192 const char* legacyModeName(LockMode mode) {
1193 return LegacyLockModeNames[mode];
1194 }
1195
isModeCovered(LockMode mode,LockMode coveringMode)1196 bool isModeCovered(LockMode mode, LockMode coveringMode) {
1197 return (LockConflictsTable[coveringMode] | LockConflictsTable[mode]) ==
1198 LockConflictsTable[coveringMode];
1199 }
1200
resourceTypeName(ResourceType resourceType)1201 const char* resourceTypeName(ResourceType resourceType) {
1202 return ResourceTypeNames[resourceType];
1203 }
1204
lockRequestStatusName(LockRequest::Status status)1205 const char* lockRequestStatusName(LockRequest::Status status) {
1206 return LockRequestStatusNames[status];
1207 }
1208
1209 } // namespace mongo
1210