1 /* Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License, version 2.0,
5 as published by the Free Software Foundation.
6
7 This program is also distributed with certain software (including
8 but not limited to OpenSSL) that is licensed under separate terms,
9 as designated in a particular file or component or in included license
10 documentation. The authors of MySQL hereby grant you an additional
11 permission to link the program and your derivative works with the
12 separately licensed software that they have included with MySQL.
13
14 Without limiting anything contained in the foregoing, this file,
15 which is part of C Driver for MySQL (Connector/C), is also subject to the
16 Universal FOSS Exception, version 1.0, a copy of which can be found at
17 http://oss.oracle.com/licenses/universal-foss-exception.
18
19 This program is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License, version 2.0, for more details.
23
24 You should have received a copy of the GNU General Public License
25 along with this program; if not, write to the Free Software
26 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
27
28 /**
29 @file
30
31 "waiting threads" subsystem - a unified interface for threads to wait
32 on each other, with built-in deadlock detection.
33
34 Main concepts
35 ^^^^^^^^^^^^^
36 a thread - is represented by a WT_THD structure. One physical thread
37 can have only one WT_THD descriptor at any given moment.
38
39 a resource - a thread does not wait for other threads directly,
40 instead it waits for a "resource", which is "owned" by other threads.
41 It waits, exactly, for all "owners" to "release" a resource.
42 It does not have to correspond to a physical resource. For example, it
43 may be convenient in certain cases to force resource == thread.
44 A resource is represented by a WT_RESOURCE structure.
45
46 a resource identifier - a pair of {resource type, value}. A value is
47 an ulonglong number. Represented by a WT_RESOURCE_ID structure.
48
49 a resource type - a pointer to a statically defined instance of
50 WT_RESOURCE_TYPE structure. This structure contains a pointer to
51 a function that knows how to compare values of this resource type.
52 In the simple case it could be wt_resource_id_memcmp().
53
  a wait-for graph - a graph that represents "wait-for" relationships.
55 It has two types of nodes - threads and resources. There are directed
56 edges from a thread to a resource it is waiting for (WT_THD::waiting_for),
57 from a thread to resources that it "owns" (WT_THD::my_resources),
58 and from a resource to threads that "own" it (WT_RESOURCE::owners)
59
60 Graph completeness
61 ^^^^^^^^^^^^^^^^^^
62
63 For flawless deadlock detection wait-for graph must be complete.
64 It means that when a thread starts waiting it needs to know *all* its
65 blockers, and call wt_thd_will_wait_for() for every one of them.
66 Otherwise two phenomena should be expected:
67
68 1. Fuzzy timeouts:
69
70 thread A needs to get a lock, and is blocked by a thread B.
71 it waits.
72 Just before the timeout thread B releases the lock.
73 thread A is ready to grab the lock but discovers that it is also
74 blocked by a thread C.
75 It waits and times out.
76
77 As a result thread A has waited two timeout intervals, instead of one.
78
79 2. Unreliable cycle detection:
80
81 Thread A waits for threads B and C
82 Thread C waits for D
83 Thread D wants to start waiting for A
84
85 one can see immediately that thread D creates a cycle, and thus
86 a deadlock is detected.
87
88 But if thread A would only wait for B, and start waiting for C
89 when B would unlock, thread D would be allowed to wait, a deadlock
90 would be only detected when B unlocks or somebody times out.
91
  These two phenomena don't affect correctness, and strictly speaking,
93 the caller is not required to call wt_thd_will_wait_for() for *all*
94 blockers - it may optimize wt_thd_will_wait_for() calls. But they
95 may be perceived as bugs by users, it must be understood that such
96 an optimization comes with its price.
97
98 Usage
99 ^^^^^
100
101 First, the wt* subsystem must be initialized by calling
102 wt_init(). In the server you don't need to do it, it's done
103 in mysqld.cc.
104
105 Similarly, wt_end() frees wt* structures, should be called
106 at the end, but in the server mysqld.cc takes care of that.
107
108 Every WT_THD should be initialized with wt_thd_lazy_init().
109 After that they can be used in other wt_thd_* calls.
110 Before discarding, WT_THD should be free'd with
111 wt_thd_destroy(). In the server both are handled in sql_class.cc,
112 it's an error to try to do it manually.
113
114 To use the deadlock detection one needs to use this thread's WT_THD,
115 call wt_thd_will_wait_for() for every thread it needs to wait on,
116 then call wt_thd_cond_timedwait(). When thread releases a resource
117 it should call wt_thd_release() (or wt_thd_release_all()) - it will
118 notify (send a signal) threads waiting in wt_thd_cond_timedwait(),
119 if appropriate.
120
121 Just like with pthread's cond_wait, there could be spurious
122 wake-ups from wt_thd_cond_timedwait(). A caller is expected to
123 handle that (that is, to re-check the blocking criteria).
124
125 wt_thd_will_wait_for() and wt_thd_cond_timedwait() return either
126 WT_OK or WT_DEADLOCK. Additionally wt_thd_cond_timedwait() can return
127 WT_TIMEOUT. Out of memory and other fatal errors are reported as
128 WT_DEADLOCK - and a transaction must be aborted just the same.
129
130 Configuration
131 ^^^^^^^^^^^^^
132 There are four config variables. Two deadlock search depths - short and
133 long - and two timeouts. Deadlock search is performed with the short
134 depth on every wt_thd_will_wait_for() call. wt_thd_cond_timedwait()
135 waits with a short timeout, performs a deadlock search with the long
136 depth, and waits with a long timeout. As most deadlock cycles are supposed
137 to be short, most deadlocks will be detected at once, and waits will
138 rarely be necessary.
139
140 These config variables are thread-local. Different threads may have
141 different search depth and timeout values.
142
143 Also, deadlock detector supports different killing strategies, the victim
144 in a deadlock cycle is selected based on the "weight". See "weight"
145 description in waiting_threads.h for details. It's up to the caller to
146 set weights accordingly.
147
148 Status
149 ^^^^^^
  We calculate the number of successful waits (WT_OK returned from
151 wt_thd_cond_timedwait()), a number of timeouts, a deadlock cycle
152 length distribution - number of deadlocks with every length from
153 1 to WT_CYCLE_STATS, and a wait time distribution - number
154 of waits with a time from 1 us to 1 min in WT_WAIT_STATS
155 intervals on a log e scale.
156
157 Sample usage as was done in the Maria engine
158 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
159 - in class THD, THD::transaction had a WT_THD object; there were session
160 variables to set the short/long depth/timeout.
161 - when mysqld started, wt_init() was called; when it ended, wt_end() was
162 called.
163 - in THD's constructor,
164 wt_thd_lazy_init(&transaction.wt, &variables.wt_deadlock_search_depth_short,
165 &variables.wt_timeout_short,
166 &variables.wt_deadlock_search_depth_long,
167 &variables.wt_timeout_long);
168 - in THD::cleanup():
169 wt_thd_destroy(&transaction.wt);
170 - this was sufficient to make the deadlock-detector available to the Maria
171 engine (which can grab THD); the engine used it this way:
172 - when it wrote a row, and hit a duplicate key, it would find who wrote this
173 key, the "blocker" transaction. If "blocker" had committed, duplicate key
174 error would be sent. Otherwise, we would wait for it, in the following code
175 snippet (originally from storage/maria/ma_write.c). After the blocker is
176 gone, we would retry the write:
177
178 while (keyinfo->ck_insert(info,
179 (*keyinfo->make_key)(info, &int_key, i, buff, record,
180 filepos, info->trn->trid)))
181 {
182 / * we got a write error * /
183 if error is not "duplicate key" then return error;
184 info->dup_key_trid has the culprit:
185 if the culprit is ourselves then return error;
186 otherwise:
187 blocker= trnman_trid_to_trn(info->trn, info->dup_key_trid);
188 / *
189 if blocker TRN was not found, it means that the conflicting
190 transaction was committed long time ago. It could not be
191 aborted, as it would have to wait on the key tree lock
192 to remove the conflicting key it has inserted.
193 * /
194 if (!blocker || blocker->commit_trid != ~(TrID)0)
195 { / * committed * /
196 if (blocker)
197 pthread_mutex_unlock(& blocker->state_lock);
198 rw_unlock(&keyinfo->root_lock);
199 goto err;
200 }
201 / * release root_lock to let blocker finish its work * /
202 rw_unlock(&keyinfo->root_lock);
203 {
204 / * running. now we wait * /
205 WT_RESOURCE_ID rc;
206 int res;
207 const char *old_proc_info;
208
209 rc.type= &ma_rc_dup_unique;
210 rc.value= (intptr)blocker;
211 res= wt_thd_will_wait_for(info->trn->wt, blocker->wt, & rc);
212 if (res != WT_OK)
213 {
214 pthread_mutex_unlock(& blocker->state_lock);
215 my_errno= HA_ERR_LOCK_DEADLOCK;
216 goto err;
217 }
218 old_proc_info= proc_info_hook(0,
219 "waiting for a resource",
220 __func__, __FILE__, __LINE__);
221 res= wt_thd_cond_timedwait(info->trn->wt, & blocker->state_lock);
222 proc_info_hook(0, old_proc_info, __func__, __FILE__, __LINE__);
223
224 pthread_mutex_unlock(& blocker->state_lock);
225 if (res != WT_OK)
226 {
227 my_errno= res == WT_TIMEOUT ? HA_ERR_LOCK_WAIT_TIMEOUT
228 : HA_ERR_LOCK_DEADLOCK;
229 goto err;
230 }
231 / * if we come here, blocker has rolled back or committed,
232 so is gone, we can retry the ck_insert() * /
233 }
234 rw_wrlock(&keyinfo->root_lock);
235 #ifndef MARIA_CANNOT_ROLLBACK
236 keyinfo->version++;
237 #endif
238 }
239 - ma_rc_dup_unique was:
240 / * a WT_RESOURCE_TYPE for transactions waiting on a unique key conflict * /
241 WT_RESOURCE_TYPE ma_rc_dup_unique={ wt_resource_id_memcmp, 0};
242 - When a Maria transaction would commit or rollback it would call:
243 / * Wake up threads waiting for this transaction * /
244 static void wt_thd_release_self(TRN *trn)
245 {
246 if (trn->wt)
247 {
248 WT_RESOURCE_ID rc;
249 rc.type= &ma_rc_dup_unique;
250 rc.value= (intptr)trn;
251 wt_thd_release(trn->wt, & rc);
252 trn->wt= 0;
253 }
254 }
255
256 Tests
257 ^^^^^
258 unittest/mysys/waiting_threads-t.c, currently disabled.
259 */
260
261 /*
  Note that if your lock system satisfies the following condition:

    there exist four lock levels A, B, C, D, such that
265 A is compatible with B
266 A is not compatible with C
267 D is not compatible with B
268
269 (example A=IX, B=IS, C=S, D=X)
270
271 you need to include lock level in the resource identifier - a
272 thread waiting for lock of the type A on resource R and another
273 thread waiting for lock of the type B on resource R should wait on
274 different WT_RESOURCE structures, on different {lock, resource}
275 pairs. Otherwise the following is possible:
276
277 thread1> take S-lock on R
278 thread2> take IS-lock on R
279 thread3> wants X-lock on R, starts waiting for threads 1 and 2 on R.
280 thread3 is killed (or timeout or whatever)
281 WT_RESOURCE structure for R is still in the hash, as it has two owners
282 thread4> wants an IX-lock on R
283 WT_RESOURCE for R is found in the hash, thread4 starts waiting on it.
284 !! now thread4 is waiting for both thread1 and thread2
285 !! while, in fact, IX-lock and IS-lock are compatible and
286 !! thread4 should not wait for thread2.
287 */
288
289 #include <waiting_threads.h>
290 #include <m_string.h>
291
/* status variables */

/**
  preset table of wait intervals
  (bucket upper bounds, log e scale; filled in by wt_init())
*/
ulonglong wt_wait_table[WT_WAIT_STATS];
/**
  wait time distribution (log e scale)
  the extra last element counts waits that ended with a timeout,
  see increment_wait_stats()
*/
uint32 wt_wait_stats[WT_WAIT_STATS+1];
/**
  distribution of cycle lengths
  first column tells whether this was during short or long detection
*/
uint32 wt_cycle_stats[2][WT_CYCLE_STATS+1];
/* number of successful waits, see increment_success_stats() */
uint32 wt_success_stats;

/*
  rwlocks serializing updates of the counters above; only actually
  taken when SAFE_STATISTICS is defined, hence "unused" otherwise
*/
static my_atomic_rwlock_t
  cycle_stats_lock MY_ATTRIBUTE((unused)),
  wait_stats_lock MY_ATTRIBUTE((unused)),
  success_stats_lock MY_ATTRIBUTE((unused));

#ifdef SAFE_STATISTICS
/* atomically increment a statistics counter under its rwlock */
#define incr(VAR, LOCK)                       \
  do {                                        \
    my_atomic_rwlock_wrlock(&(LOCK));         \
    my_atomic_add32(&(VAR), 1);               \
    my_atomic_rwlock_wrunlock(&(LOCK));       \
  } while(0)
#else
/* unsynchronized increment: counters may drift slightly, but it's cheap */
#define incr(VAR,LOCK) do { (VAR)++; } while(0)
#endif
324
/** bump the counter of successful waits (see "Status" in the file header) */
static void increment_success_stats()
{
  incr(wt_success_stats, success_stats_lock);
}
329
/**
  Record a detected deadlock cycle in the length distribution.

  @param depth  cycle length; anything >= WT_CYCLE_STATS is folded
                into the last (overflow) bucket
  @param slot   selects the row of wt_cycle_stats[][] - callers pass the
                result of comparing max_depth against the long search depth
*/
static void increment_cycle_stats(uint depth, uint slot)
{
  uint bucket= depth < WT_CYCLE_STATS ? depth : WT_CYCLE_STATS;
  incr(wt_cycle_stats[slot][bucket], cycle_stats_lock);
}
336
/**
  Record one wait in the wait-time distribution.

  @param waited  time spent waiting (scaled by /10 below to match the
                 units of wt_wait_table[] - presumably the caller passes
                 tenths of a microsecond; confirm against the caller)
  @param ret     mysql_cond_timedwait() result; ETIMEDOUT goes into the
                 dedicated overflow bucket
*/
static void increment_wait_stats(ulonglong waited, int ret)
{
  uint bucket= WT_WAIT_STATS;   /* the "timed out" bucket */
  if (ret != ETIMEDOUT)
  {
    ulonglong scaled= waited/10;
    /* find the first interval whose upper bound covers the wait time */
    for (bucket= 0; bucket < WT_WAIT_STATS; bucket++)
      if (scaled <= wt_wait_table[bucket])
        break;
  }
  incr(wt_wait_stats[bucket], wait_stats_lock);
}
346
347 /*
348 'lock' protects 'owners', 'state', and 'waiter_count'
349 'id' is read-only
350
351 a resource is picked up from a hash in a lock-free manner
352 it's returned pinned, so it cannot be freed at once
353 but it may be freed right after the pin is removed
354 to free a resource it should
355 1. have no owners
356 2. have no waiters
357
358 two ways to access a resource:
359 1. find it in a hash
360 - it's returned pinned.
361 a) take a lock in exclusive mode
362 b) check the state, it should be ACTIVE to be usable
363 c) unpin
364 2. by a direct reference
365 - could only used if a resource cannot be freed
366 e.g. accessing a resource by thd->waiting_for is safe,
367 a resource cannot be freed as there's a thread waiting for it
368 */
/*
  A resource that threads own and wait for; lives in the lock-free hash.
  'lock' protects 'owners', 'state', and 'waiter_count'; 'id' is read-only
  (see the access rules in the comment above).
*/
struct st_wt_resource {
  WT_RESOURCE_ID  id;
  uint            waiter_count;     /* number of threads waiting on this resource */
  enum { ACTIVE, FREE } state;      /* must be ACTIVE for the resource to be usable */
#ifndef DBUG_OFF
  mysql_mutex_t  *cond_mutex;       /* a mutex for the 'cond' below */
#endif
  /*
    before the 'lock' all elements are mutable, after (and including) -
    immutable in the sense that lf_hash_insert() won't memcpy() over them.
    See wt_init().
  */
#ifdef WT_RWLOCKS_USE_MUTEXES
  /*
    we need a special rwlock-like 'lock' to allow readers bypass
    waiting writers, otherwise readers can deadlock. For example:

    A waits on resource x, owned by B, B waits on resource y, owned
    by A, we have a cycle (A->x->B->y->A)
    Both A and B start deadlock detection:

      A locks x          B locks y
      A goes deeper      B goes deeper
      A locks y          B locks x

    with mutexes it would deadlock. With rwlocks it won't, as long
    as both A and B are taking read locks (and they do).
    But other threads may take write locks. Assume there's
    C who wants to start waiting on x, and D who wants to start
    waiting on y.

      A read-locks x       B read-locks y
      A goes deeper        B goes deeper
  =>  C write-locks x (to add a new edge)   D write-locks y
      .. C is blocked      D is blocked
      A read-locks y       B read-locks x

    Now, if a read lock can bypass a pending write lock request, we're fine.
    If it can not, we have a deadlock.

    writer starvation is technically possible, but unlikely, because
    the contention is expected to be low.
  */
  struct {
    mysql_cond_t   cond;
    mysql_mutex_t  mutex;
    uint readers: 16;
    /* NOTE(review): no code visible here updates pending_writers - confirm
       whether it is maintained elsewhere or is vestigial */
    uint pending_writers: 15;
    uint write_locked: 1;
  } lock;
#else
  rw_lock_t lock;
#endif
  mysql_cond_t    cond;             /* the corresponding mutex is provided by the caller */
  DYNAMIC_ARRAY   owners;           /* array of WT_THD* currently owning this resource */
};
425
426 #ifdef WT_RWLOCKS_USE_MUTEXES
/* initialize the mutex/condvar pair that emulates an rwlock for 'rc' */
static void rc_rwlock_init(WT_RESOURCE *rc)
{
  mysql_mutex_init(0, &rc->lock.mutex, MY_MUTEX_INIT_FAST);
  mysql_cond_init(0, &rc->lock.cond, 0);
}
/* destroy the emulated rwlock; it must be completely unlocked */
static void rc_rwlock_destroy(WT_RESOURCE *rc)
{
  DBUG_ASSERT(rc->lock.readers == 0);
  DBUG_ASSERT(rc->lock.write_locked == 0);
  mysql_mutex_destroy(&rc->lock.mutex);
  mysql_cond_destroy(&rc->lock.cond);
}
/*
  take a read lock on 'rc'. Readers only have to wait out an active
  writer - pending writers do not block them (see the comment in
  struct st_wt_resource for why that is essential).
*/
static void rc_rdlock(WT_RESOURCE *rc)
{
  DBUG_PRINT("wt", ("TRYLOCK resid=%ld for READ", (ulong)rc->id.value));
  mysql_mutex_lock(&rc->lock.mutex);
  for (;;)
  {
    if (!rc->lock.write_locked)
      break;
    mysql_cond_wait(&rc->lock.cond, &rc->lock.mutex);
  }
  rc->lock.readers++;
  mysql_mutex_unlock(&rc->lock.mutex);
  DBUG_PRINT("wt", ("LOCK resid=%ld for READ", (ulong)rc->id.value));
}
/* take a write lock on 'rc': wait until there is no writer and no readers */
static void rc_wrlock(WT_RESOURCE *rc)
{
  DBUG_PRINT("wt", ("TRYLOCK resid=%ld for WRITE", (ulong)rc->id.value));
  mysql_mutex_lock(&rc->lock.mutex);
  for (;;)
  {
    if (!rc->lock.write_locked && rc->lock.readers == 0)
      break;
    mysql_cond_wait(&rc->lock.cond, &rc->lock.mutex);
  }
  rc->lock.write_locked= 1;
  mysql_mutex_unlock(&rc->lock.mutex);
  DBUG_PRINT("wt", ("LOCK resid=%ld for WRITE", (ulong)rc->id.value));
}
/*
  release a read or a write lock on 'rc'. A writer always wakes the
  waiters; a reader only when it was the last one.
*/
static void rc_unlock(WT_RESOURCE *rc)
{
  int wake_waiters;
  DBUG_PRINT("wt", ("UNLOCK resid=%ld", (ulong)rc->id.value));
  mysql_mutex_lock(&rc->lock.mutex);
  if (rc->lock.write_locked)
  {
    rc->lock.write_locked= 0;
    wake_waiters= 1;
  }
  else
    wake_waiters= (--rc->lock.readers == 0);
  if (wake_waiters)
    mysql_cond_broadcast(&rc->lock.cond);
  mysql_mutex_unlock(&rc->lock.mutex);
}
472 #else
/* initialize the native rwlock of 'rc' */
static void rc_rwlock_init(WT_RESOURCE *rc)
{
  my_rwlock_init(&rc->lock, 0);
}
/* destroy the native rwlock of 'rc' */
static void rc_rwlock_destroy(WT_RESOURCE *rc)
{
  rwlock_destroy(&rc->lock);
}
/* take a read lock on 'rc' */
static void rc_rdlock(WT_RESOURCE *rc)
{
  DBUG_PRINT("wt", ("TRYLOCK resid=%ld for READ", (ulong)rc->id.value));
  rw_rdlock(&rc->lock);
  DBUG_PRINT("wt", ("LOCK resid=%ld for READ", (ulong)rc->id.value));
}
/* take a write lock on 'rc' */
static void rc_wrlock(WT_RESOURCE *rc)
{
  DBUG_PRINT("wt", ("TRYLOCK resid=%ld for WRITE", (ulong)rc->id.value));
  rw_wrlock(&rc->lock);
  DBUG_PRINT("wt", ("LOCK resid=%ld for WRITE", (ulong)rc->id.value));
}
/* release a read or a write lock on 'rc' */
static void rc_unlock(WT_RESOURCE *rc)
{
  DBUG_PRINT("wt", ("UNLOCK resid=%ld", (ulong)rc->id.value));
  rw_unlock(&rc->lock);
}
498 #endif
499
500 /*
501 All resources are stored in a lock-free hash. Different threads
502 may add new resources and perform deadlock detection concurrently.
503 */
504 static LF_HASH reshash;
505
506 /**
507 WT_RESOURCE constructor
508
509 It's called from lf_hash and takes a pointer to an LF_SLIST instance.
510 WT_RESOURCE is located at arg+sizeof(LF_SLIST)
511 */
/*
  WT_RESOURCE constructor, invoked by the lf_hash allocator.
  'arg' points at an LF_SLIST instance; the WT_RESOURCE payload
  follows at arg+LF_HASH_OVERHEAD.
*/
static void wt_resource_init(uchar *arg)
{
  WT_RESOURCE *res= (WT_RESOURCE*)(arg + LF_HASH_OVERHEAD);
  DBUG_ENTER("wt_resource_init");

  /* zero everything first, then build the pieces that need real init */
  memset(res, 0, sizeof(*res));
  rc_rwlock_init(res);
  mysql_cond_init(0, &res->cond, 0);
  my_init_dynamic_array(&res->owners, sizeof(WT_THD *), 0, 5);
  DBUG_VOID_RETURN;
}
523
524 /**
525 WT_RESOURCE destructor
526
527 It's called from lf_hash and takes a pointer to an LF_SLIST instance.
528 WT_RESOURCE is located at arg+sizeof(LF_SLIST)
529 */
/*
  WT_RESOURCE destructor, invoked by the lf_hash allocator.
  'arg' points at an LF_SLIST instance; the WT_RESOURCE payload
  follows at arg+LF_HASH_OVERHEAD.
*/
static void wt_resource_destroy(uchar *arg)
{
  WT_RESOURCE *res= (WT_RESOURCE*)(arg + LF_HASH_OVERHEAD);
  DBUG_ENTER("wt_resource_destroy");

  DBUG_ASSERT(res->owners.elements == 0);
  delete_dynamic(&res->owners);
  mysql_cond_destroy(&res->cond);
  rc_rwlock_destroy(res);
  DBUG_VOID_RETURN;
}
541
/* initialize the waiting-threads subsystem: hash, statistics, locks */
void wt_init()
{
  DBUG_ENTER("wt_init");
  /* guard against calling wt_init() twice without wt_end() in between */
  DBUG_ASSERT(reshash.alloc.constructor != wt_resource_init);

  lf_hash_init(&reshash, sizeof(WT_RESOURCE), LF_HASH_UNIQUE, 0,
               sizeof_WT_RESOURCE_ID, 0, 0);
  reshash.alloc.constructor= wt_resource_init;
  reshash.alloc.destructor= wt_resource_destroy;
  /*
    Note a trick: we initialize the hash with the real element size,
    but fix it later to a shortened element size. This way
    the allocator will allocate elements correctly, but
    lf_hash_insert() will only overwrite part of the element with memcpy().
    lock, condition, and dynamic array will be intact.
  */
  reshash.element_size= offsetof(WT_RESOURCE, lock);
  memset(wt_wait_stats, 0, sizeof(wt_wait_stats));
  memset(wt_cycle_stats, 0, sizeof(wt_cycle_stats));
  wt_success_stats= 0;
  { /* initialize wt_wait_table[]. from 1 us to 1 min, log e scale */
    int i;
    double from= log(1);   /* 1 us */
    double to= log(60e6);  /* 1 min */
    for (i= 0; i < WT_WAIT_STATS; i++)
    {
      /* bucket bounds are evenly spaced on the log scale */
      wt_wait_table[i]= (ulonglong)exp((to-from)/(WT_WAIT_STATS-1)*i+from);
      /* every bound must be distinct or the distribution would be skewed */
      DBUG_ASSERT(i == 0 || wt_wait_table[i-1] != wt_wait_table[i]);
    }
  }
  my_atomic_rwlock_init(&cycle_stats_lock);
  my_atomic_rwlock_init(&success_stats_lock);
  my_atomic_rwlock_init(&wait_stats_lock);
  DBUG_VOID_RETURN;
}
577
wt_end()578 void wt_end()
579 {
580 DBUG_ENTER("wt_end");
581
582 DBUG_ASSERT(reshash.count == 0);
583 lf_hash_destroy(&reshash);
584 my_atomic_rwlock_destroy(&cycle_stats_lock);
585 my_atomic_rwlock_destroy(&success_stats_lock);
586 my_atomic_rwlock_destroy(&wait_stats_lock);
587 DBUG_VOID_RETURN;
588 }
589
590 /**
591 Lazy WT_THD initialization
592
593 Cheap initialization of WT_THD. Only initialize fields that don't require
594 memory allocations - basically, it only does assignments. The rest of the
595 WT_THD structure will be initialized on demand, on the first use.
596 This allows one to initialize lazily all WT_THD structures, even if some
597 (or even most) of them will never be used for deadlock detection.
598
599 @param ds a pointer to deadlock search depth short value
600 @param ts a pointer to deadlock timeout short value
601 @param dl a pointer to deadlock search depth long value
602 @param tl a pointer to deadlock timeout long value
603
604 @note these are pointers to values, and WT_THD stores them as pointers.
605 It allows one later to change search depths and timeouts for existing
606 threads. It also means that the pointers must stay valid for the lifetime
607 of WT_THD.
608 */
void wt_thd_lazy_init(WT_THD *thd, const ulong *ds, const ulong *ts,
                      const ulong *dl, const ulong *tl)
{
  DBUG_ENTER("wt_thd_lazy_init");
  /*
    only pointers to the config values are stored, so callers can change
    depths/timeouts later; the pointed-to values must outlive the WT_THD
  */
  thd->deadlock_search_depth_short= ds;
  thd->timeout_short= ts;
  thd->deadlock_search_depth_long= dl;
  thd->timeout_long= tl;
  thd->waiting_for= 0;
  thd->weight= 0;
  /* dynamic array is also initialized lazily - without memory allocations */
  my_init_dynamic_array(&thd->my_resources, sizeof(WT_RESOURCE *), 0, 5);
#ifndef DBUG_OFF
  thd->name= my_thread_name();
#endif
  DBUG_VOID_RETURN;
}
626
627 /**
628 Finalize WT_THD initialization
629
630 After lazy WT_THD initialization, parts of the structure are still
631 uninitialized. This function completes the initialization, allocating
632 memory, if necessary. It's called automatically on demand, when WT_THD
633 is about to be used.
634 */
/*
  complete the lazy WT_THD initialization: allocate lf_hash pins on
  first use.

  @return 0 on success, non-zero if pin allocation failed
*/
static int fix_thd_pins(WT_THD *thd)
{
  if (thd->pins != 0)
    return 0;                     /* already fully initialized */

  thd->pins= lf_hash_get_pins(&reshash);
#ifndef DBUG_OFF
  thd->name= my_thread_name();
#endif
  return thd->pins == 0;          /* non-zero means out of memory */
}
646
/*
  free a WT_THD. The thread must not be waiting and must not own
  any resources at this point.
*/
void wt_thd_destroy(WT_THD *thd)
{
  DBUG_ENTER("wt_thd_destroy");

  DBUG_ASSERT(thd->waiting_for == 0);
  DBUG_ASSERT(thd->my_resources.elements == 0);

  delete_dynamic(&thd->my_resources);
  /* pins exist only if fix_thd_pins() ever ran for this thread */
  if (thd->pins != 0)
    lf_hash_put_pins(thd->pins);
  DBUG_VOID_RETURN;
}
660 /**
661 Trivial resource id comparison function - bytewise memcmp.
662
663 It can be used in WT_RESOURCE_TYPE structures where bytewise
664 comparison of values is sufficient.
665 */
wt_resource_id_memcmp(const void * a,const void * b)666 my_bool wt_resource_id_memcmp(const void *a, const void *b)
667 {
668 /* we use the fact that there's no padding in the middle of WT_RESOURCE_ID */
669 compile_time_assert(offsetof(WT_RESOURCE_ID, type) == sizeof(ulonglong));
670 return memcmp(a, b, sizeof_WT_RESOURCE_ID);
671 }
672
673 /**
674 arguments for the recursive deadlock_search function
675 */
struct deadlock_arg {
  WT_THD * const thd;          /**< starting point of a search */
  uint const max_depth;        /**< search depth limit */
  WT_THD *victim;              /**< a thread to be killed to resolve a deadlock */
  WT_RESOURCE *last_locked_rc; /**< rc still locked by the callee; the caller
                                    must unlock it - see the comment at the
                                    end of deadlock_search() */
};
682
683 /**
684 helper function to change the victim, according to the weight
685 */
change_victim(WT_THD * found,struct deadlock_arg * arg)686 static void change_victim(WT_THD* found, struct deadlock_arg *arg)
687 {
688 if (found->weight < arg->victim->weight)
689 {
690 if (arg->victim != arg->thd)
691 {
692 rc_unlock(arg->victim->waiting_for); /* release the previous victim */
693 DBUG_ASSERT(arg->last_locked_rc == found->waiting_for);
694 }
695 arg->victim= found;
696 arg->last_locked_rc= 0;
697 }
698 }
699
700 /**
701 recursive loop detection in a wait-for graph with a limited search depth
702 */
static int deadlock_search(struct deadlock_arg *arg, WT_THD *blocker,
                           uint depth)
{
  WT_RESOURCE *rc, *volatile *shared_ptr= &blocker->waiting_for;
  WT_THD *cursor;
  uint i;
  int ret= WT_OK;
  DBUG_ENTER("deadlock_search");
  DBUG_PRINT("wt", ("enter: thd=%s, blocker=%s, depth=%u",
                    arg->thd->name, blocker->name, depth));

  LF_REQUIRE_PINS(1);

  arg->last_locked_rc= 0;

  if (depth > arg->max_depth)
  {
    DBUG_PRINT("wt", ("exit: WT_DEPTH_EXCEEDED (early)"));
    DBUG_RETURN(WT_DEPTH_EXCEEDED);
  }

retry:
  /*
    safe dereference as explained in lf_alloc-pin.c
    (in short: protects against lf_alloc_free() in lf_hash_delete())
  */
  do
  {
    rc= *shared_ptr;
    lf_pin(arg->thd->pins, 0, rc);
  } while (rc != *shared_ptr && LF_BACKOFF);

  if (rc == 0)
  {
    /* blocker is not waiting for anything - no edge to follow */
    DBUG_PRINT("wt", ("exit: OK (early)"));
    DBUG_RETURN(0);
  }

  rc_rdlock(rc);
  if (rc->state != ACTIVE || *shared_ptr != rc)
  {
    /* blocker is not waiting on this resource anymore */
    rc_unlock(rc);
    lf_unpin(arg->thd->pins, 0);
    goto retry;
  }
  /* as the state is locked, we can unpin now */
  lf_unpin(arg->thd->pins, 0);

  /*
    Below is not a pure depth-first search. It's a depth-first with a
    slightest hint of breadth-first. Depth-first is:

      check(element, X):
        foreach current in element->nodes[] do:
          if current == X return error;
          check(current, X);

    while we do

      check(element, X):
        foreach current in element->nodes[] do:
          if current == X return error;
        foreach current in element->nodes[] do:
          check(current, X);

    preferring shorter deadlocks over longer ones.
  */
  for (i= 0; i < rc->owners.elements; i++)
  {
    cursor= *dynamic_element(&rc->owners, i, WT_THD**);
    /*
      We're only looking for (and detecting) cycles that include 'arg->thd'.
      That is, only deadlocks that *we* have created. For example,
        thd->A->B->thd
      (thd waits for A, A waits for B, while B is waiting for thd).
      While walking the graph we can encounter other cycles, e.g.
        thd->A->B->C->A
      This will not be detected. Instead we will walk it in circles until
      the search depth limit is reached (the latter guarantees that an
      infinite loop is impossible). We expect the thread that has created
      the cycle (one of A, B, and C) to detect its deadlock.
    */
    if (cursor == arg->thd)
    {
      ret= WT_DEADLOCK;
      /* second argument selects the short- or long-search statistics row */
      increment_cycle_stats(depth, arg->max_depth ==
                                   *arg->thd->deadlock_search_depth_long);
      arg->victim= cursor;
      goto end;
    }
  }
  for (i= 0; i < rc->owners.elements; i++)
  {
    cursor= *dynamic_element(&rc->owners, i, WT_THD**);
    switch (deadlock_search(arg, cursor, depth+1)) {
    case WT_OK:
      break;
    case WT_DEPTH_EXCEEDED:
      ret= WT_DEPTH_EXCEEDED;
      break;
    case WT_DEADLOCK:
      ret= WT_DEADLOCK;
      change_victim(cursor, arg);   /* also sets arg->last_locked_rc to 0 */
      i= rc->owners.elements;       /* jump out of the loop */
      break;
    default:
      DBUG_ASSERT(0);
    }
    /* unlock the rc the recursive call left locked, unless it became
       the victim's rc (then change_victim() zeroed last_locked_rc) */
    if (arg->last_locked_rc)
      rc_unlock(arg->last_locked_rc);
  }
end:
  /*
    Note that 'rc' is locked in this function, but it's never unlocked here.
    Instead it's saved in arg->last_locked_rc and the *caller* is
    expected to unlock it. It's done to support different killing
    strategies. This is how it works:
    Assuming a graph

      thd->A->B->C->thd

    deadlock_search() function starts from thd, locks it (in fact it locks not
    a thd, but a resource it is waiting on, but below, for simplicity, I'll
    talk about "locking a thd"). Then it goes down recursively, locks A, and so
    on. Goes down recursively, locks B. Goes down recursively, locks C.
    Notices that C is waiting on thd. Deadlock detected. Sets arg->victim=thd.
    Returns from the last deadlock_search() call. C stays locked!
    Now it checks whether C is a more appropriate victim than 'thd'.
    If yes - arg->victim=C, otherwise C is unlocked. Returns. B stays locked.
    Now it checks whether B is a more appropriate victim than arg->victim.
    If yes - old arg->victim is unlocked and arg->victim=B,
    otherwise B is unlocked. Return.
    And so on.

    In short, a resource is locked in a frame. But it's not unlocked in the
    same frame, it's unlocked by the caller, and only after the caller checks
    that it doesn't need to use current WT_THD as a victim. If it does - the
    lock is kept and the old victim's resource is unlocked. When the recursion
    is unrolled and we are back to deadlock() function, there are only two
    locks left - on thd and on the victim.
  */
  arg->last_locked_rc= rc;
  DBUG_PRINT("wt", ("exit: %s",
                    ret == WT_DEPTH_EXCEEDED ? "WT_DEPTH_EXCEEDED" :
                    ret ? "WT_DEADLOCK" : "OK"));
  DBUG_RETURN(ret);
}
851
852 /**
853 Deadlock detection in a wait-for graph
854
855 A wrapper for recursive deadlock_search() - prepares deadlock_arg structure,
856 invokes deadlock_search(), increments statistics, notifies the victim.
857
858 @param thd thread that is going to wait. Deadlock is detected
859 if, while walking the graph, we reach a thread that
860 is waiting on thd
861 @param blocker starting point of a search. In wt_thd_cond_timedwait()
862 it's thd, in wt_thd_will_wait_for() it's a thread that
863 thd is going to wait for
864 @param depth starting search depth. In general it's the number of
865 edges in the wait-for graph between thd and the
866 blocker. Practically only two values are used (and
867 supported) - when thd == blocker it's 0, when thd
868 waits directly for blocker, it's 1
869 @param max_depth search depth limit
870 */
static int deadlock(WT_THD *thd, WT_THD *blocker, uint depth,
                    uint max_depth)
{
  struct deadlock_arg arg= {thd, max_depth, 0, 0};
  int ret;
  DBUG_ENTER("deadlock");
  /* only depth 0 (thd == blocker) and depth 1 are supported, see above */
  DBUG_ASSERT(depth < 2);
  ret= deadlock_search(&arg, blocker, depth);
  if (ret == WT_DEPTH_EXCEEDED)
  {
    /* a too-deep graph is treated as "no deadlock", only the stats record it */
    increment_cycle_stats(WT_CYCLE_STATS, max_depth ==
                                          *thd->deadlock_search_depth_long);
    ret= WT_OK;
  }
  /*
    if we started with depth==1, blocker was never considered for a victim
    in deadlock_search(). Do it here.
  */
  if (ret == WT_DEADLOCK && depth)
    change_victim(blocker, &arg);
  if (arg.last_locked_rc)
  {
    /*
      Special return code if there's nobody to wait for.

      depth == 0 means that we start the search from thd (thd == blocker).
      ret == WT_OK means that no cycle was found and
        arg.last_locked_rc == thd->waiting_for.
      and arg.last_locked_rc->owners.elements == 0 means that
        (applying the rule above) thd->waiting_for->owners.elements == 0,
        and thd doesn't have anybody to wait for.
    */
    if (depth == 0 && ret == WT_OK && arg.last_locked_rc->owners.elements == 0)
    {
      DBUG_ASSERT(thd == blocker);
      DBUG_ASSERT(arg.last_locked_rc == thd->waiting_for);
      ret= WT_FREE_TO_GO;
    }
    rc_unlock(arg.last_locked_rc);
  }
  /* notify the victim, if appropriate */
  if (ret == WT_DEADLOCK && arg.victim != thd)
  {
    DBUG_PRINT("wt", ("killing %s", arg.victim->name));
    /* the victim's rc is still locked here (see deadlock_search()),
       so setting 'killed' and broadcasting is race-free */
    arg.victim->killed= 1;
    mysql_cond_broadcast(&arg.victim->waiting_for->cond);
    rc_unlock(arg.victim->waiting_for);
    /* somebody else dies, we are free to go */
    ret= WT_OK;
  }
  DBUG_RETURN(ret);
}
922
923
924 /**
925 Delete an element from reshash if it has no waiters or owners
926
927 rc->lock must be locked by the caller and it's unlocked on return.
928 */
unlock_lock_and_free_resource(WT_THD * thd,WT_RESOURCE * rc)929 static int unlock_lock_and_free_resource(WT_THD *thd, WT_RESOURCE *rc)
930 {
931 uint keylen;
932 const void *key;
933 DBUG_ENTER("unlock_lock_and_free_resource");
934
935 DBUG_ASSERT(rc->state == ACTIVE);
936
937 if (rc->owners.elements || rc->waiter_count)
938 {
939 DBUG_PRINT("wt", ("nothing to do, %u owners, %u waiters",
940 rc->owners.elements, rc->waiter_count));
941 rc_unlock(rc);
942 DBUG_RETURN(0);
943 }
944
945 if (fix_thd_pins(thd))
946 {
947 rc_unlock(rc);
948 DBUG_RETURN(1);
949 }
950
951 /* XXX if (rc->id.type->make_key) key= rc->id.type->make_key(&rc->id, &keylen); else */
952 {
953 key= &rc->id;
954 keylen= sizeof_WT_RESOURCE_ID;
955 }
956
957 /*
958 To free the element correctly we need to:
959 1. take its lock (already done).
960 2. set the state to FREE
961 3. release the lock
962 4. remove from the hash
963 */
964 rc->state= FREE;
965 rc_unlock(rc);
966 DBUG_RETURN(lf_hash_delete(&reshash, thd->pins, key, keylen) == -1);
967 }
968
969
970 /**
971 register the fact that thd is not waiting anymore
972
973 decrease waiter_count, clear waiting_for, free the resource if appropriate.
974 thd->waiting_for must be locked!
975 */
stop_waiting_locked(WT_THD * thd)976 static int stop_waiting_locked(WT_THD *thd)
977 {
978 int ret;
979 WT_RESOURCE *rc= thd->waiting_for;
980 DBUG_ENTER("stop_waiting_locked");
981
982 DBUG_ASSERT(rc->waiter_count);
983 DBUG_ASSERT(rc->state == ACTIVE);
984 rc->waiter_count--;
985 thd->waiting_for= 0;
986 ret= unlock_lock_and_free_resource(thd, rc);
987 DBUG_RETURN((thd->killed || ret) ? WT_DEADLOCK : WT_OK);
988 }
989
990 /**
991 register the fact that thd is not waiting anymore
992
993 locks thd->waiting_for and calls stop_waiting_locked().
994 */
stop_waiting(WT_THD * thd)995 static int stop_waiting(WT_THD *thd)
996 {
997 int ret;
998 WT_RESOURCE *rc= thd->waiting_for;
999 DBUG_ENTER("stop_waiting");
1000
1001 if (!rc)
1002 DBUG_RETURN(WT_OK);
1003 /*
1004 nobody's trying to free the resource now,
1005 as its waiter_count is guaranteed to be non-zero
1006 */
1007 rc_wrlock(rc);
1008 ret= stop_waiting_locked(thd);
1009 DBUG_RETURN(ret);
1010 }
1011
1012 /**
1013 notify the system that a thread needs to wait for another thread
1014
1015 called by a *waiter* to declare that it (thd) will wait for another
1016 thread (blocker) on a specific resource (resid).
1017 can be called many times, if many blockers own a blocking resource.
1018 but must always be called with the same resource id - a thread cannot
1019 wait for more than one resource at a time.
1020
1021 @return WT_OK or WT_DEADLOCK
1022
1023 As a new edge is added to the wait-for graph, a deadlock detection is
1024 performed for this new edge.
1025 */
wt_thd_will_wait_for(WT_THD * thd,WT_THD * blocker,const WT_RESOURCE_ID * resid)1026 int wt_thd_will_wait_for(WT_THD *thd, WT_THD *blocker,
1027 const WT_RESOURCE_ID *resid)
1028 {
1029 uint i;
1030 WT_RESOURCE *rc;
1031 DBUG_ENTER("wt_thd_will_wait_for");
1032
1033 LF_REQUIRE_PINS(3);
1034
1035 DBUG_PRINT("wt", ("enter: thd=%s, blocker=%s, resid=%lu",
1036 thd->name, blocker->name, (ulong)resid->value));
1037
1038 if (fix_thd_pins(thd))
1039 DBUG_RETURN(WT_DEADLOCK);
1040
1041 if (thd->waiting_for == 0)
1042 {
1043 uint keylen;
1044 const void *key;
1045 /* XXX if (restype->make_key) key= restype->make_key(resid, &keylen); else */
1046 {
1047 key= resid;
1048 keylen= sizeof_WT_RESOURCE_ID;
1049 }
1050
1051 DBUG_PRINT("wt", ("first blocker"));
1052
1053 retry:
1054 while ((rc= lf_hash_search(&reshash, thd->pins, key, keylen)) == 0)
1055 {
1056 WT_RESOURCE tmp;
1057
1058 DBUG_PRINT("wt", ("failed to find rc in hash, inserting"));
1059 memset(&tmp, 0, sizeof(tmp));
1060 tmp.id= *resid;
1061 tmp.state= ACTIVE;
1062
1063 if (lf_hash_insert(&reshash, thd->pins, &tmp) == -1) /* if OOM */
1064 DBUG_RETURN(WT_DEADLOCK);
1065 /*
1066 Two cases: either lf_hash_insert() failed - because another thread
1067 has just inserted a resource with the same id - and we need to retry.
1068 Or lf_hash_insert() succeeded, and then we need to repeat
1069 lf_hash_search() to find a real address of the newly inserted element.
1070 That is, we don't care what lf_hash_insert() has returned.
1071 And we need to repeat the loop anyway.
1072 */
1073 }
1074 if (rc == MY_ERRPTR)
1075 DBUG_RETURN(WT_DEADLOCK);
1076
1077 DBUG_PRINT("wt", ("found in hash rc=%p", rc));
1078
1079 rc_wrlock(rc);
1080 if (rc->state != ACTIVE)
1081 {
1082 DBUG_PRINT("wt", ("but it's not active, retrying"));
1083 /* Somebody has freed the element while we weren't looking */
1084 rc_unlock(rc);
1085 lf_hash_search_unpin(thd->pins);
1086 goto retry;
1087 }
1088
1089 lf_hash_search_unpin(thd->pins); /* the element cannot go away anymore */
1090 thd->waiting_for= rc;
1091 rc->waiter_count++;
1092 thd->killed= 0;
1093 }
1094 else
1095 {
1096 DBUG_ASSERT(thd->waiting_for->id.type == resid->type);
1097 DBUG_ASSERT(resid->type->compare(&thd->waiting_for->id, resid) == 0);
1098 DBUG_PRINT("wt", ("adding another blocker"));
1099
1100 /*
1101 we can safely access the resource here, it's in the hash as it has
1102 non-zero waiter_count
1103 */
1104 rc= thd->waiting_for;
1105 rc_wrlock(rc);
1106 DBUG_ASSERT(rc->waiter_count);
1107 DBUG_ASSERT(rc->state == ACTIVE);
1108
1109 if (thd->killed)
1110 {
1111 stop_waiting_locked(thd);
1112 DBUG_RETURN(WT_DEADLOCK);
1113 }
1114 }
1115 /*
1116 Another thread could be waiting on this resource for this very 'blocker'.
1117 In this case we should not add it to the list for the second time.
1118 */
1119 for (i= 0; i < rc->owners.elements; i++)
1120 if (*dynamic_element(&rc->owners, i, WT_THD**) == blocker)
1121 break;
1122 if (i >= rc->owners.elements)
1123 {
1124 if (push_dynamic(&blocker->my_resources, (void*)&rc))
1125 {
1126 stop_waiting_locked(thd);
1127 DBUG_RETURN(WT_DEADLOCK); /* deadlock and OOM use the same error code */
1128 }
1129 if (push_dynamic(&rc->owners, (void*)&blocker))
1130 {
1131 pop_dynamic(&blocker->my_resources);
1132 stop_waiting_locked(thd);
1133 DBUG_RETURN(WT_DEADLOCK);
1134 }
1135 }
1136 rc_unlock(rc);
1137
1138 if (deadlock(thd, blocker, 1, *thd->deadlock_search_depth_short) != WT_OK)
1139 {
1140 stop_waiting(thd);
1141 DBUG_RETURN(WT_DEADLOCK);
1142 }
1143 DBUG_RETURN(WT_OK);
1144 }
1145
1146 /**
1147 called by a *waiter* (thd) to start waiting
1148
1149 It's supposed to be a drop-in replacement for
1150 pthread_cond_timedwait(), and it takes mutex as an argument.
1151
1152 @return one of WT_TIMEOUT, WT_DEADLOCK, WT_OK
1153 */
wt_thd_cond_timedwait(WT_THD * thd,mysql_mutex_t * mutex)1154 int wt_thd_cond_timedwait(WT_THD *thd, mysql_mutex_t *mutex)
1155 {
1156 int ret= WT_TIMEOUT;
1157 struct timespec timeout;
1158 ulonglong before, after, starttime;
1159 WT_RESOURCE *rc= thd->waiting_for;
1160 DBUG_ENTER("wt_thd_cond_timedwait");
1161 DBUG_PRINT("wt", ("enter: thd=%s, rc=%p", thd->name, rc));
1162
1163 #ifndef DBUG_OFF
1164 if (rc->cond_mutex)
1165 DBUG_ASSERT(rc->cond_mutex == mutex);
1166 else
1167 rc->cond_mutex= mutex;
1168 mysql_mutex_assert_owner(mutex);
1169 #endif
1170
1171 before= starttime= my_getsystime();
1172
1173 #ifdef __WIN__
1174 /*
1175 only for the sake of Windows we distinguish between
1176 'before' and 'starttime':
1177
1178 my_getsystime() returns high-resolution value, that cannot be used for
1179 waiting (it doesn't follow system clock changes), but is good for time
1180 intervals.
1181
1182 GetSystemTimeAsFileTime() follows system clock, but is low-resolution
1183 and will result in lousy intervals.
1184 */
1185 GetSystemTimeAsFileTime((PFILETIME)&starttime);
1186 #endif
1187
1188 rc_wrlock(rc);
1189 if (rc->owners.elements == 0)
1190 ret= WT_OK;
1191 rc_unlock(rc);
1192
1193 set_timespec_time_nsec(timeout, starttime, (*thd->timeout_short)*1000ULL);
1194 if (ret == WT_TIMEOUT && !thd->killed)
1195 ret= mysql_cond_timedwait(&rc->cond, mutex, &timeout);
1196 if (ret == WT_TIMEOUT && !thd->killed)
1197 {
1198 int r= deadlock(thd, thd, 0, *thd->deadlock_search_depth_long);
1199 if (r == WT_FREE_TO_GO)
1200 ret= WT_OK;
1201 else if (r != WT_OK)
1202 ret= WT_DEADLOCK;
1203 else if (*thd->timeout_long > *thd->timeout_short)
1204 {
1205 set_timespec_time_nsec(timeout, starttime, (*thd->timeout_long)*1000ULL);
1206 if (!thd->killed)
1207 ret= mysql_cond_timedwait(&rc->cond, mutex, &timeout);
1208 }
1209 }
1210 after= my_getsystime();
1211 if (stop_waiting(thd) == WT_DEADLOCK) /* if we're killed */
1212 ret= WT_DEADLOCK;
1213 increment_wait_stats(after-before, ret);
1214 if (ret == WT_OK)
1215 increment_success_stats();
1216 DBUG_RETURN(ret);
1217 }
1218
1219 /**
1220 called by a *blocker* when it releases a resource
1221
1222 it's conceptually similar to pthread_cond_broadcast, and must be done
1223 under the same mutex as wt_thd_cond_timedwait().
1224
1225 @param resid a resource to release. 0 to release all resources
1226 */
1227
wt_thd_release(WT_THD * thd,const WT_RESOURCE_ID * resid)1228 void wt_thd_release(WT_THD *thd, const WT_RESOURCE_ID *resid)
1229 {
1230 uint i;
1231 DBUG_ENTER("wt_thd_release");
1232
1233 for (i= 0; i < thd->my_resources.elements; i++)
1234 {
1235 WT_RESOURCE *rc= *dynamic_element(&thd->my_resources, i, WT_RESOURCE**);
1236 if (!resid || (resid->type->compare(&rc->id, resid) == 0))
1237 {
1238 uint j;
1239
1240 rc_wrlock(rc);
1241 /*
1242 nobody's trying to free the resource now,
1243 as its owners[] array is not empty (at least thd must be there)
1244 */
1245 DBUG_ASSERT(rc->state == ACTIVE);
1246 for (j= 0; j < rc->owners.elements; j++)
1247 if (*dynamic_element(&rc->owners, j, WT_THD**) == thd)
1248 break;
1249 DBUG_ASSERT(j < rc->owners.elements);
1250 delete_dynamic_element(&rc->owners, j);
1251 if (rc->owners.elements == 0)
1252 {
1253 mysql_cond_broadcast(&rc->cond);
1254 #ifndef DBUG_OFF
1255 if (rc->cond_mutex)
1256 mysql_mutex_assert_owner(rc->cond_mutex);
1257 #endif
1258 }
1259 unlock_lock_and_free_resource(thd, rc);
1260 if (resid)
1261 {
1262 delete_dynamic_element(&thd->my_resources, i);
1263 DBUG_VOID_RETURN;
1264 }
1265 }
1266 }
1267 if (!resid)
1268 reset_dynamic(&thd->my_resources);
1269 DBUG_VOID_RETURN;
1270 }
1271
1272