1 /* Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License, version 2.0,
5    as published by the Free Software Foundation.
6 
7    This program is also distributed with certain software (including
8    but not limited to OpenSSL) that is licensed under separate terms,
9    as designated in a particular file or component or in included license
10    documentation.  The authors of MySQL hereby grant you an additional
11    permission to link the program and your derivative works with the
12    separately licensed software that they have included with MySQL.
13 
14    Without limiting anything contained in the foregoing, this file,
15    which is part of C Driver for MySQL (Connector/C), is also subject to the
16    Universal FOSS Exception, version 1.0, a copy of which can be found at
17    http://oss.oracle.com/licenses/universal-foss-exception.
18 
19    This program is distributed in the hope that it will be useful,
20    but WITHOUT ANY WARRANTY; without even the implied warranty of
21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22    GNU General Public License, version 2.0, for more details.
23 
24    You should have received a copy of the GNU General Public License
25    along with this program; if not, write to the Free Software
26    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
27 
28 /**
29   @file
30 
31   "waiting threads" subsystem - a unified interface for threads to wait
32   on each other, with built-in deadlock detection.
33 
34   Main concepts
35   ^^^^^^^^^^^^^
36   a thread - is represented by a WT_THD structure. One physical thread
37     can have only one WT_THD descriptor at any given moment.
38 
39   a resource - a thread does not wait for other threads directly,
40     instead it waits for a "resource", which is "owned" by other threads.
41     It waits, exactly, for all "owners" to "release" a resource.
42     It does not have to correspond to a physical resource. For example, it
43     may be convenient in certain cases to force resource == thread.
44     A resource is represented by a WT_RESOURCE structure.
45 
46   a resource identifier - a pair of {resource type, value}. A value is
47     an ulonglong number. Represented by a WT_RESOURCE_ID structure.
48 
49   a resource type - a pointer to a statically defined instance of
50     WT_RESOURCE_TYPE structure. This structure contains a pointer to
51     a function that knows how to compare values of this resource type.
52     In the simple case it could be wt_resource_id_memcmp().
53 
  a wait-for graph - a graph that represents "wait-for" relationships.
55     It has two types of nodes - threads and resources. There are directed
56     edges from a thread to a resource it is waiting for (WT_THD::waiting_for),
57     from a thread to resources that it "owns" (WT_THD::my_resources),
58     and from a resource to threads that "own" it (WT_RESOURCE::owners)
59 
60   Graph completeness
61   ^^^^^^^^^^^^^^^^^^
62 
63   For flawless deadlock detection wait-for graph must be complete.
64   It means that when a thread starts waiting it needs to know *all* its
65   blockers, and call wt_thd_will_wait_for() for every one of them.
66   Otherwise two phenomena should be expected:
67 
68   1. Fuzzy timeouts:
69 
70     thread A needs to get a lock, and is blocked by a thread B.
71     it waits.
72     Just before the timeout thread B releases the lock.
73     thread A is ready to grab the lock but discovers that it is also
74     blocked by a thread C.
75     It waits and times out.
76 
77     As a result thread A has waited two timeout intervals, instead of one.
78 
79   2. Unreliable cycle detection:
80 
81      Thread A waits for threads B and C
82      Thread C waits for D
83      Thread D wants to start waiting for A
84 
85      one can see immediately that thread D creates a cycle, and thus
86      a deadlock is detected.
87 
88      But if thread A would only wait for B, and start waiting for C
89      when B would unlock, thread D would be allowed to wait, a deadlock
90      would be only detected when B unlocks or somebody times out.
91 
  These two phenomena don't affect correctness, and strictly speaking,
93   the caller is not required to call wt_thd_will_wait_for() for *all*
94   blockers - it may optimize wt_thd_will_wait_for() calls. But they
95   may be perceived as bugs by users, it must be understood that such
96   an optimization comes with its price.
97 
98   Usage
99   ^^^^^
100 
101   First, the wt* subsystem must be initialized by calling
102   wt_init(). In the server you don't need to do it, it's done
103   in mysqld.cc.
104 
105   Similarly, wt_end() frees wt* structures, should be called
106   at the end, but in the server mysqld.cc takes care of that.
107 
108   Every WT_THD should be initialized with wt_thd_lazy_init().
109   After that they can be used in other wt_thd_* calls.
110   Before discarding, WT_THD should be free'd with
111   wt_thd_destroy(). In the server both are handled in sql_class.cc,
112   it's an error to try to do it manually.
113 
114   To use the deadlock detection one needs to use this thread's WT_THD,
115   call wt_thd_will_wait_for() for every thread it needs to wait on,
116   then call wt_thd_cond_timedwait(). When thread releases a resource
117   it should call wt_thd_release() (or wt_thd_release_all()) - it will
118   notify (send a signal) threads waiting in wt_thd_cond_timedwait(),
119   if appropriate.
120 
121   Just like with pthread's cond_wait, there could be spurious
122   wake-ups from wt_thd_cond_timedwait(). A caller is expected to
123   handle that (that is, to re-check the blocking criteria).
124 
125   wt_thd_will_wait_for() and wt_thd_cond_timedwait() return either
126   WT_OK or WT_DEADLOCK. Additionally wt_thd_cond_timedwait() can return
127   WT_TIMEOUT. Out of memory and other fatal errors are reported as
128   WT_DEADLOCK - and a transaction must be aborted just the same.
129 
130   Configuration
131   ^^^^^^^^^^^^^
132   There are four config variables. Two deadlock search depths - short and
133   long - and two timeouts. Deadlock search is performed with the short
134   depth on every wt_thd_will_wait_for() call. wt_thd_cond_timedwait()
135   waits with a short timeout, performs a deadlock search with the long
136   depth, and waits with a long timeout. As most deadlock cycles are supposed
137   to be short, most deadlocks will be detected at once, and waits will
138   rarely be necessary.
139 
140   These config variables are thread-local. Different threads may have
141   different search depth and timeout values.
142 
143   Also, deadlock detector supports different killing strategies, the victim
144   in a deadlock cycle is selected based on the "weight". See "weight"
145   description in waiting_threads.h for details. It's up to the caller to
146   set weights accordingly.
147 
148   Status
149   ^^^^^^
  We calculate the number of successful waits (WT_OK returned from
151   wt_thd_cond_timedwait()), a number of timeouts, a deadlock cycle
152   length distribution - number of deadlocks with every length from
153   1 to WT_CYCLE_STATS, and a wait time distribution - number
154   of waits with a time from 1 us to 1 min in WT_WAIT_STATS
155   intervals on a log e scale.
156 
157   Sample usage as was done in the Maria engine
158   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
159   - in class THD, THD::transaction had a WT_THD object; there were session
160   variables to set the short/long depth/timeout.
161   - when mysqld started, wt_init() was called; when it ended, wt_end() was
162   called.
163   - in THD's constructor,
164   wt_thd_lazy_init(&transaction.wt, &variables.wt_deadlock_search_depth_short,
165     &variables.wt_timeout_short,
166     &variables.wt_deadlock_search_depth_long,
167     &variables.wt_timeout_long);
168   - in THD::cleanup():
169   wt_thd_destroy(&transaction.wt);
170   - this was sufficient to make the deadlock-detector available to the Maria
171   engine (which can grab THD); the engine used it this way:
172   - when it wrote a row, and hit a duplicate key, it would find who wrote this
173   key, the "blocker" transaction. If "blocker" had committed, duplicate key
174   error would be sent. Otherwise, we would wait for it, in the following code
175   snippet (originally from storage/maria/ma_write.c). After the blocker is
176   gone, we would retry the write:
177 
178         while (keyinfo->ck_insert(info,
179                  (*keyinfo->make_key)(info, &int_key, i, buff, record,
180                                       filepos, info->trn->trid)))
181         {
182           / * we got a write error * /
183           if error is not "duplicate key" then return error;
184           info->dup_key_trid has the culprit:
185           if the culprit is ourselves then return error;
186           otherwise:
187           blocker= trnman_trid_to_trn(info->trn, info->dup_key_trid);
188           / *
189             if blocker TRN was not found, it means that the conflicting
190             transaction was committed long time ago. It could not be
191             aborted, as it would have to wait on the key tree lock
192             to remove the conflicting key it has inserted.
193           * /
194           if (!blocker || blocker->commit_trid != ~(TrID)0)
195           { / * committed * /
196             if (blocker)
197               pthread_mutex_unlock(& blocker->state_lock);
198             rw_unlock(&keyinfo->root_lock);
199             goto err;
200           }
201           / * release root_lock to let blocker finish its work * /
202           rw_unlock(&keyinfo->root_lock);
203           {
204             / * running. now we wait * /
205             WT_RESOURCE_ID rc;
206             int res;
207             const char *old_proc_info;
208 
209             rc.type= &ma_rc_dup_unique;
210             rc.value= (intptr)blocker;
211             res= wt_thd_will_wait_for(info->trn->wt, blocker->wt, & rc);
212             if (res != WT_OK)
213             {
214               pthread_mutex_unlock(& blocker->state_lock);
215               my_errno= HA_ERR_LOCK_DEADLOCK;
216               goto err;
217             }
218             old_proc_info= proc_info_hook(0,
219                                           "waiting for a resource",
220                                           __func__, __FILE__, __LINE__);
221             res= wt_thd_cond_timedwait(info->trn->wt, & blocker->state_lock);
222             proc_info_hook(0, old_proc_info, __func__, __FILE__, __LINE__);
223 
224             pthread_mutex_unlock(& blocker->state_lock);
225             if (res != WT_OK)
226             {
227               my_errno= res == WT_TIMEOUT ? HA_ERR_LOCK_WAIT_TIMEOUT
228                                           : HA_ERR_LOCK_DEADLOCK;
229               goto err;
230             }
231             / * if we come here, blocker has rolled back or committed,
232             so is gone, we can retry the ck_insert() * /
233           }
234           rw_wrlock(&keyinfo->root_lock);
235 #ifndef MARIA_CANNOT_ROLLBACK
236           keyinfo->version++;
237 #endif
238         }
239   - ma_rc_dup_unique was:
240   / * a WT_RESOURCE_TYPE for transactions waiting on a unique key conflict * /
241   WT_RESOURCE_TYPE ma_rc_dup_unique={ wt_resource_id_memcmp, 0};
242   - When a Maria transaction would commit or rollback it would call:
243   / * Wake up threads waiting for this transaction * /
244   static void wt_thd_release_self(TRN *trn)
245   {
246     if (trn->wt)
247     {
248       WT_RESOURCE_ID rc;
249       rc.type= &ma_rc_dup_unique;
250       rc.value= (intptr)trn;
251       wt_thd_release(trn->wt, & rc);
252       trn->wt= 0;
253     }
254   }
255 
256   Tests
257   ^^^^^
258   unittest/mysys/waiting_threads-t.c, currently disabled.
259 */
260 
261 /*
  Note that if your lock system satisfies the following condition:
263 
    there exist four lock levels A, B, C, D, such that
265       A is compatible with B
266       A is not compatible with C
267       D is not compatible with B
268 
269       (example A=IX, B=IS, C=S, D=X)
270 
271    you need to include lock level in the resource identifier - a
272    thread waiting for lock of the type A on resource R and another
273    thread waiting for lock of the type B on resource R should wait on
274    different WT_RESOURCE structures, on different {lock, resource}
275    pairs.  Otherwise the following is possible:
276 
277       thread1> take S-lock on R
278       thread2> take IS-lock on R
279       thread3> wants X-lock on R, starts waiting for threads 1 and 2 on R.
280       thread3 is killed (or timeout or whatever)
281       WT_RESOURCE structure for R is still in the hash, as it has two owners
282       thread4> wants an IX-lock on R
283       WT_RESOURCE for R is found in the hash, thread4 starts waiting on it.
284       !! now thread4 is waiting for both thread1 and thread2
285       !! while, in fact, IX-lock and IS-lock are compatible and
286       !! thread4 should not wait for thread2.
287 */
288 
289 #include <waiting_threads.h>
290 #include <m_string.h>
291 
292 /* status variables */
293 
294 /**
295   preset table of wait intervals
296 */
297 ulonglong wt_wait_table[WT_WAIT_STATS];
298 /**
299   wait time distribution (log e scale)
300 */
301 uint32 wt_wait_stats[WT_WAIT_STATS+1];
302 /**
303   distribution of cycle lengths
304   first column tells whether this was during short or long detection
305 */
306 uint32 wt_cycle_stats[2][WT_CYCLE_STATS+1];
307 uint32 wt_success_stats;
308 
309 static my_atomic_rwlock_t
310   cycle_stats_lock MY_ATTRIBUTE((unused)),
311   wait_stats_lock MY_ATTRIBUTE((unused)),
312   success_stats_lock MY_ATTRIBUTE((unused));
313 
314 #ifdef SAFE_STATISTICS
315 #define incr(VAR, LOCK)                           \
316   do {                                            \
317     my_atomic_rwlock_wrlock(&(LOCK));             \
318     my_atomic_add32(&(VAR), 1);                   \
319     my_atomic_rwlock_wrunlock(&(LOCK));           \
320   } while(0)
321 #else
322 #define incr(VAR,LOCK)  do { (VAR)++; } while(0)
323 #endif
324 
increment_success_stats()325 static void increment_success_stats()
326 {
327   incr(wt_success_stats, success_stats_lock);
328 }
329 
/**
  Record one detected deadlock cycle in the length histogram.

  @param depth  cycle length; lengths of WT_CYCLE_STATS and above all
                share the last bucket
  @param slot   which row of wt_cycle_stats[][] to update - per the
                declaration, 0 for short-depth and 1 for long-depth searches
*/
static void increment_cycle_stats(uint depth, uint slot)
{
  uint bucket= depth < WT_CYCLE_STATS ? depth : WT_CYCLE_STATS;
  incr(wt_cycle_stats[slot][bucket], cycle_stats_lock);
}
336 
/**
  Record one wait in the wait-time histogram.

  @param waited  elapsed wait time; divided by 10 before comparing against
                 the microsecond-scale wt_wait_table[] (so presumably in
                 units of 100ns - TODO confirm against the caller)
  @param ret     result of the wait; ETIMEDOUT goes to the dedicated
                 overflow slot WT_WAIT_STATS
*/
static void increment_wait_stats(ulonglong waited, int ret)
{
  uint bucket= WT_WAIT_STATS;             /* timeout slot by default */
  if (ret != ETIMEDOUT)
  {
    ulonglong waited_us= waited/10;
    /* find the first bucket whose upper bound covers this wait */
    for (bucket= 0;
         bucket < WT_WAIT_STATS && waited_us > wt_wait_table[bucket];
         bucket++)
      ;
  }
  incr(wt_wait_stats[bucket], wait_stats_lock);
}
346 
347 /*
348   'lock' protects 'owners', 'state', and 'waiter_count'
349   'id' is read-only
350 
351   a resource is picked up from a hash in a lock-free manner
352   it's returned pinned, so it cannot be freed at once
353   but it may be freed right after the pin is removed
354   to free a resource it should
355     1. have no owners
356     2. have no waiters
357 
358   two ways to access a resource:
359     1. find it in a hash
360        - it's returned pinned.
361         a) take a lock in exclusive mode
362         b) check the state, it should be ACTIVE to be usable
363         c) unpin
364     2. by a direct reference
       - can only be used if a resource cannot be freed
366        e.g. accessing a resource by thd->waiting_for is safe,
367        a resource cannot be freed as there's a thread waiting for it
368 */
369 struct st_wt_resource {
370   WT_RESOURCE_ID  id;
371   uint            waiter_count;
372   enum { ACTIVE, FREE } state;
373 #ifndef DBUG_OFF
374   mysql_mutex_t  *cond_mutex; /* a mutex for the 'cond' below */
375 #endif
376   /*
377     before the 'lock' all elements are mutable, after (and including) -
378     immutable in the sense that lf_hash_insert() won't memcpy() over them.
379     See wt_init().
380   */
381 #ifdef WT_RWLOCKS_USE_MUTEXES
382   /*
383     we need a special rwlock-like 'lock' to allow readers bypass
384     waiting writers, otherwise readers can deadlock. For example:
385 
386       A waits on resource x, owned by B, B waits on resource y, owned
387       by A, we have a cycle (A->x->B->y->A)
388       Both A and B start deadlock detection:
389 
390         A locks x                          B locks y
391         A goes deeper                      B goes deeper
392         A locks y                          B locks x
393 
394       with mutexes it would deadlock. With rwlocks it won't, as long
395       as both A and B are taking read locks (and they do).
396       But other threads may take write locks. Assume there's
397       C who wants to start waiting on x, and D who wants to start
398       waiting on y.
399 
400         A read-locks x                       B read-locks y
401         A goes deeper                        B goes deeper
402      => C write-locks x (to add a new edge)  D write-locks y
403      .. C is blocked                         D is blocked
404         A read-locks y                       B read-locks x
405 
406       Now, if a read lock can bypass a pending wrote lock request, we're fine.
407       If it can not, we have a deadlock.
408 
409     writer starvation is technically possible, but unlikely, because
410     the contention is expected to be low.
411   */
412   struct {
413     mysql_cond_t   cond;
414     mysql_mutex_t  mutex;
415     uint readers: 16;
416     uint pending_writers: 15;
417     uint write_locked: 1;
418   } lock;
419 #else
420   rw_lock_t lock;
421 #endif
422   mysql_cond_t     cond; /* the corresponding mutex is provided by the caller */
423   DYNAMIC_ARRAY    owners;
424 };
425 
426 #ifdef  WT_RWLOCKS_USE_MUTEXES
rc_rwlock_init(WT_RESOURCE * rc)427 static void rc_rwlock_init(WT_RESOURCE *rc)
428 {
429   mysql_cond_init(0, &rc->lock.cond, 0);
430   mysql_mutex_init(0, &rc->lock.mutex, MY_MUTEX_INIT_FAST);
431 }
rc_rwlock_destroy(WT_RESOURCE * rc)432 static void rc_rwlock_destroy(WT_RESOURCE *rc)
433 {
434   DBUG_ASSERT(rc->lock.write_locked == 0);
435   DBUG_ASSERT(rc->lock.readers == 0);
436   mysql_cond_destroy(&rc->lock.cond);
437   mysql_mutex_destroy(&rc->lock.mutex);
438 }
rc_rdlock(WT_RESOURCE * rc)439 static void rc_rdlock(WT_RESOURCE *rc)
440 {
441   DBUG_PRINT("wt", ("TRYLOCK resid=%ld for READ", (ulong)rc->id.value));
442   mysql_mutex_lock(&rc->lock.mutex);
443   while (rc->lock.write_locked)
444     mysql_cond_wait(&rc->lock.cond, &rc->lock.mutex);
445   rc->lock.readers++;
446   mysql_mutex_unlock(&rc->lock.mutex);
447   DBUG_PRINT("wt", ("LOCK resid=%ld for READ", (ulong)rc->id.value));
448 }
rc_wrlock(WT_RESOURCE * rc)449 static void rc_wrlock(WT_RESOURCE *rc)
450 {
451   DBUG_PRINT("wt", ("TRYLOCK resid=%ld for WRITE", (ulong)rc->id.value));
452   mysql_mutex_lock(&rc->lock.mutex);
453   while (rc->lock.write_locked || rc->lock.readers)
454     mysql_cond_wait(&rc->lock.cond, &rc->lock.mutex);
455   rc->lock.write_locked= 1;
456   mysql_mutex_unlock(&rc->lock.mutex);
457   DBUG_PRINT("wt", ("LOCK resid=%ld for WRITE", (ulong)rc->id.value));
458 }
rc_unlock(WT_RESOURCE * rc)459 static void rc_unlock(WT_RESOURCE *rc)
460 {
461   DBUG_PRINT("wt", ("UNLOCK resid=%ld", (ulong)rc->id.value));
462   mysql_mutex_lock(&rc->lock.mutex);
463   if (rc->lock.write_locked)
464   {
465     rc->lock.write_locked= 0;
466     mysql_cond_broadcast(&rc->lock.cond);
467   }
468   else if (--rc->lock.readers == 0)
469     mysql_cond_broadcast(&rc->lock.cond);
470   mysql_mutex_unlock(&rc->lock.mutex);
471 }
472 #else
rc_rwlock_init(WT_RESOURCE * rc)473 static void rc_rwlock_init(WT_RESOURCE *rc)
474 {
475   my_rwlock_init(&rc->lock, 0);
476 }
rc_rwlock_destroy(WT_RESOURCE * rc)477 static void rc_rwlock_destroy(WT_RESOURCE *rc)
478 {
479   rwlock_destroy(&rc->lock);
480 }
rc_rdlock(WT_RESOURCE * rc)481 static void rc_rdlock(WT_RESOURCE *rc)
482 {
483   DBUG_PRINT("wt", ("TRYLOCK resid=%ld for READ", (ulong)rc->id.value));
484   rw_rdlock(&rc->lock);
485   DBUG_PRINT("wt", ("LOCK resid=%ld for READ", (ulong)rc->id.value));
486 }
rc_wrlock(WT_RESOURCE * rc)487 static void rc_wrlock(WT_RESOURCE *rc)
488 {
489   DBUG_PRINT("wt", ("TRYLOCK resid=%ld for WRITE", (ulong)rc->id.value));
490   rw_wrlock(&rc->lock);
491   DBUG_PRINT("wt", ("LOCK resid=%ld for WRITE", (ulong)rc->id.value));
492 }
rc_unlock(WT_RESOURCE * rc)493 static void rc_unlock(WT_RESOURCE *rc)
494 {
495   DBUG_PRINT("wt", ("UNLOCK resid=%ld", (ulong)rc->id.value));
496   rw_unlock(&rc->lock);
497 }
498 #endif
499 
500 /*
501   All resources are stored in a lock-free hash. Different threads
502   may add new resources and perform deadlock detection concurrently.
503 */
504 static LF_HASH      reshash;
505 
506 /**
507   WT_RESOURCE constructor
508 
509   It's called from lf_hash and takes a pointer to an LF_SLIST instance.
510   WT_RESOURCE is located at arg+sizeof(LF_SLIST)
511 */
wt_resource_init(uchar * arg)512 static void wt_resource_init(uchar *arg)
513 {
514   WT_RESOURCE *rc= (WT_RESOURCE*)(arg+LF_HASH_OVERHEAD);
515   DBUG_ENTER("wt_resource_init");
516 
517   memset(rc, 0, sizeof(*rc));
518   rc_rwlock_init(rc);
519   mysql_cond_init(0, &rc->cond, 0);
520   my_init_dynamic_array(&rc->owners, sizeof(WT_THD *), 0, 5);
521   DBUG_VOID_RETURN;
522 }
523 
524 /**
525   WT_RESOURCE destructor
526 
527   It's called from lf_hash and takes a pointer to an LF_SLIST instance.
528   WT_RESOURCE is located at arg+sizeof(LF_SLIST)
529 */
wt_resource_destroy(uchar * arg)530 static void wt_resource_destroy(uchar *arg)
531 {
532   WT_RESOURCE *rc= (WT_RESOURCE*)(arg+LF_HASH_OVERHEAD);
533   DBUG_ENTER("wt_resource_destroy");
534 
535   DBUG_ASSERT(rc->owners.elements == 0);
536   rc_rwlock_destroy(rc);
537   mysql_cond_destroy(&rc->cond);
538   delete_dynamic(&rc->owners);
539   DBUG_VOID_RETURN;
540 }
541 
wt_init()542 void wt_init()
543 {
544   DBUG_ENTER("wt_init");
545   DBUG_ASSERT(reshash.alloc.constructor != wt_resource_init);
546 
547   lf_hash_init(&reshash, sizeof(WT_RESOURCE), LF_HASH_UNIQUE, 0,
548                sizeof_WT_RESOURCE_ID, 0, 0);
549   reshash.alloc.constructor= wt_resource_init;
550   reshash.alloc.destructor= wt_resource_destroy;
551   /*
552     Note a trick: we initialize the hash with the real element size,
553     but fix it later to a shortened element size. This way
554     the allocator will allocate elements correctly, but
555     lf_hash_insert() will only overwrite part of the element with memcpy().
556     lock, condition, and dynamic array will be intact.
557   */
558   reshash.element_size= offsetof(WT_RESOURCE, lock);
559   memset(wt_wait_stats, 0, sizeof(wt_wait_stats));
560   memset(wt_cycle_stats, 0, sizeof(wt_cycle_stats));
561   wt_success_stats= 0;
562   { /* initialize wt_wait_table[]. from 1 us to 1 min, log e scale */
563     int i;
564     double from= log(1);   /* 1 us */
565     double to= log(60e6);  /* 1 min */
566     for (i= 0; i < WT_WAIT_STATS; i++)
567     {
568       wt_wait_table[i]= (ulonglong)exp((to-from)/(WT_WAIT_STATS-1)*i+from);
569       DBUG_ASSERT(i == 0 || wt_wait_table[i-1] != wt_wait_table[i]);
570     }
571   }
572   my_atomic_rwlock_init(&cycle_stats_lock);
573   my_atomic_rwlock_init(&success_stats_lock);
574   my_atomic_rwlock_init(&wait_stats_lock);
575   DBUG_VOID_RETURN;
576 }
577 
wt_end()578 void wt_end()
579 {
580   DBUG_ENTER("wt_end");
581 
582   DBUG_ASSERT(reshash.count == 0);
583   lf_hash_destroy(&reshash);
584   my_atomic_rwlock_destroy(&cycle_stats_lock);
585   my_atomic_rwlock_destroy(&success_stats_lock);
586   my_atomic_rwlock_destroy(&wait_stats_lock);
587   DBUG_VOID_RETURN;
588 }
589 
590 /**
591   Lazy WT_THD initialization
592 
593   Cheap initialization of WT_THD. Only initialize fields that don't require
594   memory allocations - basically, it only does assignments. The rest of the
595   WT_THD structure will be initialized on demand, on the first use.
596   This allows one to initialize lazily all WT_THD structures, even if some
597   (or even most) of them will never be used for deadlock detection.
598 
599   @param ds     a pointer to deadlock search depth short value
600   @param ts     a pointer to deadlock timeout short value
601   @param dl     a pointer to deadlock search depth long value
602   @param tl     a pointer to deadlock timeout long value
603 
604   @note these are pointers to values, and WT_THD stores them as pointers.
605   It allows one later to change search depths and timeouts for existing
606   threads. It also means that the pointers must stay valid for the lifetime
607   of WT_THD.
608 */
wt_thd_lazy_init(WT_THD * thd,const ulong * ds,const ulong * ts,const ulong * dl,const ulong * tl)609 void wt_thd_lazy_init(WT_THD *thd, const ulong *ds, const ulong *ts,
610                                    const ulong *dl, const ulong *tl)
611 {
612   DBUG_ENTER("wt_thd_lazy_init");
613   thd->waiting_for= 0;
614   thd->weight= 0;
615   thd->deadlock_search_depth_short= ds;
616   thd->timeout_short= ts;
617   thd->deadlock_search_depth_long= dl;
618   thd->timeout_long= tl;
619   /* dynamic array is also initialized lazily - without memory allocations */
620   my_init_dynamic_array(&thd->my_resources, sizeof(WT_RESOURCE *), 0, 5);
621 #ifndef DBUG_OFF
622   thd->name= my_thread_name();
623 #endif
624   DBUG_VOID_RETURN;
625 }
626 
627 /**
628   Finalize WT_THD initialization
629 
630   After lazy WT_THD initialization, parts of the structure are still
631   uninitialized. This function completes the initialization, allocating
632   memory, if necessary. It's called automatically on demand, when WT_THD
633   is about to be used.
634 */
static int fix_thd_pins(WT_THD *thd)
{
  if (thd->pins != 0)                  /* fast path: already initialized */
    return 0;
  thd->pins= lf_hash_get_pins(&reshash);
#ifndef DBUG_OFF
  thd->name= my_thread_name();
#endif
  return thd->pins == 0;               /* non-zero: pin allocation failed */
}
646 
wt_thd_destroy(WT_THD * thd)647 void wt_thd_destroy(WT_THD *thd)
648 {
649   DBUG_ENTER("wt_thd_destroy");
650 
651   DBUG_ASSERT(thd->my_resources.elements == 0);
652   DBUG_ASSERT(thd->waiting_for == 0);
653 
654   if (thd->pins != 0)
655     lf_hash_put_pins(thd->pins);
656 
657   delete_dynamic(&thd->my_resources);
658   DBUG_VOID_RETURN;
659 }
660 /**
661   Trivial resource id comparison function - bytewise memcmp.
662 
663   It can be used in WT_RESOURCE_TYPE structures where bytewise
664   comparison of values is sufficient.
665 */
wt_resource_id_memcmp(const void * a,const void * b)666 my_bool wt_resource_id_memcmp(const void *a, const void *b)
667 {
668   /* we use the fact that there's no padding in the middle of WT_RESOURCE_ID */
669   compile_time_assert(offsetof(WT_RESOURCE_ID, type) == sizeof(ulonglong));
670   return memcmp(a, b, sizeof_WT_RESOURCE_ID);
671 }
672 
673 /**
674   arguments for the recursive deadlock_search function
675 */
676 struct deadlock_arg {
677   WT_THD * const thd;          /**< starting point of a search */
678   uint const max_depth;        /**< search depth limit */
679   WT_THD *victim;              /**< a thread to be killed to resolve a deadlock */
680   WT_RESOURCE *last_locked_rc; /**< see comment at the end of deadlock_search() */
681 };
682 
683 /**
684   helper function to change the victim, according to the weight
685 */
change_victim(WT_THD * found,struct deadlock_arg * arg)686 static void change_victim(WT_THD* found, struct deadlock_arg *arg)
687 {
688   if (found->weight < arg->victim->weight)
689   {
690     if (arg->victim != arg->thd)
691     {
692       rc_unlock(arg->victim->waiting_for); /* release the previous victim */
693       DBUG_ASSERT(arg->last_locked_rc == found->waiting_for);
694     }
695     arg->victim= found;
696     arg->last_locked_rc= 0;
697   }
698 }
699 
700 /**
701   recursive loop detection in a wait-for graph with a limited search depth
702 */
deadlock_search(struct deadlock_arg * arg,WT_THD * blocker,uint depth)703 static int deadlock_search(struct deadlock_arg *arg, WT_THD *blocker,
704                            uint depth)
705 {
706   WT_RESOURCE *rc, *volatile *shared_ptr= &blocker->waiting_for;
707   WT_THD *cursor;
708   uint i;
709   int ret= WT_OK;
710   DBUG_ENTER("deadlock_search");
711   DBUG_PRINT("wt", ("enter: thd=%s, blocker=%s, depth=%u",
712                     arg->thd->name, blocker->name, depth));
713 
714   LF_REQUIRE_PINS(1);
715 
716   arg->last_locked_rc= 0;
717 
718   if (depth > arg->max_depth)
719   {
720     DBUG_PRINT("wt", ("exit: WT_DEPTH_EXCEEDED (early)"));
721     DBUG_RETURN(WT_DEPTH_EXCEEDED);
722   }
723 
724 retry:
725   /*
726     safe dereference as explained in lf_alloc-pin.c
727     (in short: protects against lf_alloc_free() in lf_hash_delete())
728   */
729   do
730   {
731     rc= *shared_ptr;
732     lf_pin(arg->thd->pins, 0, rc);
733   } while (rc != *shared_ptr && LF_BACKOFF);
734 
735   if (rc == 0)
736   {
737     DBUG_PRINT("wt", ("exit: OK (early)"));
738     DBUG_RETURN(0);
739   }
740 
741   rc_rdlock(rc);
742   if (rc->state != ACTIVE || *shared_ptr != rc)
743   {
744     /* blocker is not waiting on this resource anymore */
745     rc_unlock(rc);
746     lf_unpin(arg->thd->pins, 0);
747     goto retry;
748   }
749   /* as the state is locked, we can unpin now */
750   lf_unpin(arg->thd->pins, 0);
751 
752   /*
753     Below is not a pure depth-first search. It's a depth-first with a
754     slightest hint of breadth-first. Depth-first is:
755 
756       check(element, X):
757         foreach current in element->nodes[] do:
758           if current == X return error;
759           check(current, X);
760 
761     while we do
762 
763       check(element, X):
764         foreach current in element->nodes[] do:
765           if current == X return error;
766         foreach current in element->nodes[] do:
767           check(current, X);
768 
769     preferring shorter deadlocks over longer ones.
770   */
771   for (i= 0; i < rc->owners.elements; i++)
772   {
773     cursor= *dynamic_element(&rc->owners, i, WT_THD**);
774     /*
775       We're only looking for (and detecting) cycles that include 'arg->thd'.
776       That is, only deadlocks that *we* have created. For example,
777         thd->A->B->thd
778       (thd waits for A, A waits for B, while B is waiting for thd).
779       While walking the graph we can encounter other cicles, e.g.
780         thd->A->B->C->A
781       This will not be detected. Instead we will walk it in circles until
782       the search depth limit is reached (the latter guarantees that an
783       infinite loop is impossible). We expect the thread that has created
784       the cycle (one of A, B, and C) to detect its deadlock.
785     */
786     if (cursor == arg->thd)
787     {
788       ret= WT_DEADLOCK;
789       increment_cycle_stats(depth, arg->max_depth ==
790                                    *arg->thd->deadlock_search_depth_long);
791       arg->victim= cursor;
792       goto end;
793     }
794   }
795   for (i= 0; i < rc->owners.elements; i++)
796   {
797     cursor= *dynamic_element(&rc->owners, i, WT_THD**);
798     switch (deadlock_search(arg, cursor, depth+1)) {
799     case WT_OK:
800       break;
801     case WT_DEPTH_EXCEEDED:
802       ret= WT_DEPTH_EXCEEDED;
803       break;
804     case WT_DEADLOCK:
805       ret= WT_DEADLOCK;
806       change_victim(cursor, arg);       /* also sets arg->last_locked_rc to 0 */
807       i= rc->owners.elements;           /* jump out of the loop */
808       break;
809     default:
810       DBUG_ASSERT(0);
811     }
812     if (arg->last_locked_rc)
813       rc_unlock(arg->last_locked_rc);
814   }
815 end:
816   /*
817     Note that 'rc' is locked in this function, but it's never unlocked here.
818     Instead it's saved in arg->last_locked_rc and the *caller* is
819     expected to unlock it.  It's done to support different killing
820     strategies. This is how it works:
821     Assuming a graph
822 
823       thd->A->B->C->thd
824 
825     deadlock_search() function starts from thd, locks it (in fact it locks not
826     a thd, but a resource it is waiting on, but below, for simplicity, I'll
827     talk about "locking a thd"). Then it goes down recursively, locks A, and so
828     on. Goes down recursively, locks B. Goes down recursively, locks C.
829     Notices that C is waiting on thd. Deadlock detected. Sets arg->victim=thd.
830     Returns from the last deadlock_search() call. C stays locked!
831     Now it checks whether C is a more appropriate victim than 'thd'.
832     If yes - arg->victim=C, otherwise C is unlocked. Returns. B stays locked.
833     Now it checks whether B is a more appropriate victim than arg->victim.
834     If yes - old arg->victim is unlocked and arg->victim=B,
835     otherwise B is unlocked. Return.
836     And so on.
837 
838     In short, a resource is locked in a frame. But it's not unlocked in the
839     same frame, it's unlocked by the caller, and only after the caller checks
840     that it doesn't need to use current WT_THD as a victim. If it does - the
841     lock is kept and the old victim's resource is unlocked. When the recursion
842     is unrolled and we are back to deadlock() function, there are only two
843     locks left - on thd and on the victim.
844   */
845   arg->last_locked_rc= rc;
846   DBUG_PRINT("wt", ("exit: %s",
847                     ret == WT_DEPTH_EXCEEDED ? "WT_DEPTH_EXCEEDED" :
848                     ret ? "WT_DEADLOCK" : "OK"));
849   DBUG_RETURN(ret);
850 }
851 
852 /**
853   Deadlock detection in a wait-for graph
854 
855   A wrapper for recursive deadlock_search() - prepares deadlock_arg structure,
856   invokes deadlock_search(), increments statistics, notifies the victim.
857 
858   @param thd            thread that is going to wait. Deadlock is detected
859                         if, while walking the graph, we reach a thread that
860                         is waiting on thd
861   @param blocker        starting point of a search. In wt_thd_cond_timedwait()
862                         it's thd, in wt_thd_will_wait_for() it's a thread that
863                         thd is going to wait for
864   @param depth          starting search depth. In general it's the number of
865                         edges in the wait-for graph between thd and the
866                         blocker. Practically only two values are used (and
867                         supported) - when thd == blocker it's 0, when thd
868                         waits directly for blocker, it's 1
869   @param max_depth      search depth limit
870 */
static int deadlock(WT_THD *thd, WT_THD *blocker, uint depth,
                            uint max_depth)
{
  /* victim and last_locked_rc start out as 0; deadlock_search() fills them */
  struct deadlock_arg arg= {thd, max_depth, 0, 0};
  int ret;
  DBUG_ENTER("deadlock");
  DBUG_ASSERT(depth < 2);
  ret= deadlock_search(&arg, blocker, depth);
  if (ret == WT_DEPTH_EXCEEDED)
  {
    /*
      The search was cut short by the depth limit: optimistically assume
      there is no deadlock, record the truncated-search statistics, and
      report success to the caller.
    */
    increment_cycle_stats(WT_CYCLE_STATS, max_depth ==
                                          *thd->deadlock_search_depth_long);
    ret= WT_OK;
  }
  /*
    if we started with depth==1, blocker was never considered for a victim
    in deadlock_search(). Do it here.
  */
  if (ret == WT_DEADLOCK && depth)
    change_victim(blocker, &arg);
  if (arg.last_locked_rc)
  {
    /*
      Special return code if there's nobody to wait for.

      depth == 0 means that we start the search from thd (thd == blocker).
      ret == WT_OK means that no cycle was found and
        arg.last_locked_rc == thd->waiting_for.
      and arg.last_locked_rc->owners.elements == 0 means that
        (applying the rule above) thd->waiting_for->owners.elements == 0,
        and thd doesn't have anybody to wait for.
    */
    if (depth == 0 && ret == WT_OK && arg.last_locked_rc->owners.elements == 0)
    {
      DBUG_ASSERT(thd == blocker);
      DBUG_ASSERT(arg.last_locked_rc == thd->waiting_for);
      ret= WT_FREE_TO_GO;
    }
    /* deadlock_search() leaves its last resource locked - release it here */
    rc_unlock(arg.last_locked_rc);
  }
  /* notify the victim, if appropriate */
  if (ret == WT_DEADLOCK && arg.victim != thd)
  {
    /*
      Somebody else was chosen as the victim: mark it killed, wake it up,
      release the lock on its resource (kept locked by deadlock_search(),
      see the comment at its 'end:' label), and report WT_OK so that the
      calling thread can continue waiting.
    */
    DBUG_PRINT("wt", ("killing %s", arg.victim->name));
    arg.victim->killed= 1;
    mysql_cond_broadcast(&arg.victim->waiting_for->cond);
    rc_unlock(arg.victim->waiting_for);
    ret= WT_OK;
  }
  DBUG_RETURN(ret);
}
922 
923 
924 /**
925   Delete an element from reshash if it has no waiters or owners
926 
927   rc->lock must be locked by the caller and it's unlocked on return.
928 */
static int unlock_lock_and_free_resource(WT_THD *thd, WT_RESOURCE *rc)
{
  uint key_length;
  const void *hash_key;
  DBUG_ENTER("unlock_lock_and_free_resource");

  DBUG_ASSERT(rc->state == ACTIVE);

  /* Somebody still owns or waits on this resource - keep it in the hash. */
  if (rc->owners.elements || rc->waiter_count)
  {
    DBUG_PRINT("wt", ("nothing to do, %u owners, %u waiters",
                      rc->owners.elements, rc->waiter_count));
    rc_unlock(rc);
    DBUG_RETURN(0);
  }

  /* Make sure we have LF_HASH pins; without them we cannot delete. */
  if (fix_thd_pins(thd))
  {
    rc_unlock(rc);
    DBUG_RETURN(1);
  }

  /* XXX if (rc->id.type->make_key) hash_key= rc->id.type->make_key(&rc->id, &key_length); else */
  hash_key= &rc->id;
  key_length= sizeof_WT_RESOURCE_ID;

  /*
    Freeing sequence: the element's lock is already held (step 1), so mark
    the element FREE (step 2), drop the lock (step 3), and only then remove
    the element from the hash (step 4).
  */
  rc->state= FREE;
  rc_unlock(rc);
  DBUG_RETURN(lf_hash_delete(&reshash, thd->pins, hash_key, key_length) == -1);
}
968 
969 
970 /**
971   register the fact that thd is not waiting anymore
972 
973   decrease waiter_count, clear waiting_for, free the resource if appropriate.
974   thd->waiting_for must be locked!
975 */
static int stop_waiting_locked(WT_THD *thd)
{
  int free_failed;
  WT_RESOURCE *rc= thd->waiting_for;
  DBUG_ENTER("stop_waiting_locked");

  DBUG_ASSERT(rc->waiter_count);
  DBUG_ASSERT(rc->state == ACTIVE);
  /* unregister this waiter and drop (possibly free) the resource */
  rc->waiter_count--;
  thd->waiting_for= 0;
  free_failed= unlock_lock_and_free_resource(thd, rc);
  /* a kill or a failure to free both map to WT_DEADLOCK */
  if (thd->killed || free_failed)
    DBUG_RETURN(WT_DEADLOCK);
  DBUG_RETURN(WT_OK);
}
989 
990 /**
991   register the fact that thd is not waiting anymore
992 
993   locks thd->waiting_for and calls stop_waiting_locked().
994 */
static int stop_waiting(WT_THD *thd)
{
  int ret;
  WT_RESOURCE *waited_on= thd->waiting_for;
  DBUG_ENTER("stop_waiting");

  /* not waiting on anything - nothing to undo */
  if (!waited_on)
    DBUG_RETURN(WT_OK);
  /*
    The resource cannot disappear under us: our own registration keeps
    its waiter_count non-zero, so nobody may free it concurrently.
  */
  rc_wrlock(waited_on);
  ret= stop_waiting_locked(thd);
  DBUG_RETURN(ret);
}
1011 
1012 /**
1013   notify the system that a thread needs to wait for another thread
1014 
1015   called by a *waiter* to declare that it (thd) will wait for another
1016   thread (blocker) on a specific resource (resid).
1017   can be called many times, if many blockers own a blocking resource.
1018   but must always be called with the same resource id - a thread cannot
1019   wait for more than one resource at a time.
1020 
1021   @return WT_OK or WT_DEADLOCK
1022 
1023   As a new edge is added to the wait-for graph, a deadlock detection is
1024   performed for this new edge.
1025 */
int wt_thd_will_wait_for(WT_THD *thd, WT_THD *blocker,
                         const WT_RESOURCE_ID *resid)
{
  uint i;
  WT_RESOURCE *rc;
  DBUG_ENTER("wt_thd_will_wait_for");

  LF_REQUIRE_PINS(3);

  DBUG_PRINT("wt", ("enter: thd=%s, blocker=%s, resid=%lu",
                    thd->name, blocker->name, (ulong)resid->value));

  if (fix_thd_pins(thd))
    DBUG_RETURN(WT_DEADLOCK);

  if (thd->waiting_for == 0)
  {
    /* first call for this wait: find (or create) the resource in the hash */
    uint keylen;
    const void *key;
    /* XXX if (restype->make_key) key= restype->make_key(resid, &keylen); else */
    {
      key= resid;
      keylen= sizeof_WT_RESOURCE_ID;
    }

    DBUG_PRINT("wt", ("first blocker"));

retry:
    while ((rc= lf_hash_search(&reshash, thd->pins, key, keylen)) == 0)
    {
      WT_RESOURCE tmp;

      DBUG_PRINT("wt", ("failed to find rc in hash, inserting"));
      memset(&tmp, 0, sizeof(tmp));
      tmp.id= *resid;
      tmp.state= ACTIVE;

      if (lf_hash_insert(&reshash, thd->pins, &tmp) == -1) /* if OOM */
        DBUG_RETURN(WT_DEADLOCK);
      /*
        Two cases: either lf_hash_insert() failed - because another thread
        has just inserted a resource with the same id - and we need to retry.
        Or lf_hash_insert() succeeded, and then we need to repeat
        lf_hash_search() to find a real address of the newly inserted element.
        That is, we don't care what lf_hash_insert() has returned.
        And we need to repeat the loop anyway.
      */
    }
    if (rc == MY_ERRPTR)
      DBUG_RETURN(WT_DEADLOCK);

    DBUG_PRINT("wt", ("found in hash rc=%p", rc));

    rc_wrlock(rc);
    if (rc->state != ACTIVE)
    {
      DBUG_PRINT("wt", ("but it's not active, retrying"));
      /* Somebody has freed the element while we weren't looking */
      rc_unlock(rc);
      lf_hash_search_unpin(thd->pins);
      goto retry;
    }

    lf_hash_search_unpin(thd->pins); /* the element cannot go away anymore */
    thd->waiting_for= rc;
    rc->waiter_count++;
    /* a fresh wait starts: clear any kill flag left over from before */
    thd->killed= 0;
  }
  else
  {
    /* subsequent call: must be for the very same resource id */
    DBUG_ASSERT(thd->waiting_for->id.type == resid->type);
    DBUG_ASSERT(resid->type->compare(&thd->waiting_for->id, resid) == 0);
    DBUG_PRINT("wt", ("adding another blocker"));

    /*
      we can safely access the resource here, it's in the hash as it has
      non-zero waiter_count
    */
    rc= thd->waiting_for;
    rc_wrlock(rc);
    DBUG_ASSERT(rc->waiter_count);
    DBUG_ASSERT(rc->state == ACTIVE);

    /* we were chosen as a deadlock victim between the calls - give up */
    if (thd->killed)
    {
      stop_waiting_locked(thd);
      DBUG_RETURN(WT_DEADLOCK);
    }
  }
  /*
    Another thread could be waiting on this resource for this very 'blocker'.
    In this case we should not add it to the list for the second time.
  */
  for (i= 0; i < rc->owners.elements; i++)
    if (*dynamic_element(&rc->owners, i, WT_THD**) == blocker)
      break;
  if (i >= rc->owners.elements)
  {
    /* register the ownership on both sides; undo everything on OOM */
    if (push_dynamic(&blocker->my_resources, (void*)&rc))
    {
      stop_waiting_locked(thd);
      DBUG_RETURN(WT_DEADLOCK); /* deadlock and OOM use the same error code */
    }
    if (push_dynamic(&rc->owners, (void*)&blocker))
    {
      pop_dynamic(&blocker->my_resources);
      stop_waiting_locked(thd);
      DBUG_RETURN(WT_DEADLOCK);
    }
  }
  rc_unlock(rc);

  /* a new edge thd->blocker was added - check it with the short depth limit */
  if (deadlock(thd, blocker, 1, *thd->deadlock_search_depth_short) != WT_OK)
  {
    stop_waiting(thd);
    DBUG_RETURN(WT_DEADLOCK);
  }
  DBUG_RETURN(WT_OK);
}
1145 
1146 /**
1147   called by a *waiter* (thd) to start waiting
1148 
1149   It's supposed to be a drop-in replacement for
1150   pthread_cond_timedwait(), and it takes mutex as an argument.
1151 
1152   @return one of WT_TIMEOUT, WT_DEADLOCK, WT_OK
1153 */
int wt_thd_cond_timedwait(WT_THD *thd, mysql_mutex_t *mutex)
{
  int ret= WT_TIMEOUT;
  struct timespec timeout;
  ulonglong before, after, starttime;
  WT_RESOURCE *rc= thd->waiting_for;
  DBUG_ENTER("wt_thd_cond_timedwait");
  DBUG_PRINT("wt", ("enter: thd=%s, rc=%p", thd->name, rc));

#ifndef DBUG_OFF
  /* debug builds verify that all waiters of a resource use the same mutex */
  if (rc->cond_mutex)
    DBUG_ASSERT(rc->cond_mutex == mutex);
  else
    rc->cond_mutex= mutex;
  mysql_mutex_assert_owner(mutex);
#endif

  before= starttime= my_getsystime();

#ifdef __WIN__
  /*
    only for the sake of Windows we distinguish between
    'before' and 'starttime':

    my_getsystime() returns high-resolution value, that cannot be used for
    waiting (it doesn't follow system clock changes), but is good for time
    intervals.

    GetSystemTimeAsFileTime() follows system clock, but is low-resolution
    and will result in lousy intervals.
  */
  GetSystemTimeAsFileTime((PFILETIME)&starttime);
#endif

  /* if all blockers are already gone, don't wait at all */
  rc_wrlock(rc);
  if (rc->owners.elements == 0)
    ret= WT_OK;
  rc_unlock(rc);

  /* phase 1: short wait, hoping the blockers go away quickly */
  set_timespec_time_nsec(timeout, starttime, (*thd->timeout_short)*1000ULL);
  if (ret == WT_TIMEOUT && !thd->killed)
    ret= mysql_cond_timedwait(&rc->cond, mutex, &timeout);
  if (ret == WT_TIMEOUT && !thd->killed)
  {
    /*
      phase 2: short timeout expired - run a deeper deadlock search
      (starting from thd itself, depth 0), then wait out the rest of
      the long timeout if no deadlock was found.
    */
    int r= deadlock(thd, thd, 0, *thd->deadlock_search_depth_long);
    if (r == WT_FREE_TO_GO)
      ret= WT_OK;
    else if (r != WT_OK)
      ret= WT_DEADLOCK;
    else if (*thd->timeout_long > *thd->timeout_short)
    {
      set_timespec_time_nsec(timeout, starttime, (*thd->timeout_long)*1000ULL);
      if (!thd->killed)
        ret= mysql_cond_timedwait(&rc->cond, mutex, &timeout);
    }
  }
  after= my_getsystime();
  if (stop_waiting(thd) == WT_DEADLOCK) /* if we're killed */
    ret= WT_DEADLOCK;
  increment_wait_stats(after-before, ret);
  if (ret == WT_OK)
    increment_success_stats();
  DBUG_RETURN(ret);
}
1218 
1219 /**
1220   called by a *blocker* when it releases a resource
1221 
1222   it's conceptually similar to pthread_cond_broadcast, and must be done
1223   under the same mutex as wt_thd_cond_timedwait().
1224 
1225   @param resid   a resource to release. 0 to release all resources
1226 */
1227 
void wt_thd_release(WT_THD *thd, const WT_RESOURCE_ID *resid)
{
  uint i;
  DBUG_ENTER("wt_thd_release");

  /* scan everything thd owns; release all of it, or just the one 'resid' */
  for (i= 0; i < thd->my_resources.elements; i++)
  {
    WT_RESOURCE *rc= *dynamic_element(&thd->my_resources, i, WT_RESOURCE**);
    if (!resid || (resid->type->compare(&rc->id, resid) == 0))
    {
      uint j;

      rc_wrlock(rc);
      /*
        nobody's trying to free the resource now,
        as its owners[] array is not empty (at least thd must be there)
      */
      DBUG_ASSERT(rc->state == ACTIVE);
      /* find and remove thd from this resource's owners[] */
      for (j= 0; j < rc->owners.elements; j++)
        if (*dynamic_element(&rc->owners, j, WT_THD**) == thd)
          break;
      DBUG_ASSERT(j < rc->owners.elements);
      delete_dynamic_element(&rc->owners, j);
      if (rc->owners.elements == 0)
      {
        /* thd was the last owner - wake up everybody waiting on it */
        mysql_cond_broadcast(&rc->cond);
#ifndef DBUG_OFF
        if (rc->cond_mutex)
          mysql_mutex_assert_owner(rc->cond_mutex);
#endif
      }
      unlock_lock_and_free_resource(thd, rc);
      if (resid)
      {
        /* releasing a single resource: remove it from my_resources and stop */
        delete_dynamic_element(&thd->my_resources, i);
        DBUG_VOID_RETURN;
      }
    }
  }
  /* releasing everything: clear the whole my_resources array in one go */
  if (!resid)
    reset_dynamic(&thd->my_resources);
  DBUG_VOID_RETURN;
}
1271 
1272