1 /*****************************************************************************
2 
3 Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2008, Google Inc.
5 
6 Portions of this file contain modifications contributed and copyrighted by
7 Google, Inc. Those modifications are gratefully acknowledged and are described
8 briefly in the InnoDB documentation. The contributions by Google are
9 incorporated with their permission, and subject to the conditions contained in
10 the file COPYING.Google.
11 
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License, version 2.0,
14 as published by the Free Software Foundation.
15 
16 This program is also distributed with certain software (including
17 but not limited to OpenSSL) that is licensed under separate terms,
18 as designated in a particular file or component or in included license
19 documentation.  The authors of MySQL hereby grant you an additional
20 permission to link the program and your derivative works with the
21 separately licensed software that they have included with MySQL.
22 
23 This program is distributed in the hope that it will be useful,
24 but WITHOUT ANY WARRANTY; without even the implied warranty of
25 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26 GNU General Public License, version 2.0, for more details.
27 
28 You should have received a copy of the GNU General Public License along with
29 this program; if not, write to the Free Software Foundation, Inc.,
30 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
31 
32 *****************************************************************************/
33 
34 /**************************************************//**
35 @file sync/sync0arr.cc
36 The wait array used in synchronization primitives
37 
38 Created 9/5/1995 Heikki Tuuri
39 *******************************************************/
40 
41 #include "sync0arr.h"
42 #ifdef UNIV_NONINL
43 #include "sync0arr.ic"
44 #endif
45 
46 #include "sync0sync.h"
47 #include "sync0rw.h"
48 #include "os0sync.h"
49 #include "os0file.h"
50 #include "lock0lock.h"
51 #include "srv0srv.h"
52 #include "ha_prototypes.h"
53 
54 /*
55 			WAIT ARRAY
56 			==========
57 
58 The wait array consists of cells each of which has an
59 an operating system event object created for it. The threads
60 waiting for a mutex, for example, can reserve a cell
61 in the array and suspend themselves to wait for the event
62 to become signaled. When using the wait array, remember to make
63 sure that some thread holding the synchronization object
64 will eventually know that there is a waiter in the array and
65 signal the object, to prevent infinite wait.
66 Why we chose to implement a wait array? First, to make
67 mutexes fast, we had to code our own implementation of them,
68 which only in usually uncommon cases resorts to using
69 slow operating system primitives. Then we had the choice of
70 assigning a unique OS event for each mutex, which would
71 be simpler, or using a global wait array. In some operating systems,
72 the global wait array solution is more efficient and flexible,
73 because we can do with a very small number of OS events,
74 say 200. In NT 3.51, allocating events seems to be a quadratic
75 algorithm, because 10 000 events are created fast, but
76 100 000 events takes a couple of minutes to create.
77 
78 As of 5.0.30 the above mentioned design is changed. Since now
79 OS can handle millions of wait events efficiently, we no longer
80 have this concept of each cell of wait array having one event.
81 Instead, now the event that a thread wants to wait on is embedded
82 in the wait object (mutex or rw_lock). We still keep the global
83 wait array for the sake of diagnostics and also to avoid infinite
84 wait The error_monitor thread scans the global wait array to signal
85 any waiting threads who have missed the signal. */
86 
87 /** A cell where an individual thread may wait suspended
88 until a resource is released. The suspending is implemented
89 using an operating system event semaphore. */
90 struct sync_cell_t {
91 	void*		wait_object;	/*!< pointer to the object the
92 					thread is waiting for; if NULL
93 					the cell is free for use */
94 	ib_mutex_t*	old_wait_mutex;	/*!< the latest wait mutex in cell */
95 	rw_lock_t*	old_wait_rw_lock;
96 					/*!< the latest wait rw-lock
97 					in cell */
98 	ulint		request_type;	/*!< lock type requested on the
99 					object */
100 	const char*	file;		/*!< in debug version file where
101 					requested */
102 	ulint		line;		/*!< in debug version line where
103 					requested */
104 	os_thread_id_t	thread;		/*!< thread id of this waiting
105 					thread */
106 	ibool		waiting;	/*!< TRUE if the thread has already
107 					called sync_array_event_wait
108 					on this cell */
109 	ib_int64_t	signal_count;	/*!< We capture the signal_count
110 					of the wait_object when we
111 					reset the event. This value is
112 					then passed on to os_event_wait
113 					and we wait only if the event
114 					has not been signalled in the
115 					period between the reset and
116 					wait call. */
117 	time_t		reservation_time;/*!< time when the thread reserved
118 					the wait cell */
119 };
120 
121 /* NOTE: It is allowed for a thread to wait
122 for an event allocated for the array without owning the
123 protecting mutex (depending on the case: OS or database mutex), but
124 all changes (set or reset) to the state of the event must be made
125 while owning the mutex. */
126 
127 /** Synchronization array */
128 struct sync_array_t {
129 	ulint		n_reserved;	/*!< number of currently reserved
130 					cells in the wait array */
131 	ulint		n_cells;	/*!< number of cells in the
132 					wait array */
133 	sync_cell_t*	array;		/*!< pointer to wait array */
134 	ib_mutex_t	mutex;		/*!< possible database mutex
135 					protecting this data structure */
136 	os_ib_mutex_t	os_mutex;	/*!< Possible operating system mutex
137 					protecting the data structure.
138 					As this data structure is used in
139 					constructing the database mutex,
140 					to prevent infinite recursion
141 					in implementation, we fall back to
142 					an OS mutex. */
143 	ulint		res_count;	/*!< count of cell reservations
144 					since creation of the array */
145 };
146 
147 /** User configured sync array size */
148 UNIV_INTERN ulong	srv_sync_array_size = 32;
149 
150 /** Locally stored copy of srv_sync_array_size */
151 static	ulint		sync_array_size;
152 
153 /** The global array of wait cells for implementation of the database's own
154 mutexes and read-write locks */
155 static	sync_array_t**	sync_wait_array;
156 
157 /** count of how many times an object has been signalled */
158 static ulint		sg_count;
159 
160 #ifdef UNIV_SYNC_DEBUG
161 /******************************************************************//**
162 This function is called only in the debug version. Detects a deadlock
163 of one or more threads because of waits of semaphores.
164 @return	TRUE if deadlock detected */
165 static
166 ibool
167 sync_array_detect_deadlock(
168 /*=======================*/
169 	sync_array_t*	arr,	/*!< in: wait array; NOTE! the caller must
170 				own the mutex to array */
171 	sync_cell_t*	start,	/*!< in: cell where recursive search started */
172 	sync_cell_t*	cell,	/*!< in: cell to search */
173 	ulint		depth);	/*!< in: recursion depth */
174 #endif /* UNIV_SYNC_DEBUG */
175 
176 /*****************************************************************//**
177 Gets the nth cell in array.
178 @return	cell */
179 static
180 sync_cell_t*
sync_array_get_nth_cell(sync_array_t * arr,ulint n)181 sync_array_get_nth_cell(
182 /*====================*/
183 	sync_array_t*	arr,	/*!< in: sync array */
184 	ulint		n)	/*!< in: index */
185 {
186 	ut_a(arr);
187 	ut_a(n < arr->n_cells);
188 
189 	return(arr->array + n);
190 }
191 
192 /******************************************************************//**
193 Reserves the mutex semaphore protecting a sync array. */
194 static
195 void
sync_array_enter(sync_array_t * arr)196 sync_array_enter(
197 /*=============*/
198 	sync_array_t*	arr)	/*!< in: sync wait array */
199 {
200 	os_mutex_enter(arr->os_mutex);
201 }
202 
203 /******************************************************************//**
204 Releases the mutex semaphore protecting a sync array. */
205 static
206 void
sync_array_exit(sync_array_t * arr)207 sync_array_exit(
208 /*============*/
209 	sync_array_t*	arr)	/*!< in: sync wait array */
210 {
211 	os_mutex_exit(arr->os_mutex);
212 }
213 
214 /*******************************************************************//**
215 Creates a synchronization wait array. It is protected by a mutex
216 which is automatically reserved when the functions operating on it
217 are called.
218 @return	own: created wait array */
219 static
220 sync_array_t*
sync_array_create(ulint n_cells)221 sync_array_create(
222 /*==============*/
223 	ulint	n_cells)	/*!< in: number of cells in the array
224 				to create */
225 {
226 	ulint		sz;
227 	sync_array_t*	arr;
228 
229 	ut_a(n_cells > 0);
230 
231 	/* Allocate memory for the data structures */
232 	arr = static_cast<sync_array_t*>(ut_malloc(sizeof(*arr)));
233 	memset(arr, 0x0, sizeof(*arr));
234 
235 	sz = sizeof(sync_cell_t) * n_cells;
236 	arr->array = static_cast<sync_cell_t*>(ut_malloc(sz));
237 	memset(arr->array, 0x0, sz);
238 
239 	arr->n_cells = n_cells;
240 
241 	/* Then create the mutex to protect the wait array complex */
242 	arr->os_mutex = os_mutex_create();
243 
244 	return(arr);
245 }
246 
247 /******************************************************************//**
248 Frees the resources in a wait array. */
249 static
250 void
sync_array_free(sync_array_t * arr)251 sync_array_free(
252 /*============*/
253 	sync_array_t*	arr)	/*!< in, own: sync wait array */
254 {
255 	ut_a(arr->n_reserved == 0);
256 
257 	sync_array_validate(arr);
258 
259 	/* Release the mutex protecting the wait array complex */
260 
261 	os_mutex_free(arr->os_mutex);
262 
263 	ut_free(arr->array);
264 	ut_free(arr);
265 }
266 
267 /********************************************************************//**
268 Validates the integrity of the wait array. Checks
269 that the number of reserved cells equals the count variable. */
270 UNIV_INTERN
271 void
sync_array_validate(sync_array_t * arr)272 sync_array_validate(
273 /*================*/
274 	sync_array_t*	arr)	/*!< in: sync wait array */
275 {
276 	ulint		i;
277 	sync_cell_t*	cell;
278 	ulint		count		= 0;
279 
280 	sync_array_enter(arr);
281 
282 	for (i = 0; i < arr->n_cells; i++) {
283 		cell = sync_array_get_nth_cell(arr, i);
284 		if (cell->wait_object != NULL) {
285 			count++;
286 		}
287 	}
288 
289 	ut_a(count == arr->n_reserved);
290 
291 	sync_array_exit(arr);
292 }
293 
294 /*******************************************************************//**
295 Returns the event that the thread owning the cell waits for. */
296 static
297 os_event_t
sync_cell_get_event(sync_cell_t * cell)298 sync_cell_get_event(
299 /*================*/
300 	sync_cell_t*	cell) /*!< in: non-empty sync array cell */
301 {
302 	ulint type = cell->request_type;
303 
304 	if (type == SYNC_MUTEX) {
305 		return(((ib_mutex_t*) cell->wait_object)->event);
306 	} else if (type == RW_LOCK_WAIT_EX) {
307 		return(((rw_lock_t*) cell->wait_object)->wait_ex_event);
308 	} else { /* RW_LOCK_SHARED and RW_LOCK_EX wait on the same event */
309 		return(((rw_lock_t*) cell->wait_object)->event);
310 	}
311 }
312 
313 /******************************************************************//**
314 Reserves a wait array cell for waiting for an object.
315 The event of the cell is reset to nonsignalled state.
316 @return true if free cell is found, otherwise false */
317 UNIV_INTERN
318 bool
sync_array_reserve_cell(sync_array_t * arr,void * object,ulint type,const char * file,ulint line,ulint * index)319 sync_array_reserve_cell(
320 /*====================*/
321 	sync_array_t*	arr,	/*!< in: wait array */
322 	void*		object, /*!< in: pointer to the object to wait for */
323 	ulint		type,	/*!< in: lock request type */
324 	const char*	file,	/*!< in: file where requested */
325 	ulint		line,	/*!< in: line where requested */
326 	ulint*		index)	/*!< out: index of the reserved cell */
327 {
328 	sync_cell_t*	cell;
329 	os_event_t      event;
330 	ulint		i;
331 
332 	ut_a(object);
333 	ut_a(index);
334 
335 	sync_array_enter(arr);
336 
337 	arr->res_count++;
338 
339 	/* Reserve a new cell. */
340 	for (i = 0; i < arr->n_cells; i++) {
341 		cell = sync_array_get_nth_cell(arr, i);
342 
343 		if (cell->wait_object == NULL) {
344 
345 			cell->waiting = FALSE;
346 			cell->wait_object = object;
347 
348 			if (type == SYNC_MUTEX) {
349 				cell->old_wait_mutex =
350 					static_cast<ib_mutex_t*>(object);
351 			} else {
352 				cell->old_wait_rw_lock =
353 					static_cast<rw_lock_t*>(object);
354 			}
355 
356 			cell->request_type = type;
357 
358 			cell->file = file;
359 			cell->line = line;
360 
361 			arr->n_reserved++;
362 
363 			*index = i;
364 
365 			sync_array_exit(arr);
366 
367 			/* Make sure the event is reset and also store
368 			the value of signal_count at which the event
369 			was reset. */
370                         event = sync_cell_get_event(cell);
371 			cell->signal_count = os_event_reset(event);
372 
373 			cell->reservation_time = ut_time();
374 
375 			cell->thread = os_thread_get_curr_id();
376 
377 			return(true);
378 		}
379 	}
380 
381 	/* No free cell found */
382 	return false;
383 }
384 
385 /******************************************************************//**
386 This function should be called when a thread starts to wait on
387 a wait array cell. In the debug version this function checks
388 if the wait for a semaphore will result in a deadlock, in which
389 case prints info and asserts. */
390 UNIV_INTERN
391 void
sync_array_wait_event(sync_array_t * arr,ulint index)392 sync_array_wait_event(
393 /*==================*/
394 	sync_array_t*	arr,	/*!< in: wait array */
395 	ulint		index)	/*!< in: index of the reserved cell */
396 {
397 	sync_cell_t*	cell;
398 	os_event_t	event;
399 
400 	ut_a(arr);
401 
402 	sync_array_enter(arr);
403 
404 	cell = sync_array_get_nth_cell(arr, index);
405 
406 	ut_a(cell->wait_object);
407 	ut_a(!cell->waiting);
408 	ut_ad(os_thread_get_curr_id() == cell->thread);
409 
410 	event = sync_cell_get_event(cell);
411 	cell->waiting = TRUE;
412 
413 #ifdef UNIV_SYNC_DEBUG
414 
415 	/* We use simple enter to the mutex below, because if
416 	we cannot acquire it at once, mutex_enter would call
417 	recursively sync_array routines, leading to trouble.
418 	rw_lock_debug_mutex freezes the debug lists. */
419 
420 	rw_lock_debug_mutex_enter();
421 
422 	if (TRUE == sync_array_detect_deadlock(arr, cell, cell, 0)) {
423 
424 		fputs("########################################\n", stderr);
425 		ut_error;
426 	}
427 
428 	rw_lock_debug_mutex_exit();
429 #endif
430 	sync_array_exit(arr);
431 
432 	os_event_wait_low(event, cell->signal_count);
433 
434 	sync_array_free_cell(arr, index);
435 }
436 
437 /******************************************************************//**
438 Reports info of a wait array cell. */
439 static
440 void
sync_array_cell_print(FILE * file,sync_cell_t * cell)441 sync_array_cell_print(
442 /*==================*/
443 	FILE*		file,	/*!< in: file where to print */
444 	sync_cell_t*	cell)	/*!< in: sync cell */
445 {
446 	ib_mutex_t*	mutex;
447 	rw_lock_t*	rwlock;
448 	ulint		type;
449 	ulint		writer;
450 
451 	type = cell->request_type;
452 
453 	fprintf(file,
454 		"--Thread %lu has waited at %s line %lu"
455 		" for %.2f seconds the semaphore:\n",
456 		(ulong) os_thread_pf(cell->thread),
457 		innobase_basename(cell->file), (ulong) cell->line,
458 		difftime(time(NULL), cell->reservation_time));
459 
460 	if (type == SYNC_MUTEX) {
461 		/* We use old_wait_mutex in case the cell has already
462 		been freed meanwhile */
463 		mutex = cell->old_wait_mutex;
464 
465 		fprintf(file,
466 			"Mutex at %p created file %s line %lu, lock var %lu\n"
467 #ifdef UNIV_SYNC_DEBUG
468 			"Last time reserved in file %s line %lu, "
469 #endif /* UNIV_SYNC_DEBUG */
470 			"waiters flag %lu\n",
471 			(void*) mutex, innobase_basename(mutex->cfile_name),
472 			(ulong) mutex->cline,
473 			(ulong) mutex->lock_word,
474 #ifdef UNIV_SYNC_DEBUG
475 			mutex->file_name, (ulong) mutex->line,
476 #endif /* UNIV_SYNC_DEBUG */
477 			(ulong) mutex->waiters);
478 
479 	} else if (type == RW_LOCK_EX
480 		   || type == RW_LOCK_WAIT_EX
481 		   || type == RW_LOCK_SHARED) {
482 
483 		fputs(type == RW_LOCK_EX ? "X-lock on"
484 		      : type == RW_LOCK_WAIT_EX ? "X-lock (wait_ex) on"
485 		      : "S-lock on", file);
486 
487 		rwlock = cell->old_wait_rw_lock;
488 
489 		fprintf(file,
490 			" RW-latch at %p created in file %s line %lu\n",
491 			(void*) rwlock, innobase_basename(rwlock->cfile_name),
492 			(ulong) rwlock->cline);
493 		writer = rw_lock_get_writer(rwlock);
494 		if (writer != RW_LOCK_NOT_LOCKED) {
495 			fprintf(file,
496 				"a writer (thread id %lu) has"
497 				" reserved it in mode %s",
498 				(ulong) os_thread_pf(rwlock->writer_thread),
499 				writer == RW_LOCK_EX
500 				? " exclusive\n"
501 				: " wait exclusive\n");
502 		}
503 
504 		fprintf(file,
505 			"number of readers %lu, waiters flag %lu, "
506                         "lock_word: %lx\n"
507 			"Last time read locked in file %s line %lu\n"
508 			"Last time write locked in file %s line %lu\n",
509 			(ulong) rw_lock_get_reader_count(rwlock),
510 			(ulong) rwlock->waiters,
511 			rwlock->lock_word,
512 			innobase_basename(rwlock->last_s_file_name),
513 			(ulong) rwlock->last_s_line,
514 			rwlock->last_x_file_name,
515 			(ulong) rwlock->last_x_line);
516 	} else {
517 		ut_error;
518 	}
519 
520 	if (!cell->waiting) {
521 		fputs("wait has ended\n", file);
522 	}
523 }
524 
525 #ifdef UNIV_SYNC_DEBUG
526 /******************************************************************//**
527 Looks for a cell with the given thread id.
528 @return	pointer to cell or NULL if not found */
529 static
530 sync_cell_t*
sync_array_find_thread(sync_array_t * arr,os_thread_id_t thread)531 sync_array_find_thread(
532 /*===================*/
533 	sync_array_t*	arr,	/*!< in: wait array */
534 	os_thread_id_t	thread)	/*!< in: thread id */
535 {
536 	ulint		i;
537 	sync_cell_t*	cell;
538 
539 	for (i = 0; i < arr->n_cells; i++) {
540 
541 		cell = sync_array_get_nth_cell(arr, i);
542 
543 		if (cell->wait_object != NULL
544 		    && os_thread_eq(cell->thread, thread)) {
545 
546 			return(cell);	/* Found */
547 		}
548 	}
549 
550 	return(NULL);	/* Not found */
551 }
552 
553 /******************************************************************//**
554 Recursion step for deadlock detection.
555 @return	TRUE if deadlock detected */
556 static
557 ibool
sync_array_deadlock_step(sync_array_t * arr,sync_cell_t * start,os_thread_id_t thread,ulint pass,ulint depth)558 sync_array_deadlock_step(
559 /*=====================*/
560 	sync_array_t*	arr,	/*!< in: wait array; NOTE! the caller must
561 				own the mutex to array */
562 	sync_cell_t*	start,	/*!< in: cell where recursive search
563 				started */
564 	os_thread_id_t	thread,	/*!< in: thread to look at */
565 	ulint		pass,	/*!< in: pass value */
566 	ulint		depth)	/*!< in: recursion depth */
567 {
568 	sync_cell_t*	new_cell;
569 
570 	if (pass != 0) {
571 		/* If pass != 0, then we do not know which threads are
572 		responsible of releasing the lock, and no deadlock can
573 		be detected. */
574 
575 		return(FALSE);
576 	}
577 
578 	new_cell = sync_array_find_thread(arr, thread);
579 
580 	if (new_cell == start) {
581 		/* Deadlock */
582 		fputs("########################################\n"
583 		      "DEADLOCK of threads detected!\n", stderr);
584 
585 		return(TRUE);
586 
587 	} else if (new_cell) {
588 		return(sync_array_detect_deadlock(
589 			arr, start, new_cell, depth + 1));
590 	}
591 	return(FALSE);
592 }
593 
594 /******************************************************************//**
595 This function is called only in the debug version. Detects a deadlock
596 of one or more threads because of waits of semaphores.
597 @return	TRUE if deadlock detected */
598 static
599 ibool
sync_array_detect_deadlock(sync_array_t * arr,sync_cell_t * start,sync_cell_t * cell,ulint depth)600 sync_array_detect_deadlock(
601 /*=======================*/
602 	sync_array_t*	arr,	/*!< in: wait array; NOTE! the caller must
603 				own the mutex to array */
604 	sync_cell_t*	start,	/*!< in: cell where recursive search started */
605 	sync_cell_t*	cell,	/*!< in: cell to search */
606 	ulint		depth)	/*!< in: recursion depth */
607 {
608 	ib_mutex_t*	mutex;
609 	rw_lock_t*	lock;
610 	os_thread_id_t	thread;
611 	ibool		ret;
612 	rw_lock_debug_t*debug;
613 
614 	ut_a(arr);
615 	ut_a(start);
616 	ut_a(cell);
617 	ut_ad(cell->wait_object);
618 	ut_ad(os_thread_get_curr_id() == start->thread);
619 	ut_ad(depth < 100);
620 
621 	depth++;
622 
623 	if (!cell->waiting) {
624 
625 		return(FALSE); /* No deadlock here */
626 	}
627 
628 	if (cell->request_type == SYNC_MUTEX) {
629 
630 		mutex = static_cast<ib_mutex_t*>(cell->wait_object);
631 
632 		if (mutex_get_lock_word(mutex) != 0) {
633 
634 			thread = mutex->thread_id;
635 
636 			/* Note that mutex->thread_id above may be
637 			also OS_THREAD_ID_UNDEFINED, because the
638 			thread which held the mutex maybe has not
639 			yet updated the value, or it has already
640 			released the mutex: in this case no deadlock
641 			can occur, as the wait array cannot contain
642 			a thread with ID_UNDEFINED value. */
643 
644 			ret = sync_array_deadlock_step(arr, start, thread, 0,
645 						       depth);
646 			if (ret) {
647 				fprintf(stderr,
648 			"Mutex %p owned by thread %lu file %s line %lu\n",
649 					mutex, (ulong) os_thread_pf(mutex->thread_id),
650 					mutex->file_name, (ulong) mutex->line);
651 				sync_array_cell_print(stderr, cell);
652 
653 				return(TRUE);
654 			}
655 		}
656 
657 		return(FALSE); /* No deadlock */
658 
659 	} else if (cell->request_type == RW_LOCK_EX
660 		   || cell->request_type == RW_LOCK_WAIT_EX) {
661 
662 		lock = static_cast<rw_lock_t*>(cell->wait_object);
663 
664 		for (debug = UT_LIST_GET_FIRST(lock->debug_list);
665 		     debug != 0;
666 		     debug = UT_LIST_GET_NEXT(list, debug)) {
667 
668 			thread = debug->thread_id;
669 
670 			if (((debug->lock_type == RW_LOCK_EX)
671 			     && !os_thread_eq(thread, cell->thread))
672 			    || ((debug->lock_type == RW_LOCK_WAIT_EX)
673 				&& !os_thread_eq(thread, cell->thread))
674 			    || (debug->lock_type == RW_LOCK_SHARED)) {
675 
676 				/* The (wait) x-lock request can block
677 				infinitely only if someone (can be also cell
678 				thread) is holding s-lock, or someone
679 				(cannot be cell thread) (wait) x-lock, and
680 				he is blocked by start thread */
681 
682 				ret = sync_array_deadlock_step(
683 					arr, start, thread, debug->pass,
684 					depth);
685 				if (ret) {
686 print:
687 					fprintf(stderr, "rw-lock %p ",
688 						(void*) lock);
689 					sync_array_cell_print(stderr, cell);
690 					rw_lock_debug_print(stderr, debug);
691 					return(TRUE);
692 				}
693 			}
694 		}
695 
696 		return(FALSE);
697 
698 	} else if (cell->request_type == RW_LOCK_SHARED) {
699 
700 		lock = static_cast<rw_lock_t*>(cell->wait_object);
701 
702 		for (debug = UT_LIST_GET_FIRST(lock->debug_list);
703 		     debug != 0;
704 		     debug = UT_LIST_GET_NEXT(list, debug)) {
705 
706 			thread = debug->thread_id;
707 
708 			if ((debug->lock_type == RW_LOCK_EX)
709 			    || (debug->lock_type == RW_LOCK_WAIT_EX)) {
710 
711 				/* The s-lock request can block infinitely
712 				only if someone (can also be cell thread) is
713 				holding (wait) x-lock, and he is blocked by
714 				start thread */
715 
716 				ret = sync_array_deadlock_step(
717 					arr, start, thread, debug->pass,
718 					depth);
719 				if (ret) {
720 					goto print;
721 				}
722 			}
723 		}
724 
725 		return(FALSE);
726 
727 	} else {
728 		ut_error;
729 	}
730 
731 	return(TRUE);	/* Execution never reaches this line: for compiler
732 			fooling only */
733 }
734 #endif /* UNIV_SYNC_DEBUG */
735 
736 /******************************************************************//**
737 Determines if we can wake up the thread waiting for a sempahore. */
738 static
739 ibool
sync_arr_cell_can_wake_up(sync_cell_t * cell)740 sync_arr_cell_can_wake_up(
741 /*======================*/
742 	sync_cell_t*	cell)	/*!< in: cell to search */
743 {
744 	ib_mutex_t*	mutex;
745 	rw_lock_t*	lock;
746 
747 	if (cell->request_type == SYNC_MUTEX) {
748 
749 		mutex = static_cast<ib_mutex_t*>(cell->wait_object);
750 
751 		os_rmb;
752 		if (mutex_get_lock_word(mutex) == 0) {
753 
754 			return(TRUE);
755 		}
756 
757 	} else if (cell->request_type == RW_LOCK_EX) {
758 
759 		lock = static_cast<rw_lock_t*>(cell->wait_object);
760 
761 		os_rmb;
762 		if (lock->lock_word > 0) {
763 		/* Either unlocked or only read locked. */
764 
765 			return(TRUE);
766 		}
767 
768         } else if (cell->request_type == RW_LOCK_WAIT_EX) {
769 
770 		lock = static_cast<rw_lock_t*>(cell->wait_object);
771 
772                 /* lock_word == 0 means all readers have left */
773 		os_rmb;
774 		if (lock->lock_word == 0) {
775 
776 			return(TRUE);
777 		}
778 	} else if (cell->request_type == RW_LOCK_SHARED) {
779 		lock = static_cast<rw_lock_t*>(cell->wait_object);
780 
781                 /* lock_word > 0 means no writer or reserved writer */
782 		os_rmb;
783 		if (lock->lock_word > 0) {
784 
785 			return(TRUE);
786 		}
787 	}
788 
789 	return(FALSE);
790 }
791 
792 /******************************************************************//**
793 Frees the cell. NOTE! sync_array_wait_event frees the cell
794 automatically! */
795 UNIV_INTERN
796 void
sync_array_free_cell(sync_array_t * arr,ulint index)797 sync_array_free_cell(
798 /*=================*/
799 	sync_array_t*	arr,	/*!< in: wait array */
800 	ulint		index)  /*!< in: index of the cell in array */
801 {
802 	sync_cell_t*	cell;
803 
804 	sync_array_enter(arr);
805 
806 	cell = sync_array_get_nth_cell(arr, index);
807 
808 	ut_a(cell->wait_object != NULL);
809 
810 	cell->waiting = FALSE;
811 	cell->wait_object =  NULL;
812 	cell->signal_count = 0;
813 
814 	ut_a(arr->n_reserved > 0);
815 	arr->n_reserved--;
816 
817 	sync_array_exit(arr);
818 }
819 
820 /**********************************************************************//**
821 Increments the signalled count. */
822 UNIV_INTERN
823 void
sync_array_object_signalled(void)824 sync_array_object_signalled(void)
825 /*=============================*/
826 {
827 #ifdef HAVE_ATOMIC_BUILTINS
828 	(void) os_atomic_increment_ulint(&sg_count, 1);
829 #else
830 	++sg_count;
831 #endif /* HAVE_ATOMIC_BUILTINS */
832 }
833 
834 /**********************************************************************//**
835 If the wakeup algorithm does not work perfectly at semaphore relases,
836 this function will do the waking (see the comment in mutex_exit). This
837 function should be called about every 1 second in the server.
838 
839 Note that there's a race condition between this thread and mutex_exit
840 changing the lock_word and calling signal_object, so sometimes this finds
841 threads to wake up even when nothing has gone wrong. */
842 static
843 void
sync_array_wake_threads_if_sema_free_low(sync_array_t * arr)844 sync_array_wake_threads_if_sema_free_low(
845 /*=====================================*/
846 	sync_array_t*	arr)		/* in/out: wait array */
847 {
848 	ulint		i = 0;
849 	ulint		count;
850 
851 	sync_array_enter(arr);
852 
853 	for (count = 0;  count < arr->n_reserved; ++i) {
854 		sync_cell_t*	cell;
855 
856 		cell = sync_array_get_nth_cell(arr, i);
857 
858 		if (cell->wait_object != NULL) {
859 
860 			count++;
861 
862 			if (sync_arr_cell_can_wake_up(cell)) {
863 				os_event_t      event;
864 
865 				event = sync_cell_get_event(cell);
866 
867 				os_event_set(event);
868 			}
869 		}
870 	}
871 
872 	sync_array_exit(arr);
873 }
874 
875 /**********************************************************************//**
876 If the wakeup algorithm does not work perfectly at semaphore relases,
877 this function will do the waking (see the comment in mutex_exit). This
878 function should be called about every 1 second in the server.
879 
880 Note that there's a race condition between this thread and mutex_exit
881 changing the lock_word and calling signal_object, so sometimes this finds
882 threads to wake up even when nothing has gone wrong. */
883 UNIV_INTERN
884 void
sync_arr_wake_threads_if_sema_free(void)885 sync_arr_wake_threads_if_sema_free(void)
886 /*====================================*/
887 {
888 	ulint		i;
889 
890 	for (i = 0; i < sync_array_size; ++i) {
891 
892 		sync_array_wake_threads_if_sema_free_low(
893 			sync_wait_array[i]);
894 	}
895 }
896 
897 /**********************************************************************//**
898 Prints warnings of long semaphore waits to stderr.
899 @return	TRUE if fatal semaphore wait threshold was exceeded */
900 static
901 ibool
sync_array_print_long_waits_low(sync_array_t * arr,os_thread_id_t * waiter,const void ** sema,ibool * noticed)902 sync_array_print_long_waits_low(
903 /*============================*/
904 	sync_array_t*	arr,	/*!< in: sync array instance */
905 	os_thread_id_t*	waiter,	/*!< out: longest waiting thread */
906 	const void**	sema,	/*!< out: longest-waited-for semaphore */
907 	ibool*		noticed)/*!< out: TRUE if long wait noticed */
908 {
909 	ulint		i;
910 	ulint		fatal_timeout = srv_fatal_semaphore_wait_threshold;
911 	ibool		fatal = FALSE;
912 	double		longest_diff = 0;
913 
914 	/* For huge tables, skip the check during CHECK TABLE etc... */
915 	if (fatal_timeout > SRV_SEMAPHORE_WAIT_EXTENSION) {
916 		return(FALSE);
917 	}
918 
919 #ifdef UNIV_DEBUG_VALGRIND
920 	/* Increase the timeouts if running under valgrind because it executes
921 	extremely slowly. UNIV_DEBUG_VALGRIND does not necessary mean that
922 	we are running under valgrind but we have no better way to tell.
923 	See Bug#58432 innodb.innodb_bug56143 fails under valgrind
924 	for an example */
925 # define SYNC_ARRAY_TIMEOUT	2400
926 	fatal_timeout *= 10;
927 #else
928 # define SYNC_ARRAY_TIMEOUT	240
929 #endif
930 
931 	for (i = 0; i < arr->n_cells; i++) {
932 
933 		double		diff;
934 		sync_cell_t*	cell;
935 		void*		wait_object;
936 
937 		cell = sync_array_get_nth_cell(arr, i);
938 
939 		wait_object = cell->wait_object;
940 
941 		if (wait_object == NULL || !cell->waiting) {
942 
943 			continue;
944 		}
945 
946 		diff = difftime(time(NULL), cell->reservation_time);
947 
948 		if (diff > SYNC_ARRAY_TIMEOUT) {
949 			fputs("InnoDB: Warning: a long semaphore wait:\n",
950 			      stderr);
951 			sync_array_cell_print(stderr, cell);
952 			*noticed = TRUE;
953 		}
954 
955 		if (diff > fatal_timeout) {
956 			fatal = TRUE;
957 		}
958 
959 		if (diff > longest_diff) {
960 			longest_diff = diff;
961 			*sema = wait_object;
962 			*waiter = cell->thread;
963 		}
964 	}
965 
966 #undef SYNC_ARRAY_TIMEOUT
967 
968 	return(fatal);
969 }
970 
971 /**********************************************************************//**
972 Prints warnings of long semaphore waits to stderr.
973 @return	TRUE if fatal semaphore wait threshold was exceeded */
974 UNIV_INTERN
975 ibool
sync_array_print_long_waits(os_thread_id_t * waiter,const void ** sema)976 sync_array_print_long_waits(
977 /*========================*/
978 	os_thread_id_t*	waiter,	/*!< out: longest waiting thread */
979 	const void**	sema)	/*!< out: longest-waited-for semaphore */
980 {
981 	ulint		i;
982 	ibool		fatal = FALSE;
983 	ibool		noticed = FALSE;
984 
985 	for (i = 0; i < sync_array_size; ++i) {
986 
987 		sync_array_t*	arr = sync_wait_array[i];
988 
989 		sync_array_enter(arr);
990 
991 		if (sync_array_print_long_waits_low(
992 				arr, waiter, sema, &noticed)) {
993 
994 			fatal = TRUE;
995 		}
996 
997 		sync_array_exit(arr);
998 	}
999 
1000 	if (noticed) {
1001 		ibool	old_val;
1002 
1003 		fprintf(stderr,
1004 			"InnoDB: ###### Starts InnoDB Monitor"
1005 			" for 30 secs to print diagnostic info:\n");
1006 
1007 		old_val = srv_print_innodb_monitor;
1008 
1009 		/* If some crucial semaphore is reserved, then also the InnoDB
1010 		Monitor can hang, and we do not get diagnostics. Since in
1011 		many cases an InnoDB hang is caused by a pwrite() or a pread()
1012 		call hanging inside the operating system, let us print right
1013 		now the values of pending calls of these. */
1014 
1015 		fprintf(stderr,
1016 			"InnoDB: Pending preads %lu, pwrites %lu\n",
1017 			(ulong) os_file_n_pending_preads,
1018 			(ulong) os_file_n_pending_pwrites);
1019 
1020 		srv_print_innodb_monitor = TRUE;
1021 		os_event_set(lock_sys->timeout_event);
1022 
1023 		os_thread_sleep(30000000);
1024 
1025 		srv_print_innodb_monitor = static_cast<my_bool>(old_val);
1026 		fprintf(stderr,
1027 			"InnoDB: ###### Diagnostic info printed"
1028 			" to the standard error stream\n");
1029 	}
1030 
1031 	return(fatal);
1032 }
1033 
1034 /**********************************************************************//**
1035 Prints info of the wait array. */
1036 static
1037 void
sync_array_print_info_low(FILE * file,sync_array_t * arr)1038 sync_array_print_info_low(
1039 /*======================*/
1040 	FILE*		file,	/*!< in: file where to print */
1041 	sync_array_t*	arr)	/*!< in: wait array */
1042 {
1043 	ulint		i;
1044 	ulint		count = 0;
1045 
1046 	fprintf(file,
1047 		"OS WAIT ARRAY INFO: reservation count " ULINTPF "\n",
1048 		arr->res_count);
1049 
1050 	for (i = 0; count < arr->n_reserved; ++i) {
1051 		sync_cell_t*	cell;
1052 
1053 		cell = sync_array_get_nth_cell(arr, i);
1054 
1055 		if (cell->wait_object != NULL) {
1056 			count++;
1057 			sync_array_cell_print(file, cell);
1058 		}
1059 	}
1060 }
1061 
1062 /**********************************************************************//**
1063 Prints info of the wait array. */
1064 static
1065 void
sync_array_print_info(FILE * file,sync_array_t * arr)1066 sync_array_print_info(
1067 /*==================*/
1068 	FILE*		file,	/*!< in: file where to print */
1069 	sync_array_t*	arr)	/*!< in: wait array */
1070 {
1071 	sync_array_enter(arr);
1072 
1073 	sync_array_print_info_low(file, arr);
1074 
1075 	sync_array_exit(arr);
1076 }
1077 
1078 /**********************************************************************//**
1079 Create the primary system wait array(s), they are protected by an OS mutex */
1080 UNIV_INTERN
1081 void
sync_array_init(ulint n_threads)1082 sync_array_init(
1083 /*============*/
1084 	ulint		n_threads)		/*!< in: Number of slots to
1085 						create in all arrays */
1086 {
1087 	ulint		i;
1088 	ulint		n_slots;
1089 
1090 	ut_a(sync_wait_array == NULL);
1091 	ut_a(srv_sync_array_size > 0);
1092 	ut_a(n_threads > 0);
1093 
1094 	sync_array_size = srv_sync_array_size;
1095 
1096 	/* We have to use ut_malloc() because the mutex infrastructure
1097 	hasn't been initialised yet. It is required by mem_alloc() and
1098 	the heap functions. */
1099 
1100 	sync_wait_array = static_cast<sync_array_t**>(
1101 		ut_malloc(sizeof(*sync_wait_array) * sync_array_size));
1102 
1103 	n_slots = 1 + (n_threads - 1) / sync_array_size;
1104 
1105 	for (i = 0; i < sync_array_size; ++i) {
1106 
1107 		sync_wait_array[i] = sync_array_create(n_slots);
1108 	}
1109 }
1110 
1111 /**********************************************************************//**
1112 Close sync array wait sub-system. */
1113 UNIV_INTERN
1114 void
sync_array_close(void)1115 sync_array_close(void)
1116 /*==================*/
1117 {
1118 	ulint		i;
1119 
1120 	for (i = 0; i < sync_array_size; ++i) {
1121 		sync_array_free(sync_wait_array[i]);
1122 	}
1123 
1124 	ut_free(sync_wait_array);
1125 	sync_wait_array = NULL;
1126 }
1127 
1128 /**********************************************************************//**
1129 Print info about the sync array(s). */
1130 UNIV_INTERN
1131 void
sync_array_print(FILE * file)1132 sync_array_print(
1133 /*=============*/
1134 	FILE*		file)		/*!< in/out: Print to this stream */
1135 {
1136 	ulint		i;
1137 
1138 	for (i = 0; i < sync_array_size; ++i) {
1139 		sync_array_print_info(file, sync_wait_array[i]);
1140 	}
1141 
1142 	fprintf(file,
1143 		"OS WAIT ARRAY INFO: signal count " ULINTPF "\n", sg_count);
1144 
1145 }
1146 
1147 /**********************************************************************//**
1148 Get an instance of the sync wait array. */
1149 UNIV_INTERN
1150 sync_array_t*
sync_array_get(void)1151 sync_array_get(void)
1152 /*================*/
1153 {
1154 	ulint		i;
1155 	static ulint	count;
1156 
1157 #ifdef HAVE_ATOMIC_BUILTINS
1158 	i = os_atomic_increment_ulint(&count, 1);
1159 #else
1160 	i = count++;
1161 #endif /* HAVE_ATOMIC_BUILTINS */
1162 
1163 	return(sync_wait_array[i % sync_array_size]);
1164 }
1165