1 /*****************************************************************************
2 
3 Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2008, 2009 Google Inc.
5 Copyright (c) 2009, Percona Inc.
6 
7 Portions of this file contain modifications contributed and copyrighted by
8 Google, Inc. Those modifications are gratefully acknowledged and are described
9 briefly in the InnoDB documentation. The contributions by Google are
10 incorporated with their permission, and subject to the conditions contained in
11 the file COPYING.Google.
12 
13 Portions of this file contain modifications contributed and copyrighted
14 by Percona Inc.. Those modifications are
15 gratefully acknowledged and are described briefly in the InnoDB
16 documentation. The contributions by Percona Inc. are incorporated with
17 their permission, and subject to the conditions contained in the file
18 COPYING.Percona.
19 
20 This program is free software; you can redistribute it and/or modify it under
21 the terms of the GNU General Public License as published by the Free Software
22 Foundation; version 2 of the License.
23 
24 This program is distributed in the hope that it will be useful, but WITHOUT
25 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
26 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
27 
28 You should have received a copy of the GNU General Public License along with
29 this program; if not, write to the Free Software Foundation, Inc.,
30 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
31 
32 *****************************************************************************/
33 
34 /**************************************************//**
35 @file srv/srv0srv.c
36 The database server main program
37 
38 NOTE: SQL Server 7 uses something which the documentation
39 calls user mode scheduled threads (UMS threads). One such
40 thread is usually allocated per processor. Win32
41 documentation does not know any UMS threads, which suggests
42 that the concept is internal to SQL Server 7. It may mean that
43 SQL Server 7 does all the scheduling of threads itself, even
44 in i/o waits. We should maybe modify InnoDB to use the same
45 technique, because thread switches within NT may be too slow.
46 
47 SQL Server 7 also mentions fibers, which are cooperatively
48 scheduled threads. They can boost performance by 5 %,
49 according to the Delaney and Soukup's book.
50 
51 Windows 2000 will have something called thread pooling
52 (see msdn website), which we could possibly use.
53 
54 Another possibility could be to use some very fast user space
55 thread library. This might confuse NT though.
56 
57 Created 10/8/1995 Heikki Tuuri
58 *******************************************************/
59 
60 /* Dummy comment */
61 #include "m_string.h" /* for my_sys.h */
62 #include "my_sys.h" /* DEBUG_SYNC_C */
63 #include "srv0srv.h"
64 
65 #include "ut0mem.h"
66 #include "ut0ut.h"
67 #include "os0proc.h"
68 #include "mem0mem.h"
69 #include "mem0pool.h"
70 #include "sync0sync.h"
71 #include "que0que.h"
72 #include "log0recv.h"
73 #include "pars0pars.h"
74 #include "usr0sess.h"
75 #include "lock0lock.h"
76 #include "trx0purge.h"
77 #include "ibuf0ibuf.h"
78 #include "buf0flu.h"
79 #include "buf0lru.h"
80 #include "btr0sea.h"
81 #include "dict0load.h"
82 #include "dict0boot.h"
83 #include "srv0start.h"
84 #include "row0mysql.h"
85 #include "ha_prototypes.h"
86 #include "trx0i_s.h"
87 #include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */
88 #include "read0read.h"
89 #include "mysql/plugin.h"
90 #include "mysql/service_thd_wait.h"
91 
/* The following counter is incremented whenever there is some user activity
in the server */
UNIV_INTERN ulint	srv_activity_count	= 0;

/* The following is the maximum allowed duration of a lock wait.
NOTE(review): units are not visible here — presumably seconds; confirm at
the use site in the error monitor thread. */
UNIV_INTERN ulint	srv_fatal_semaphore_wait_threshold = 600;

/* How much data manipulation language (DML) statements need to be delayed,
in microseconds, in order to reduce the lagging of the purge thread. */
UNIV_INTERN ulint	srv_dml_needed_delay = 0;

/* TRUE while the corresponding background thread is running; these are
set/cleared by the threads themselves. */
UNIV_INTERN ibool	srv_lock_timeout_active = FALSE;
UNIV_INTERN ibool	srv_monitor_active = FALSE;
UNIV_INTERN ibool	srv_error_monitor_active = FALSE;

/* Text describing the current operation of the main (master) thread */
UNIV_INTERN const char*	srv_main_thread_op_info = "";

/** Prefix used by MySQL to indicate pre-5.1 table name encoding */
UNIV_INTERN const char	srv_mysql50_table_name_prefix[9] = "#mysql50#";

/* Server parameters which are read from the initfile */

/* The following three are dir paths which are catenated before file
names, where the file name itself may also contain a path */

UNIV_INTERN char*	srv_data_home	= NULL;
#ifdef UNIV_LOG_ARCHIVE
UNIV_INTERN char*	srv_arch_dir	= NULL;
#endif /* UNIV_LOG_ARCHIVE */

/** store to its own file each table created by a user; data
dictionary tables are in the system tablespace 0 */
UNIV_INTERN my_bool	srv_file_per_table;
/** The file format to use on new *.ibd files. */
UNIV_INTERN ulint	srv_file_format = 0;
/** Whether to check file format during startup.  A value of
DICT_TF_FORMAT_MAX + 1 means no checking ie. FALSE.  The default is to
set it to the highest format we support. */
UNIV_INTERN ulint	srv_max_file_format_at_startup = DICT_TF_FORMAT_MAX;

#if DICT_TF_FORMAT_51
# error "DICT_TF_FORMAT_51 must be 0!"
#endif
/** Place locks to records only i.e. do not use next-key locking except
on duplicate key checking and foreign key checking */
UNIV_INTERN ibool	srv_locks_unsafe_for_binlog = FALSE;

/* If this flag is TRUE, then we will use the native aio of the
OS (provided we compiled Innobase with it in), otherwise we will
use simulated aio we build below with threads.
Currently we support native aio on windows and linux */
UNIV_INTERN my_bool	srv_use_native_aio = TRUE;
144 
#ifdef __WIN__
/* Windows native condition variables. We use runtime loading / function
pointers, because they are not available on Windows Server 2003 and
Windows XP/2000.

We use condition variables for events on Windows if possible, even if os_event
resembles Windows kernel event object well API-wise. The reason is
performance, kernel objects are heavyweights and WaitForSingleObject() is a
performance killer causing calling thread to context switch. Besides, Innodb
is preallocating large number (often millions) of os_events. With kernel event
objects it takes a big chunk out of non-paged pool, which is better suited
for tasks like IO than for storing idle event objects. */
UNIV_INTERN ibool	srv_use_native_conditions = FALSE;
#endif /* __WIN__ */

/* Number of data files, their names and sizes, as configured at startup */
UNIV_INTERN ulint	srv_n_data_files = 0;
UNIV_INTERN char**	srv_data_file_names = NULL;
/* size in database pages */
UNIV_INTERN ulint*	srv_data_file_sizes = NULL;

/* if TRUE, then we auto-extend the last data file */
UNIV_INTERN ibool	srv_auto_extend_last_data_file	= FALSE;
/* if != 0, this tells the max size auto-extending may increase the
last data file size */
UNIV_INTERN ulint	srv_last_file_size_max	= 0;
/* If the last data file is auto-extended, we add this
many pages to it at a time */
UNIV_INTERN ulong	srv_auto_extend_increment = 8;
UNIV_INTERN ulint*	srv_data_file_is_raw_partition = NULL;

/* If the following is TRUE we do not allow inserts etc. This protects
the user from forgetting the 'newraw' keyword to my.cnf */

UNIV_INTERN ibool	srv_created_new_raw	= FALSE;

UNIV_INTERN char**	srv_log_group_home_dirs = NULL;

UNIV_INTERN ulint	srv_n_log_groups	= ULINT_MAX;
UNIV_INTERN ulint	srv_n_log_files		= ULINT_MAX;
/* size in database pages */
UNIV_INTERN ulint	srv_log_file_size	= ULINT_MAX;
/* size in database pages */
UNIV_INTERN ulint	srv_log_buffer_size	= ULINT_MAX;
UNIV_INTERN ulong	srv_flush_log_at_trx_commit = 1;

/* Try to flush dirty pages so as to avoid IO bursts at
the checkpoints. */
UNIV_INTERN char	srv_adaptive_flushing	= TRUE;

/** Maximum number of times allowed to conditionally acquire
mutex before switching to blocking wait on the mutex */
#define MAX_MUTEX_NOWAIT	20

/** Check whether the number of failed nonblocking mutex
acquisition attempts exceeds maximum allowed value. If so,
srv_printf_innodb_monitor() will request mutex acquisition
with mutex_enter(), which will wait until it gets the mutex. */
#define MUTEX_NOWAIT(mutex_skipped)	((mutex_skipped) < MAX_MUTEX_NOWAIT)

/** The sort order table of the MySQL latin1_swedish_ci character set
collation */
UNIV_INTERN const byte*	srv_latin1_ordering;

/* use os/external memory allocator */
UNIV_INTERN my_bool	srv_use_sys_malloc	= TRUE;
/* requested size in kilobytes */
UNIV_INTERN ulint	srv_buf_pool_size	= ULINT_MAX;
/* requested number of buffer pool instances */
UNIV_INTERN ulint       srv_buf_pool_instances  = 1;
/* previously requested size */
UNIV_INTERN ulint	srv_buf_pool_old_size;
/* current size in kilobytes */
UNIV_INTERN ulint	srv_buf_pool_curr_size	= 0;
/* size in bytes */
UNIV_INTERN ulint	srv_mem_pool_size	= ULINT_MAX;
UNIV_INTERN ulint	srv_lock_table_size	= ULINT_MAX;

/* This parameter is deprecated. Use srv_n_io_[read|write]_threads
instead. */
UNIV_INTERN ulint	srv_n_file_io_threads	= ULINT_MAX;
UNIV_INTERN ulint	srv_n_read_io_threads	= ULINT_MAX;
UNIV_INTERN ulint	srv_n_write_io_threads	= ULINT_MAX;

/* Switch to enable random read ahead. */
UNIV_INTERN my_bool	srv_random_read_ahead	= FALSE;
/* User settable value of the number of pages that must be present
in the buffer cache and accessed sequentially for InnoDB to trigger a
readahead request. */
UNIV_INTERN ulong	srv_read_ahead_threshold	= 56;
234 
#ifdef UNIV_LOG_ARCHIVE
UNIV_INTERN ibool		srv_log_archive_on	= FALSE;
UNIV_INTERN ibool		srv_archive_recovery	= 0;
UNIV_INTERN ib_uint64_t	srv_archive_recovery_limit_lsn;
#endif /* UNIV_LOG_ARCHIVE */

/* This parameter is used to throttle the number of insert buffers that are
merged in a batch. By increasing this parameter on a faster disk you can
possibly reduce the number of I/O operations performed to complete the
merge operation. The value of this parameter is used as is by the
background loop when the system is idle (low load), on a busy system
the parameter is scaled down by a factor of 4, this is to avoid putting
a heavier load on the I/O sub system. */

UNIV_INTERN ulong	srv_insert_buffer_batch_size = 20;

/* The file flush method, as a string from the configuration and as the
decoded per-platform enum value */
UNIV_INTERN char*	srv_file_flush_method_str = NULL;
UNIV_INTERN ulint	srv_unix_file_flush_method = SRV_UNIX_FSYNC;
UNIV_INTERN ulint	srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;

UNIV_INTERN ulint	srv_max_n_open_files	  = 300;

/* Number of IO operations per second the server can do */
UNIV_INTERN ulong	srv_io_capacity         = 200;

/* The InnoDB main thread tries to keep the ratio of modified pages
in the buffer pool to all database pages in the buffer pool smaller than
the following number. But it is not guaranteed that the value stays below
that during a time of heavy update/insert activity. */

UNIV_INTERN ulong	srv_max_buf_pool_modified_pct	= 75;

/* the number of purge threads to use from the worker pool (currently 0 or 1).*/
UNIV_INTERN ulong srv_n_purge_threads = 0;

/* the number of pages to purge in one batch */
UNIV_INTERN ulong srv_purge_batch_size = 20;

/* the number of rollback segments to use */
UNIV_INTERN ulong srv_rollback_segments = TRX_SYS_N_RSEGS;

/* variable counts amount of data read in total (in bytes) */
UNIV_INTERN ulint srv_data_read = 0;

/* Internal setting for "innodb_stats_method". Decides how InnoDB treats
NULL value when collecting statistics. By default, it is set to
SRV_STATS_NULLS_EQUAL(0), ie. all NULL value are treated equal */
UNIV_INTERN ulong srv_innodb_stats_method = SRV_STATS_NULLS_EQUAL;

/* here we count the amount of data written in total (in bytes) */
UNIV_INTERN ulint srv_data_written = 0;

/* the number of the log write requests done */
UNIV_INTERN ulint srv_log_write_requests = 0;

/* the number of physical writes to the log performed */
UNIV_INTERN ulint srv_log_writes = 0;

/* amount of data written to the log files in bytes */
UNIV_INTERN ulint srv_os_log_written = 0;

/* amount of writes being done to the log files */
UNIV_INTERN ulint srv_os_log_pending_writes = 0;

/* we increase this counter when we don't have enough space in the
log buffer and have to flush it */
UNIV_INTERN ulint srv_log_waits = 0;

/* this variable counts the amount of times, when the doublewrite buffer
was flushed */
UNIV_INTERN ulint srv_dblwr_writes = 0;

/* here we store the number of pages that have been flushed to the
doublewrite buffer */
UNIV_INTERN ulint srv_dblwr_pages_written = 0;

/* in this variable we store the number of write requests issued */
UNIV_INTERN ulint srv_buf_pool_write_requests = 0;

/* here we store the number of times when we had to wait for a free page
in the buffer pool. It happens when the buffer pool is full and we need
to make a flush, in order to be able to read or create a page. */
UNIV_INTERN ulint srv_buf_pool_wait_free = 0;

/* variable to count the number of pages that were written from buffer
pool to the disk */
UNIV_INTERN ulint srv_buf_pool_flushed = 0;

/** Number of buffer pool reads that led to the
reading of a disk page */
UNIV_INTERN ulint srv_buf_pool_reads = 0;

/* structure to pass status variables to MySQL */
UNIV_INTERN export_struc export_vars;

/* If the following is != 0 we do not allow inserts etc. This protects
the user from forgetting the innodb_force_recovery keyword to my.cnf */

UNIV_INTERN ulint	srv_force_recovery	= 0;
334 /*-----------------------*/
/* We are prepared for a situation that we have this many threads waiting for
a semaphore inside InnoDB. innobase_start_or_create_for_mysql() sets the
value. */

UNIV_INTERN ulint	srv_max_n_threads	= 0;

/* The following controls how many threads we let inside InnoDB concurrently:
threads waiting for locks are not counted into the number because otherwise
we could get a deadlock. MySQL creates a thread for each user session, and
semaphore contention and convoy problems can occur without this restriction.
Value 10 should be good if there are less than 4 processors + 4 disks in the
computer. Bigger computers need bigger values. Value 0 will disable the
concurrency check. */

UNIV_INTERN ulong	srv_thread_concurrency	= 0;

/* this mutex protects srv_conc data structures */
UNIV_INTERN os_fast_mutex_t	srv_conc_mutex;
/* number of transactions that have declared_to_be_inside_innodb set.
It used to be a non-error for this value to drop below zero temporarily.
This is no longer true. We'll, however, keep the lint datatype to add
assertions to catch any corner cases that we may have missed. */
UNIV_INTERN lint	srv_conc_n_threads	= 0;
/* number of OS threads waiting in the FIFO for a permission to enter
InnoDB */
UNIV_INTERN ulint	srv_conc_n_waiting_threads = 0;

/* print all user-level transactions deadlocks to mysqld stderr */
UNIV_INTERN my_bool	srv_print_all_deadlocks = FALSE;
364 
/* Wait slot used by the thread-concurrency FIFO (srv_conc_queue below):
each OS thread waiting for permission to enter InnoDB parks on the event
of one of these slots. */
typedef struct srv_conc_slot_struct	srv_conc_slot_t;
struct srv_conc_slot_struct{
	os_event_t			event;		/*!< event to wait on */
	ibool				reserved;	/*!< TRUE if slot
							reserved */
	ibool				wait_ended;	/*!< TRUE when another
							thread has already set
							the event and the
							thread in this slot is
							free to proceed; but
							reserved may still be
							TRUE at that point */
	UT_LIST_NODE_T(srv_conc_slot_t)	srv_conc_queue;	/*!< queue node */
};
379 
/* queue of threads waiting to get in */
UNIV_INTERN UT_LIST_BASE_NODE_T(srv_conc_slot_t)	srv_conc_queue;
/* array of wait slots */
UNIV_INTERN srv_conc_slot_t* srv_conc_slots;

/* Number of times a thread is allowed to enter InnoDB within the same
SQL query after it has once got the ticket at srv_conc_enter_innodb */
#define SRV_FREE_TICKETS_TO_ENTER srv_n_free_tickets_to_enter
#define SRV_THREAD_SLEEP_DELAY srv_thread_sleep_delay
/*-----------------------*/
/* If the following is set to 1 then we do not run purge and insert buffer
merge to completion before shutdown. If it is set to 2, do not even flush the
buffer pool to data files at the shutdown: we effectively 'crash'
InnoDB (but lose no committed transactions). */
UNIV_INTERN ulint	srv_fast_shutdown	= 0;

/* Generate an innodb_status.<pid> file */
UNIV_INTERN ibool	srv_innodb_status	= FALSE;

/* When estimating number of different key values in an index, sample
this many index pages */
UNIV_INTERN unsigned long long	srv_stats_sample_pages = 8;

UNIV_INTERN ibool	srv_use_doublewrite_buf	= TRUE;
UNIV_INTERN ibool	srv_use_checksums = TRUE;

UNIV_INTERN ulong	srv_replication_delay		= 0;

/*-------------------------------------------*/
/* Spin-wait and concurrency-ticket tuning knobs; the two macros above
alias the ticket and sleep-delay variables. */
UNIV_INTERN ulong	srv_n_spin_wait_rounds	= 30;
UNIV_INTERN ulong	srv_n_free_tickets_to_enter = 500;
UNIV_INTERN ulong	srv_thread_sleep_delay = 10000;
UNIV_INTERN ulong	srv_spin_wait_delay	= 6;
UNIV_INTERN ibool	srv_priority_boost	= TRUE;
414 
#ifdef UNIV_DEBUG
/* Debug-build-only switches that enable extra tracing output. */
UNIV_INTERN ibool	srv_print_thread_releases	= FALSE;
UNIV_INTERN ibool	srv_print_lock_waits		= FALSE;
UNIV_INTERN ibool	srv_print_buf_io		= FALSE;
UNIV_INTERN ibool	srv_print_log_io		= FALSE;
UNIV_INTERN ibool	srv_print_latch_waits		= FALSE;
#endif /* UNIV_DEBUG */

/* Cumulative counters of row operations performed by the server. */
UNIV_INTERN ulint		srv_n_rows_inserted		= 0;
UNIV_INTERN ulint		srv_n_rows_updated		= 0;
UNIV_INTERN ulint		srv_n_rows_deleted		= 0;
UNIV_INTERN ulint		srv_n_rows_read			= 0;

/* Snapshots of the row counters above, taken the last time the monitor
output was produced — presumably used to compute per-second rates;
confirm at the use site. */
static ulint	srv_n_rows_inserted_old		= 0;
static ulint	srv_n_rows_updated_old		= 0;
static ulint	srv_n_rows_deleted_old		= 0;
static ulint	srv_n_rows_read_old		= 0;

/* Lock-wait statistics: total waits, waits in progress, cumulative and
maximum wait times. */
UNIV_INTERN ulint		srv_n_lock_wait_count		= 0;
UNIV_INTERN ulint		srv_n_lock_wait_current_count	= 0;
UNIV_INTERN ib_int64_t	srv_n_lock_wait_time		= 0;
UNIV_INTERN ulint		srv_n_lock_max_wait_time	= 0;

/* Number of truncated status writes — presumably incremented when monitor
output does not fit its buffer; confirm at the use site. */
UNIV_INTERN ulint		srv_truncated_status_writes	= 0;

/*
  Set the following to 0 if you want InnoDB to write messages on
  stderr on startup/shutdown
*/
UNIV_INTERN ibool	srv_print_verbose_log		= TRUE;
UNIV_INTERN ibool	srv_print_innodb_monitor	= FALSE;
UNIV_INTERN ibool	srv_print_innodb_lock_monitor	= FALSE;
UNIV_INTERN ibool	srv_print_innodb_tablespace_monitor = FALSE;
UNIV_INTERN ibool	srv_print_innodb_table_monitor = FALSE;
457 
458 UNIV_INTERN mutex_t	srv_innodb_monitor_mutex;
459 
460 /* Mutex for locking srv_monitor_file */
461 UNIV_INTERN mutex_t	srv_monitor_file_mutex;
462 
463 #ifdef UNIV_PFS_MUTEX
464 /* Key to register kernel_mutex with performance schema */
465 UNIV_INTERN mysql_pfs_key_t	kernel_mutex_key;
466 /* Key to register srv_innodb_monitor_mutex with performance schema */
467 UNIV_INTERN mysql_pfs_key_t	srv_innodb_monitor_mutex_key;
468 /* Key to register srv_monitor_file_mutex with performance schema */
469 UNIV_INTERN mysql_pfs_key_t	srv_monitor_file_mutex_key;
470 /* Key to register srv_dict_tmpfile_mutex with performance schema */
471 UNIV_INTERN mysql_pfs_key_t	srv_dict_tmpfile_mutex_key;
472 /* Key to register the mutex with performance schema */
473 UNIV_INTERN mysql_pfs_key_t	srv_misc_tmpfile_mutex_key;
474 #endif /* UNIV_PFS_MUTEX */
475 
476 /* Temporary file for innodb monitor output */
477 UNIV_INTERN FILE*	srv_monitor_file;
478 /* Mutex for locking srv_dict_tmpfile.
479 This mutex has a very high rank; threads reserving it should not
480 be holding any InnoDB latches. */
481 UNIV_INTERN mutex_t	srv_dict_tmpfile_mutex;
482 /* Temporary file for output from the data dictionary */
483 UNIV_INTERN FILE*	srv_dict_tmpfile;
484 /* Mutex for locking srv_misc_tmpfile.
485 This mutex has a very low rank; threads reserving it should not
486 acquire any further latches or sleep before releasing this one. */
487 UNIV_INTERN mutex_t	srv_misc_tmpfile_mutex;
488 /* Temporary file for miscellanous diagnostic output */
489 UNIV_INTERN FILE*	srv_misc_tmpfile;
490 
491 UNIV_INTERN ulint	srv_main_thread_process_no	= 0;
492 UNIV_INTERN ulint	srv_main_thread_id		= 0;
493 
494 /* The following count work done by srv_master_thread. */
495 
496 /* Iterations by the 'once per second' loop. */
497 static ulint   srv_main_1_second_loops		= 0;
498 /* Calls to sleep by the 'once per second' loop. */
499 static ulint   srv_main_sleeps			= 0;
500 /* Iterations by the 'once per 10 seconds' loop. */
501 static ulint   srv_main_10_second_loops		= 0;
502 /* Iterations of the loop bounded by the 'background_loop' label. */
503 static ulint   srv_main_background_loops	= 0;
504 /* Iterations of the loop bounded by the 'flush_loop' label. */
505 static ulint   srv_main_flush_loops		= 0;
506 /* Log writes involving flush. */
507 static ulint   srv_log_writes_and_flush		= 0;
508 
509 /* This is only ever touched by the master thread. It records the
510 time when the last flush of log file has happened. The master
511 thread ensures that we flush the log files at least once per
512 second. */
513 static time_t	srv_last_log_flush_time;
514 
515 /* The master thread performs various tasks based on the current
516 state of IO activity and the level of IO utilization is past
517 intervals. Following macros define thresholds for these conditions. */
518 #define SRV_PEND_IO_THRESHOLD	(PCT_IO(3))
519 #define SRV_RECENT_IO_ACTIVITY	(PCT_IO(5))
520 #define SRV_PAST_IO_ACTIVITY	(PCT_IO(200))
521 
522 /*
523 	IMPLEMENTATION OF THE SERVER MAIN PROGRAM
524 	=========================================
525 
526 There is the following analogue between this database
527 server and an operating system kernel:
528 
529 DB concept			equivalent OS concept
530 ----------			---------------------
531 transaction		--	process;
532 
533 query thread		--	thread;
534 
535 lock			--	semaphore;
536 
537 transaction set to
538 the rollback state	--	kill signal delivered to a process;
539 
540 kernel			--	kernel;
541 
542 query thread execution:
543 (a) without kernel mutex
544 reserved		--	process executing in user mode;
545 (b) with kernel mutex reserved
546 			--	process executing in kernel mode;
547 
548 The server is controlled by a master thread which runs at
549 a priority higher than normal, that is, higher than user threads.
550 It sleeps most of the time, and wakes up, say, every 300 milliseconds,
551 to check whether there is anything happening in the server which
552 requires intervention of the master thread. Such situations may be,
553 for example, when flushing of dirty blocks is needed in the buffer
554 pool or old version of database rows have to be cleaned away.
555 
556 The threads which we call user threads serve the queries of
557 the clients and input from the console of the server.
558 They run at normal priority. The server may have several
559 communications endpoints. A dedicated set of user threads waits
560 at each of these endpoints ready to receive a client request.
561 Each request is taken by a single user thread, which then starts
562 processing and, when the result is ready, sends it to the client
563 and returns to wait at the same endpoint the thread started from.
564 
565 So, we do not have dedicated communication threads listening at
566 the endpoints and dealing the jobs to dedicated worker threads.
Our architecture saves one thread switch per request, compared
568 to the solution with dedicated communication threads
569 which amounts to 15 microseconds on 100 MHz Pentium
570 running NT. If the client
571 is communicating over a network, this saving is negligible, but
572 if the client resides in the same machine, maybe in an SMP machine
573 on a different processor from the server thread, the saving
574 can be important as the threads can communicate over shared
575 memory with an overhead of a few microseconds.
576 
577 We may later implement a dedicated communication thread solution
578 for those endpoints which communicate over a network.
579 
580 Our solution with user threads has two problems: for each endpoint
581 there has to be a number of listening threads. If there are many
582 communication endpoints, it may be difficult to set the right number
583 of concurrent threads in the system, as many of the threads
584 may always be waiting at less busy endpoints. Another problem
585 is queuing of the messages, as the server internally does not
586 offer any queue for jobs.
587 
588 Another group of user threads is intended for splitting the
589 queries and processing them in parallel. Let us call these
590 parallel communication threads. These threads are waiting for
591 parallelized tasks, suspended on event semaphores.
592 
593 A single user thread waits for input from the console,
594 like a command to shut the database.
595 
596 Utility threads are a different group of threads which takes
597 care of the buffer pool flushing and other, mainly background
598 operations, in the server.
599 Some of these utility threads always run at a lower than normal
600 priority, so that they are always in background. Some of them
601 may dynamically boost their priority by the pri_adjust function,
602 even to higher than normal priority, if their task becomes urgent.
603 The running of utilities is controlled by high- and low-water marks
604 of urgency. The urgency may be measured by the number of dirty blocks
605 in the buffer pool, in the case of the flush thread, for example.
When the high-water mark is exceeded, a utility starts running, until
the urgency drops under the low-water mark. Then the utility thread
suspends itself to wait for an event. The master thread is
609 responsible of signaling this event when the utility thread is
610 again needed.
611 
612 For each individual type of utility, some threads always remain
613 at lower than normal priority. This is because pri_adjust is implemented
614 so that the threads at normal or higher priority control their
615 share of running time by calling sleep. Thus, if the load of the
system suddenly drops, these threads cannot necessarily utilize
617 the system fully. The background priority threads make up for this,
618 starting to run when the load drops.
619 
620 When there is no activity in the system, also the master thread
621 suspends itself to wait for an event making
622 the server totally silent. The responsibility to signal this
623 event is on the user thread which again receives a message
624 from a client.
625 
626 There is still one complication in our server design. If a
627 background utility thread obtains a resource (e.g., mutex) needed by a user
628 thread, and there is also some other user activity in the system,
629 the user thread may have to wait indefinitely long for the
630 resource, as the OS does not schedule a background thread if
631 there is some other runnable user thread. This problem is called
632 priority inversion in real-time programming.
633 
634 One solution to the priority inversion problem would be to
635 keep record of which thread owns which resource and
636 in the above case boost the priority of the background thread
637 so that it will be scheduled and it can release the resource.
638 This solution is called priority inheritance in real-time programming.
639 A drawback of this solution is that the overhead of acquiring a mutex
640 increases slightly, maybe 0.2 microseconds on a 100 MHz Pentium, because
641 the thread has to call os_thread_get_curr_id.
642 This may be compared to 0.5 microsecond overhead for a mutex lock-unlock
643 pair. Note that the thread
644 cannot store the information in the resource, say mutex, itself,
645 because competing threads could wipe out the information if it is
646 stored before acquiring the mutex, and if it stored afterwards,
647 the information is outdated for the time of one machine instruction,
648 at least. (To be precise, the information could be stored to
649 lock_word in mutex if the machine supports atomic swap.)
650 
651 The above solution with priority inheritance may become actual in the
652 future, but at the moment we plan to implement a more coarse solution,
653 which could be called a global priority inheritance. If a thread
654 has to wait for a long time, say 300 milliseconds, for a resource,
655 we just guess that it may be waiting for a resource owned by a background
656 thread, and boost the priority of all runnable background threads
657 to the normal level. The background threads then themselves adjust
658 their fixed priority back to background after releasing all resources
659 they had (or, at some fixed points in their program code).
660 
661 What is the performance of the global priority inheritance solution?
662 We may weigh the length of the wait time 300 milliseconds, during
663 which the system processes some other thread
664 to the cost of boosting the priority of each runnable background
665 thread, rescheduling it, and lowering the priority again.
666 On 100 MHz Pentium + NT this overhead may be of the order 100
667 microseconds per thread. So, if the number of runnable background
668 threads is not very big, say < 100, the cost is tolerable.
669 Utility threads probably will access resources used by
670 user threads not very often, so collisions of user threads
671 to preempted utility threads should not happen very often.
672 
673 The thread table contains
674 information of the current status of each thread existing in the system,
675 and also the event semaphores used in suspending the master thread
676 and utility and parallel communication threads when they have nothing to do.
677 The thread table can be seen as an analogue to the process table
678 in a traditional Unix implementation.
679 
680 The thread table is also used in the global priority inheritance
681 scheme. This brings in one additional complication: threads accessing
682 the thread table must have at least normal fixed priority,
683 because the priority inheritance solution does not work if a background
684 thread is preempted while possessing the mutex protecting the thread table.
685 So, if a thread accesses the thread table, its priority has to be
686 boosted at least to normal. This priority requirement can be seen similar to
687 the privileged mode used when processing the kernel calls in traditional
688 Unix.*/
689 
/* Thread slot in the thread table (see the design notes above): holds
the status of one thread known to the server and the event used to
suspend/resume it. */
struct srv_slot_struct{
	unsigned	type:1;		/*!< thread type: user, utility etc. */
	unsigned	in_use:1;	/*!< TRUE if this slot is in use */
	unsigned	suspended:1;	/*!< TRUE if the thread is waiting
					for the event of this slot */
	ib_time_t	suspend_time;	/*!< time when the thread was
					suspended */
	os_event_t	event;		/*!< event used in suspending the
					thread when it has nothing to do */
	que_thr_t*	thr;		/*!< suspended query thread (only
					used for MySQL threads) */
};
703 
/* Table for MySQL threads where they will be suspended to wait for locks */
UNIV_INTERN srv_slot_t*	srv_mysql_table = NULL;

/* The following events are created in srv_init().
srv_lock_timeout_thread_event is set in srv_suspend_mysql_thread() to
wake the lock wait timeout monitor; the waiters of the other events are
defined elsewhere in this file. */
UNIV_INTERN os_event_t	srv_timeout_event;

UNIV_INTERN os_event_t	srv_monitor_event;

UNIV_INTERN os_event_t	srv_error_event;

UNIV_INTERN os_event_t	srv_lock_timeout_thread_event;

/* Central server state; allocated in srv_init(), freed in srv_free() */
UNIV_INTERN srv_sys_t*	srv_sys	= NULL;

/* padding to prevent other memory update hotspots from residing on
the same memory cache line */
UNIV_INTERN byte	srv_pad1[64];
/* mutex protecting the server, trx structs, query threads, and lock table */
UNIV_INTERN mutex_t*	kernel_mutex_temp;
/* padding to prevent other memory update hotspots from residing on
the same memory cache line */
UNIV_INTERN byte	srv_pad2[64];
725 
726 #if 0
727 /* The following three values measure the urgency of the jobs of
728 buffer, version, and insert threads. They may vary from 0 - 1000.
729 The server mutex protects all these variables. The low-water values
730 tell that the server can acquiesce the utility when the value
731 drops below this low-water mark. */
732 
733 static ulint	srv_meter[SRV_MASTER + 1];
734 static ulint	srv_meter_low_water[SRV_MASTER + 1];
735 static ulint	srv_meter_high_water[SRV_MASTER + 1];
736 static ulint	srv_meter_high_water2[SRV_MASTER + 1];
737 static ulint	srv_meter_foreground[SRV_MASTER + 1];
738 #endif
739 
/* The following values give info about the activity going on in
the database. They are protected by the server mutex. The arrays
are indexed by the type of the thread. */

UNIV_INTERN ulint	srv_n_threads_active[SRV_MASTER + 1];
					/* number of threads of each type not
					currently suspended: decremented in
					srv_suspend_thread(), incremented in
					srv_release_threads() */
UNIV_INTERN ulint	srv_n_threads[SRV_MASTER + 1];
					/* number of existing threads of each
					type; summed by srv_get_n_threads() */
746 
/*********************************************************************//**
Asynchronous purge thread.  (Function prototype; the definition appears
elsewhere in this file.)
@return	a dummy parameter */
UNIV_INTERN
os_thread_ret_t
srv_purge_thread(
/*=============*/
	void*	arg __attribute__((unused))); /*!< in: a dummy parameter
					      required by os_thread_create */
756 
/***********************************************************************
Prints counters for work done by srv_master_thread. */
static
void
srv_print_master_thread_info(
/*=========================*/
	FILE  *file)    /* in: output stream */
{
	/* Iteration counts of the master thread's main loop, broken down
	by loop kind, followed by its log flush/write count.  The counters
	are global variables maintained by the master thread itself. */
	fprintf(file, "srv_master_thread loops: %lu 1_second, %lu sleeps, "
		"%lu 10_second, %lu background, %lu flush\n",
		srv_main_1_second_loops, srv_main_sleeps,
		srv_main_10_second_loops, srv_main_background_loops,
		srv_main_flush_loops);
	fprintf(file, "srv_master_thread log flush and writes: %lu\n",
		      srv_log_writes_and_flush);
}
773 
/*********************************************************************//**
Sets the info describing an i/o thread current state.  The string is
stored by pointer, so it must remain valid (normally a string literal). */
UNIV_INTERN
void
srv_set_io_thread_op_info(
/*======================*/
	ulint		i,	/*!< in: the 'segment' of the i/o thread */
	const char*	str)	/*!< in: constant char string describing the
				state */
{
	/* Hard assertion: an out-of-range segment would corrupt memory. */
	ut_a(i < SRV_MAX_N_IO_THREADS);

	srv_io_thread_op_info[i] = str;
}
788 
/*********************************************************************//**
Accessor function to get pointer to n'th slot in the server thread
table.  The caller must hold the kernel mutex; the index is checked
with a hard assertion.
@return	pointer to the slot */
static
srv_slot_t*
srv_table_get_nth_slot(
/*===================*/
	ulint	index)		/*!< in: index of the slot */
{
	ut_ad(mutex_own(&kernel_mutex));
	ut_a(index < OS_THREAD_MAX_N);

	return(srv_sys->threads + index);
}
804 
805 /*********************************************************************//**
806 Gets the number of threads in the system.
807 @return	sum of srv_n_threads[] */
808 UNIV_INTERN
809 ulint
srv_get_n_threads(void)810 srv_get_n_threads(void)
811 /*===================*/
812 {
813 	ulint	i;
814 	ulint	n_threads	= 0;
815 
816 	mutex_enter(&kernel_mutex);
817 
818 	for (i = 0; i < SRV_MASTER + 1; i++) {
819 
820 		n_threads += srv_n_threads[i];
821 	}
822 
823 	mutex_exit(&kernel_mutex);
824 
825 	return(n_threads);
826 }
827 
#ifdef UNIV_DEBUG
/*********************************************************************//**
Validates the type of a thread table slot.  Debug builds only; any value
other than SRV_WORKER or SRV_MASTER triggers ut_error.
@return TRUE if ok */
static
ibool
srv_thread_type_validate(
/*=====================*/
	enum srv_thread_type	type)	/*!< in: thread type */
{
	if (type == SRV_WORKER || type == SRV_MASTER) {

		return(TRUE);
	}

	/* Unknown thread type: abort in debug builds. */
	ut_error;
	return(FALSE);
}
#endif /* UNIV_DEBUG */
847 
/*********************************************************************//**
Gets the type of a thread table slot.  The one-bit type field is widened
back to the enum; debug builds validate the result.
@return thread type */
static
enum srv_thread_type
srv_slot_get_type(
/*==============*/
	const srv_slot_t*	slot)	/*!< in: thread slot */
{
	enum srv_thread_type	type	= (enum srv_thread_type) slot->type;
	ut_ad(srv_thread_type_validate(type));
	return(type);
}
861 
/*********************************************************************//**
Reserves a slot in the thread table for the current thread.
NOTE! The server mutex has to be reserved by the caller!
@return	reserved slot */
static
srv_slot_t*
srv_table_reserve_slot(
/*===================*/
	enum srv_thread_type	type)	/*!< in: type of the thread */
{
	srv_slot_t*	slot;
	ulint		i;

	ut_ad(srv_thread_type_validate(type));
	ut_ad(mutex_own(&kernel_mutex));

	/* Linear scan for the first free slot.  If every slot is in use,
	srv_table_get_nth_slot() fails its ut_a() bounds assertion once
	i reaches OS_THREAD_MAX_N; there is no graceful error return. */
	i = 0;
	slot = srv_table_get_nth_slot(i);

	while (slot->in_use) {
		i++;
		slot = srv_table_get_nth_slot(i);
	}

	slot->in_use = TRUE;
	slot->suspended = FALSE;
	slot->type = type;
	/* type is stored in a one-bit field; check it round-trips */
	ut_ad(srv_slot_get_type(slot) == type);

	return(slot);
}
893 
/*********************************************************************//**
Suspends the calling thread to wait for the event in its thread slot.
NOTE! The server mutex has to be reserved by the caller!  This function
only marks the slot suspended and resets the event; the actual
os_event_wait() is done by the caller after releasing the mutex. */
static
void
srv_suspend_thread(
/*===============*/
	srv_slot_t*	slot)	/*!< in/out: thread slot */
{
	enum srv_thread_type	type;

	ut_ad(mutex_own(&kernel_mutex));
	ut_ad(slot->in_use);
	ut_ad(!slot->suspended);

	if (srv_print_thread_releases) {
		fprintf(stderr,
			"Suspending thread %lu to slot %lu\n",
			(ulong) os_thread_get_curr_id(),
			(ulong) (slot - srv_sys->threads));
	}

	type = srv_slot_get_type(slot);

	slot->suspended = TRUE;

	ut_ad(srv_n_threads_active[type] > 0);

	srv_n_threads_active[type]--;

	/* Reset the event while still holding the kernel mutex, so a
	concurrent srv_release_threads() cannot set it before the reset
	and lose the wakeup. */
	os_event_reset(slot->event);
}
926 
927 /*********************************************************************//**
928 Releases threads of the type given from suspension in the thread table.
929 NOTE! The server mutex has to be reserved by the caller!
930 @return number of threads released: this may be less than n if not
931 enough threads were suspended at the moment */
932 UNIV_INTERN
933 ulint
srv_release_threads(enum srv_thread_type type,ulint n)934 srv_release_threads(
935 /*================*/
936 	enum srv_thread_type	type,	/*!< in: thread type */
937 	ulint			n)	/*!< in: number of threads to release */
938 {
939 	srv_slot_t*	slot;
940 	ulint		i;
941 	ulint		count	= 0;
942 
943 	ut_ad(srv_thread_type_validate(type));
944 	ut_ad(n > 0);
945 	ut_ad(mutex_own(&kernel_mutex));
946 
947 	for (i = 0; i < OS_THREAD_MAX_N; i++) {
948 
949 		slot = srv_table_get_nth_slot(i);
950 
951 		if (slot->in_use && slot->suspended
952 		    && srv_slot_get_type(slot) == type) {
953 
954 			slot->suspended = FALSE;
955 
956 			srv_n_threads_active[type]++;
957 
958 			os_event_set(slot->event);
959 
960 			if (srv_print_thread_releases) {
961 				fprintf(stderr,
962 					"Releasing thread type %lu"
963 					" from slot %lu\n",
964 					(ulong) type, (ulong) i);
965 			}
966 
967 			count++;
968 
969 			if (count == n) {
970 				break;
971 			}
972 		}
973 	}
974 
975 	return(count);
976 }
977 
978 /*********************************************************************//**
979 Check whether thread type has reserved a slot. Return the first slot that
980 is found. This works because we currently have only 1 thread of each type.
981 @return	slot number or ULINT_UNDEFINED if not found*/
982 UNIV_INTERN
983 ulint
srv_thread_has_reserved_slot(enum srv_thread_type type)984 srv_thread_has_reserved_slot(
985 /*=========================*/
986 	enum srv_thread_type	type)	/*!< in: thread type to check */
987 {
988 	ulint			i;
989 	ulint			slot_no = ULINT_UNDEFINED;
990 
991 	ut_ad(srv_thread_type_validate(type));
992 	mutex_enter(&kernel_mutex);
993 
994 	for (i = 0; i < OS_THREAD_MAX_N; i++) {
995 		srv_slot_t*	slot;
996 
997 		slot = srv_table_get_nth_slot(i);
998 
999 		if (slot->in_use && slot->type == type) {
1000 			slot_no = i;
1001 			break;
1002 		}
1003 	}
1004 
1005 	mutex_exit(&kernel_mutex);
1006 
1007 	return(slot_no);
1008 }
1009 
/*********************************************************************//**
Initializes the server: the kernel mutex, the thread slot tables, the
global events, the per-type thread counters, and the concurrency
restriction (FIFO queue) data structures.  Freed by srv_free(). */
UNIV_INTERN
void
srv_init(void)
/*==========*/
{
	srv_conc_slot_t*	conc_slot;
	srv_slot_t*		slot;
	ulint			i;

	/* NOTE(review): mem_alloc()/mem_zalloc() return values are used
	unchecked here; presumably these allocators abort on failure --
	confirm before relying on that. */
	srv_sys = mem_alloc(sizeof(srv_sys_t));

	kernel_mutex_temp = mem_alloc(sizeof(mutex_t));
	mutex_create(kernel_mutex_key, &kernel_mutex, SYNC_KERNEL);

	mutex_create(srv_innodb_monitor_mutex_key,
		     &srv_innodb_monitor_mutex, SYNC_NO_ORDER_CHECK);

	/* Slot table for background threads; each slot carries an event
	used to suspend/resume the thread (see srv_suspend_thread()). */
	srv_sys->threads = mem_zalloc(OS_THREAD_MAX_N * sizeof(srv_slot_t));

	for (i = 0; i < OS_THREAD_MAX_N; i++) {
		slot = srv_sys->threads + i;
		slot->event = os_event_create(NULL);
		ut_a(slot->event);
	}

	/* Slot table for MySQL user threads waiting for row locks
	(see srv_suspend_mysql_thread()). */
	srv_mysql_table = mem_zalloc(OS_THREAD_MAX_N * sizeof(srv_slot_t));

	for (i = 0; i < OS_THREAD_MAX_N; i++) {
		slot = srv_mysql_table + i;
		slot->event = os_event_create(NULL);
		ut_a(slot->event);
	}

	srv_error_event = os_event_create(NULL);

	srv_timeout_event = os_event_create(NULL);

	srv_monitor_event = os_event_create(NULL);

	srv_lock_timeout_thread_event = os_event_create(NULL);

	/* Zero the per-type activity counters. */
	for (i = 0; i < SRV_MASTER + 1; i++) {
		srv_n_threads_active[i] = 0;
		srv_n_threads[i] = 0;
#if 0
		srv_meter[i] = 30;
		srv_meter_low_water[i] = 50;
		srv_meter_high_water[i] = 100;
		srv_meter_high_water2[i] = 200;
		srv_meter_foreground[i] = 250;
#endif
	}

	UT_LIST_INIT(srv_sys->tasks);

	/* Create dummy indexes for infimum and supremum records */

	dict_ind_init();

	/* Init the server concurrency restriction data structures */

	os_fast_mutex_init(&srv_conc_mutex);

	UT_LIST_INIT(srv_conc_queue);

	srv_conc_slots = mem_alloc(OS_THREAD_MAX_N * sizeof(srv_conc_slot_t));

	for (i = 0; i < OS_THREAD_MAX_N; i++) {
		conc_slot = srv_conc_slots + i;
		conc_slot->reserved = FALSE;
		conc_slot->event = os_event_create(NULL);
		ut_a(conc_slot->event);
	}

	/* Initialize some INFORMATION SCHEMA internal structures */
	trx_i_s_cache_init(trx_i_s_cache);
}
1089 
/*********************************************************************//**
Frees the data structures created in srv_init().
NOTE(review): the os_event_t objects created in srv_init() (one per
thread slot, one per conc slot, plus the four global events) are not
freed here; if srv_free() could ever be followed by a new srv_init()
in the same process this would leak -- confirm whether that matters. */
UNIV_INTERN
void
srv_free(void)
/*==========*/
{
	os_fast_mutex_free(&srv_conc_mutex);
	mem_free(srv_conc_slots);
	srv_conc_slots = NULL;

	mem_free(srv_sys->threads);
	mem_free(srv_sys);
	srv_sys = NULL;

	mem_free(kernel_mutex_temp);
	kernel_mutex_temp = NULL;
	mem_free(srv_mysql_table);
	srv_mysql_table = NULL;

	trx_i_s_cache_free(trx_i_s_cache);
}
1112 
/*********************************************************************//**
Initializes the synchronization primitives, memory system, and the thread
local storage.  Called from srv_boot() before srv_init(); the calls are
in dependency order (ut_mem and os_sync before sync_init and the
buffer-backed mem pool). */
UNIV_INTERN
void
srv_general_init(void)
/*==================*/
{
	ut_mem_init();
	/* Reset the system variables in the recovery module. */
	recv_sys_var_init();
	os_sync_init();
	sync_init();
	mem_init(srv_mem_pool_size);
}
1128 
1129 /*======================= InnoDB Server FIFO queue =======================*/
1130 
1131 /* Maximum allowable purge history length.  <=0 means 'infinite'. */
1132 UNIV_INTERN ulong	srv_max_purge_lag		= 0;
1133 
/*********************************************************************//**
Puts an OS thread to wait if there are too many concurrent threads
(>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. */
UNIV_INTERN
void
srv_conc_enter_innodb(
/*==================*/
	trx_t*	trx)	/*!< in: transaction object associated with the
			thread */
{
	ibool			has_slept = FALSE;
	srv_conc_slot_t*	slot	  = NULL;
	ulint			i;

#ifdef UNIV_SYNC_DEBUG
	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
#endif /* UNIV_SYNC_DEBUG */

	/* Replication slave threads bypass the FIFO queue: they only
	busy-wait (bounded by srv_replication_delay milliseconds) for the
	thread count to drop below the limit, then enter regardless. */
	if (trx->mysql_thd != NULL
	    && thd_is_replication_slave_thread(trx->mysql_thd)) {

		UT_WAIT_FOR(srv_conc_n_threads
			    < (lint)srv_thread_concurrency,
			    srv_replication_delay * 1000);

		return;
	}

	/* If trx has 'free tickets' to enter the engine left, then use one
	such ticket */

	if (trx->n_tickets_to_enter_innodb > 0) {
		trx->n_tickets_to_enter_innodb--;

		return;
	}

	os_fast_mutex_lock(&srv_conc_mutex);
retry:
	/* Sanity check: entering twice without leaving indicates a logic
	error in the caller; report it and continue rather than crash. */
	if (trx->declared_to_be_inside_innodb) {
		ut_print_timestamp(stderr);
		fputs("  InnoDB: Error: trying to declare trx"
		      " to enter InnoDB, but\n"
		      "InnoDB: it already is declared.\n", stderr);
		trx_print(stderr, trx, 0);
		putc('\n', stderr);
		os_fast_mutex_unlock(&srv_conc_mutex);

		return;
	}

	ut_ad(srv_conc_n_threads >= 0);

	if (srv_conc_n_threads < (lint)srv_thread_concurrency) {

		/* There is room: enter at once and grant a batch of free
		tickets so the next few entries skip this mutex. */
		srv_conc_n_threads++;
		trx->declared_to_be_inside_innodb = TRUE;
		trx->n_tickets_to_enter_innodb = SRV_FREE_TICKETS_TO_ENTER;

		os_fast_mutex_unlock(&srv_conc_mutex);

		return;
	}

	/* If the transaction is not holding resources, let it sleep
	for SRV_THREAD_SLEEP_DELAY microseconds, and try again then */

	if (!has_slept && !trx->has_search_latch
	    && NULL == UT_LIST_GET_FIRST(trx->trx_locks)) {

		has_slept = TRUE; /* We let it sleep only once to avoid
				  starvation */

		srv_conc_n_waiting_threads++;

		os_fast_mutex_unlock(&srv_conc_mutex);

		trx->op_info = "sleeping before joining InnoDB queue";

		/* Peter Zaitsev suggested that we take the sleep away
		altogether. But the sleep may be good in pathological
		situations of lots of thread switches. Simply put some
		threads aside for a while to reduce the number of thread
		switches. */
		if (SRV_THREAD_SLEEP_DELAY > 0) {
			os_thread_sleep(SRV_THREAD_SLEEP_DELAY);
		}

		trx->op_info = "";

		os_fast_mutex_lock(&srv_conc_mutex);

		srv_conc_n_waiting_threads--;

		goto retry;
	}

	/* Too many threads inside: put the current thread to a queue */

	for (i = 0; i < OS_THREAD_MAX_N; i++) {
		slot = srv_conc_slots + i;

		if (!slot->reserved) {

			break;
		}
	}

	if (i == OS_THREAD_MAX_N) {
		/* Could not find a free wait slot, we must let the
		thread enter.  No free tickets are granted: the next
		entry attempt goes through the full check again. */

		srv_conc_n_threads++;
		trx->declared_to_be_inside_innodb = TRUE;
		trx->n_tickets_to_enter_innodb = 0;

		os_fast_mutex_unlock(&srv_conc_mutex);

		return;
	}

	/* Release possible search system latch this thread has */
	if (trx->has_search_latch) {
		trx_search_latch_release_if_reserved(trx);
	}

	/* Add to the queue */
	slot->reserved = TRUE;
	slot->wait_ended = FALSE;

	UT_LIST_ADD_LAST(srv_conc_queue, srv_conc_queue, slot);

	/* Reset while still holding srv_conc_mutex so a releasing thread
	cannot set the event before this reset and lose the wakeup. */
	os_event_reset(slot->event);

	srv_conc_n_waiting_threads++;

	os_fast_mutex_unlock(&srv_conc_mutex);

	/* Go to wait for the event; when a thread leaves InnoDB it will
	release this thread */

	ut_ad(!trx->has_search_latch);
#ifdef UNIV_SYNC_DEBUG
	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
#endif /* UNIV_SYNC_DEBUG */
	trx->op_info = "waiting in InnoDB queue";

	thd_wait_begin(trx->mysql_thd, THD_WAIT_USER_LOCK);
	os_event_wait(slot->event);
	thd_wait_end(trx->mysql_thd);

	trx->op_info = "";

	os_fast_mutex_lock(&srv_conc_mutex);

	srv_conc_n_waiting_threads--;

	/* NOTE that the thread which released this thread already
	incremented the thread counter on behalf of this thread */

	slot->reserved = FALSE;

	UT_LIST_REMOVE(srv_conc_queue, srv_conc_queue, slot);

	trx->declared_to_be_inside_innodb = TRUE;
	trx->n_tickets_to_enter_innodb = SRV_FREE_TICKETS_TO_ENTER;

	os_fast_mutex_unlock(&srv_conc_mutex);
}
1303 
/*********************************************************************//**
This lets a thread enter InnoDB regardless of the number of threads inside
InnoDB. This must be called when a thread ends a lock wait. */
UNIV_INTERN
void
srv_conc_force_enter_innodb(
/*========================*/
	trx_t*	trx)	/*!< in: transaction object associated with the
			thread */
{
#ifdef UNIV_SYNC_DEBUG
	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
#endif /* UNIV_SYNC_DEBUG */

	/* When the concurrency limit is disabled there is nothing to
	count: entering is always free. */
	if (UNIV_LIKELY(!srv_thread_concurrency)) {

		return;
	}

	ut_ad(srv_conc_n_threads >= 0);

	os_fast_mutex_lock(&srv_conc_mutex);

	srv_conc_n_threads++;
	trx->declared_to_be_inside_innodb = TRUE;
	/* Grant exactly one ticket so the next srv_conc_exit_innodb()
	consumes it and the thread truly exits via the force path. */
	trx->n_tickets_to_enter_innodb = 1;

	os_fast_mutex_unlock(&srv_conc_mutex);
}
1333 
/*********************************************************************//**
This must be called when a thread exits InnoDB in a lock wait or at the
end of an SQL statement.  Decrements the thread count and, if room
appears, releases one thread waiting in the FIFO queue. */
UNIV_INTERN
void
srv_conc_force_exit_innodb(
/*=======================*/
	trx_t*	trx)	/*!< in: transaction object associated with the
			thread */
{
	srv_conc_slot_t*	slot	= NULL;

	/* Replication slave threads never join the queue or counters
	(see srv_conc_enter_innodb()), so there is nothing to undo. */
	if (trx->mysql_thd != NULL
	    && thd_is_replication_slave_thread(trx->mysql_thd)) {

		return;
	}

	if (trx->declared_to_be_inside_innodb == FALSE) {

		return;
	}

	os_fast_mutex_lock(&srv_conc_mutex);

	ut_ad(srv_conc_n_threads > 0);
	srv_conc_n_threads--;
	trx->declared_to_be_inside_innodb = FALSE;
	trx->n_tickets_to_enter_innodb = 0;

	if (srv_conc_n_threads < (lint)srv_thread_concurrency) {
		/* Look for a slot where a thread is waiting and no other
		thread has yet released the thread */

		slot = UT_LIST_GET_FIRST(srv_conc_queue);

		while (slot && slot->wait_ended == TRUE) {
			slot = UT_LIST_GET_NEXT(srv_conc_queue, slot);
		}

		if (slot != NULL) {
			slot->wait_ended = TRUE;

			/* We increment the count on behalf of the released
			thread */

			srv_conc_n_threads++;
		}
	}

	os_fast_mutex_unlock(&srv_conc_mutex);

	/* The event is set outside the mutex; the released thread sees
	wait_ended == TRUE and will not be picked twice. */
	if (slot != NULL) {
		os_event_set(slot->event);
	}

#ifdef UNIV_SYNC_DEBUG
	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
#endif /* UNIV_SYNC_DEBUG */
}
1394 
1395 /*********************************************************************//**
1396 This must be called when a thread exits InnoDB. */
1397 UNIV_INTERN
1398 void
srv_conc_exit_innodb(trx_t * trx)1399 srv_conc_exit_innodb(
1400 /*=================*/
1401 	trx_t*	trx)	/*!< in: transaction object associated with the
1402 			thread */
1403 {
1404 #ifdef UNIV_SYNC_DEBUG
1405 	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
1406 #endif /* UNIV_SYNC_DEBUG */
1407 
1408 	if (trx->n_tickets_to_enter_innodb > 0) {
1409 		/* We will pretend the thread is still inside InnoDB though it
1410 		now leaves the InnoDB engine. In this way we save
1411 		a lot of semaphore operations. srv_conc_force_exit_innodb is
1412 		used to declare the thread definitely outside InnoDB. It
1413 		should be called when there is a lock wait or an SQL statement
1414 		ends. */
1415 
1416 		return;
1417 	}
1418 
1419 	srv_conc_force_exit_innodb(trx);
1420 }
1421 
1422 /*========================================================================*/
1423 
1424 /*********************************************************************//**
1425 Normalizes init parameter values to use units we use inside InnoDB.
1426 @return	DB_SUCCESS or error code */
1427 static
1428 ulint
srv_normalize_init_values(void)1429 srv_normalize_init_values(void)
1430 /*===========================*/
1431 {
1432 	ulint	n;
1433 	ulint	i;
1434 
1435 	n = srv_n_data_files;
1436 
1437 	for (i = 0; i < n; i++) {
1438 		srv_data_file_sizes[i] = srv_data_file_sizes[i]
1439 			* ((1024 * 1024) / UNIV_PAGE_SIZE);
1440 	}
1441 
1442 	srv_last_file_size_max = srv_last_file_size_max
1443 		* ((1024 * 1024) / UNIV_PAGE_SIZE);
1444 
1445 	srv_log_file_size = srv_log_file_size / UNIV_PAGE_SIZE;
1446 
1447 	srv_log_buffer_size = srv_log_buffer_size / UNIV_PAGE_SIZE;
1448 
1449 	srv_lock_table_size = 5 * (srv_buf_pool_size / UNIV_PAGE_SIZE);
1450 
1451 	return(DB_SUCCESS);
1452 }
1453 
1454 /*********************************************************************//**
1455 Boots the InnoDB server.
1456 @return	DB_SUCCESS or error code */
1457 UNIV_INTERN
1458 ulint
srv_boot(void)1459 srv_boot(void)
1460 /*==========*/
1461 {
1462 	ulint	err;
1463 
1464 	/* Transform the init parameter values given by MySQL to
1465 	use units we use inside InnoDB: */
1466 
1467 	err = srv_normalize_init_values();
1468 
1469 	if (err != DB_SUCCESS) {
1470 		return(err);
1471 	}
1472 
1473 	/* Initialize synchronization primitives, memory management, and thread
1474 	local storage */
1475 
1476 	srv_general_init();
1477 
1478 	/* Initialize this module */
1479 
1480 	srv_init();
1481 
1482 	return(DB_SUCCESS);
1483 }
1484 
/*********************************************************************//**
Reserves a slot in the thread table for the current MySQL OS thread.
NOTE! The kernel mutex has to be reserved by the caller!  If all
OS_THREAD_MAX_N slots are taken, dumps the table to stderr and
deliberately crashes via ut_error to produce a stack trace.
@return	reserved slot */
static
srv_slot_t*
srv_table_reserve_slot_for_mysql(void)
/*==================================*/
{
	srv_slot_t*	slot;
	ulint		i;

	ut_ad(mutex_own(&kernel_mutex));

	/* Linear scan for the first free slot. */
	i = 0;
	slot = srv_mysql_table + i;

	while (slot->in_use) {
		i++;

		if (UNIV_UNLIKELY(i >= OS_THREAD_MAX_N)) {

			/* Table exhausted: print diagnostics for every
			slot, then abort intentionally. */
			ut_print_timestamp(stderr);

			fprintf(stderr,
				"  InnoDB: There appear to be %lu MySQL"
				" threads currently waiting\n"
				"InnoDB: inside InnoDB, which is the"
				" upper limit. Cannot continue operation.\n"
				"InnoDB: We intentionally generate"
				" a seg fault to print a stack trace\n"
				"InnoDB: on Linux. But first we print"
				" a list of waiting threads.\n", (ulong) i);

			for (i = 0; i < OS_THREAD_MAX_N; i++) {

				slot = srv_mysql_table + i;

				fprintf(stderr,
					"Slot %lu: thread type %lu,"
					" in use %lu, susp %lu, time %lu\n",
					(ulong) i,
					(ulong) slot->type,
					(ulong) slot->in_use,
					(ulong) slot->suspended,
					(ulong) difftime(ut_time(),
							 slot->suspend_time));
			}

			ut_error;
		}

		slot = srv_mysql_table + i;
	}

	ut_a(slot->in_use == FALSE);

	slot->in_use = TRUE;

	return(slot);
}
1546 
/***************************************************************//**
Puts a MySQL OS thread to wait for a lock to be released. If an error
occurs during the wait trx->error_state associated with thr is
!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK
are possible errors. DB_DEADLOCK is returned if selective deadlock
resolution chose this transaction as a victim. */
UNIV_INTERN
void
srv_suspend_mysql_thread(
/*=====================*/
	que_thr_t*	thr)	/*!< in: query thread associated with the MySQL
				OS thread */
{
	srv_slot_t*	slot;
	os_event_t	event;
	double		wait_time;
	trx_t*		trx;
	ulint		had_dict_lock;
	ibool		was_declared_inside_innodb	= FALSE;
	ib_int64_t	start_time			= 0;
	ib_int64_t	finish_time;
	ulint		diff_time;
	ulint		sec;
	ulint		ms;
	ulong		lock_wait_timeout;

	ut_ad(!mutex_own(&kernel_mutex));

	trx = thr_get_trx(thr);

	if (trx->mysql_thd != 0) {
		DEBUG_SYNC_C("srv_suspend_mysql_thread_enter");
	}

	/* Wake the lock timeout monitor so it starts tracking this wait. */
	os_event_set(srv_lock_timeout_thread_event);

	mutex_enter(&kernel_mutex);

	trx->error_state = DB_SUCCESS;

	if (thr->state == QUE_THR_RUNNING) {

		ut_ad(thr->is_active == TRUE);

		/* The lock has already been released or this transaction
		was chosen as a deadlock victim: no need to suspend */

		if (trx->was_chosen_as_deadlock_victim) {

			trx->error_state = DB_DEADLOCK;
			trx->was_chosen_as_deadlock_victim = FALSE;
		}

		mutex_exit(&kernel_mutex);

		return;
	}

	ut_ad(thr->is_active == FALSE);

	/* Reserve a slot and reset its event while still holding the
	kernel mutex, so srv_release_mysql_thread_if_suspended() cannot
	set the event before the reset and lose the wakeup. */
	slot = srv_table_reserve_slot_for_mysql();

	event = slot->event;

	slot->thr = thr;

	os_event_reset(event);

	slot->suspend_time = ut_time();

	if (thr->lock_state == QUE_THR_LOCK_ROW) {
		srv_n_lock_wait_count++;
		srv_n_lock_wait_current_count++;

		/* start_time == -1 marks a failed clock read; see the
		matching check after the wait (Bug#36819). */
		if (ut_usectime(&sec, &ms) == -1) {
			start_time = -1;
		} else {
			start_time = (ib_int64_t) sec * 1000000 + ms;
		}
	}
	/* Wake the lock timeout monitor thread, if it is suspended */

	os_event_set(srv_lock_timeout_thread_event);

	mutex_exit(&kernel_mutex);

	had_dict_lock = trx->dict_operation_lock_mode;

	switch (had_dict_lock) {
	case RW_S_LATCH:
		/* Release foreign key check latch */
		row_mysql_unfreeze_data_dictionary(trx);
		break;
	case RW_X_LATCH:
		/* There should never be a lock wait when the
		dictionary latch is reserved in X mode.  Dictionary
		transactions should only acquire locks on dictionary
		tables, not other tables. All access to dictionary
		tables should be covered by dictionary
		transactions. */
		ut_print_timestamp(stderr);
		fputs("  InnoDB: Error: dict X latch held in "
		      "srv_suspend_mysql_thread\n", stderr);
		/* This should never occur. This incorrect handling
		was added in the early development of
		ha_innobase::add_index() in InnoDB Plugin 1.0. */
		/* Release fast index creation latch */
		row_mysql_unlock_data_dictionary(trx);
		break;
	}

	ut_a(trx->dict_operation_lock_mode == 0);

	if (trx->declared_to_be_inside_innodb) {

		was_declared_inside_innodb = TRUE;

		/* We must declare this OS thread to exit InnoDB, since a
		possible other thread holding a lock which this thread waits
		for must be allowed to enter, sooner or later */

		srv_conc_force_exit_innodb(trx);
	}

	/* Suspend this thread and wait for the event. */

	thd_wait_begin(trx->mysql_thd, THD_WAIT_ROW_LOCK);
	os_event_wait(event);
	thd_wait_end(trx->mysql_thd);

#ifdef UNIV_SYNC_DEBUG
	ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch));
#endif /* UNIV_SYNC_DEBUG */

	if (was_declared_inside_innodb) {

		/* Return back inside InnoDB */

		srv_conc_force_enter_innodb(trx);
	}

	/* After resuming, reacquire the data dictionary latch if
	necessary. */

	switch (had_dict_lock) {
	case RW_S_LATCH:
		row_mysql_freeze_data_dictionary(trx);
		break;
	case RW_X_LATCH:
		/* This should never occur. This incorrect handling
		was added in the early development of
		ha_innobase::add_index() in InnoDB Plugin 1.0. */
		row_mysql_lock_data_dictionary(trx);
		break;
	}

	mutex_enter(&kernel_mutex);

	/* Release the slot for others to use */

	slot->in_use = FALSE;

	wait_time = ut_difftime(ut_time(), slot->suspend_time);

	if (thr->lock_state == QUE_THR_LOCK_ROW) {
		if (ut_usectime(&sec, &ms) == -1) {
			finish_time = -1;
		} else {
			finish_time = (ib_int64_t) sec * 1000000 + ms;
		}

		diff_time = (finish_time > start_time) ?
			    (ulint) (finish_time - start_time) : 0;

		srv_n_lock_wait_current_count--;
		srv_n_lock_wait_time = srv_n_lock_wait_time + diff_time;
		if (diff_time > srv_n_lock_max_wait_time &&
		    /* only update the variable if we successfully
		    retrieved the start and finish times. See Bug#36819. */
		    start_time != -1 && finish_time != -1) {
			srv_n_lock_max_wait_time = diff_time;
		}

		/* Record the lock wait time for this thread */
		thd_set_lock_wait_time(trx->mysql_thd, diff_time);
	}

	if (trx->was_chosen_as_deadlock_victim) {

		trx->error_state = DB_DEADLOCK;
		trx->was_chosen_as_deadlock_victim = FALSE;
	}

	mutex_exit(&kernel_mutex);

	/* InnoDB system transactions (such as the purge, and
	incomplete transactions that are being rolled back after crash
	recovery) will use the global value of
	innodb_lock_wait_timeout, because trx->mysql_thd == NULL. */
	lock_wait_timeout = thd_lock_wait_timeout(trx->mysql_thd);

	/* Timeout values >= 100000000 are treated as "no timeout";
	presumably this is the sentinel for an infinite wait -- see the
	declaration of innodb_lock_wait_timeout. */
	if (lock_wait_timeout < 100000000
	    && wait_time > (double) lock_wait_timeout) {

		trx->error_state = DB_LOCK_WAIT_TIMEOUT;
	}

	if (trx_is_interrupted(trx)) {

		trx->error_state = DB_INTERRUPTED;
	}
}
1759 
1760 /********************************************************************//**
1761 Releases a MySQL OS thread waiting for a lock to be released, if the
1762 thread is already suspended. */
1763 UNIV_INTERN
1764 void
srv_release_mysql_thread_if_suspended(que_thr_t * thr)1765 srv_release_mysql_thread_if_suspended(
1766 /*==================================*/
1767 	que_thr_t*	thr)	/*!< in: query thread associated with the
1768 				MySQL OS thread	 */
1769 {
1770 	srv_slot_t*	slot;
1771 	ulint		i;
1772 
1773 	ut_ad(mutex_own(&kernel_mutex));
1774 
1775 	for (i = 0; i < OS_THREAD_MAX_N; i++) {
1776 
1777 		slot = srv_mysql_table + i;
1778 
1779 		if (slot->in_use && slot->thr == thr) {
1780 			/* Found */
1781 
1782 			os_event_set(slot->event);
1783 
1784 			return;
1785 		}
1786 	}
1787 
1788 	/* not found */
1789 }
1790 
1791 /******************************************************************//**
1792 Refreshes the values used to calculate per-second averages. */
1793 static
1794 void
srv_refresh_innodb_monitor_stats(void)1795 srv_refresh_innodb_monitor_stats(void)
1796 /*==================================*/
1797 {
1798 	mutex_enter(&srv_innodb_monitor_mutex);
1799 
1800 	srv_last_monitor_time = time(NULL);
1801 
1802 	os_aio_refresh_stats();
1803 
1804 	btr_cur_n_sea_old = btr_cur_n_sea;
1805 	btr_cur_n_non_sea_old = btr_cur_n_non_sea;
1806 
1807 	log_refresh_stats();
1808 
1809 	buf_refresh_io_stats_all();
1810 
1811 	srv_n_rows_inserted_old = srv_n_rows_inserted;
1812 	srv_n_rows_updated_old = srv_n_rows_updated;
1813 	srv_n_rows_deleted_old = srv_n_rows_deleted;
1814 	srv_n_rows_read_old = srv_n_rows_read;
1815 
1816 	mutex_exit(&srv_innodb_monitor_mutex);
1817 }
1818 
1819 /******************************************************************//**
1820 Outputs to a file the output of the InnoDB Monitor.
1821 @return FALSE if not all information printed
1822 due to failure to obtain necessary mutex */
1823 UNIV_INTERN
1824 ibool
srv_printf_innodb_monitor(FILE * file,ibool nowait,ulint * trx_start,ulint * trx_end)1825 srv_printf_innodb_monitor(
1826 /*======================*/
1827 	FILE*	file,		/*!< in: output stream */
1828 	ibool	nowait,		/*!< in: whether to wait for kernel mutex */
1829 	ulint*	trx_start,	/*!< out: file position of the start of
1830 				the list of active transactions */
1831 	ulint*	trx_end)	/*!< out: file position of the end of
1832 				the list of active transactions */
1833 {
1834 	double	time_elapsed;
1835 	time_t	current_time;
1836 	ulint	n_reserved;
1837 	ibool	ret;
1838 
1839 	mutex_enter(&srv_innodb_monitor_mutex);
1840 
1841 	current_time = time(NULL);
1842 
1843 	/* We add 0.001 seconds to time_elapsed to prevent division
1844 	by zero if two users happen to call SHOW INNODB STATUS at the same
1845 	time */
1846 
1847 	time_elapsed = difftime(current_time, srv_last_monitor_time)
1848 		+ 0.001;
1849 
1850 	srv_last_monitor_time = time(NULL);
1851 
1852 	fputs("\n=====================================\n", file);
1853 
1854 	ut_print_timestamp(file);
1855 	fprintf(file,
1856 		" INNODB MONITOR OUTPUT\n"
1857 		"=====================================\n"
1858 		"Per second averages calculated from the last %lu seconds\n",
1859 		(ulong)time_elapsed);
1860 
1861 	fputs("-----------------\n"
1862 	      "BACKGROUND THREAD\n"
1863 	      "-----------------\n", file);
1864 	srv_print_master_thread_info(file);
1865 
1866 	fputs("----------\n"
1867 	      "SEMAPHORES\n"
1868 	      "----------\n", file);
1869 	sync_print(file);
1870 
1871 	/* Conceptually, srv_innodb_monitor_mutex has a very high latching
1872 	order level in sync0sync.h, while dict_foreign_err_mutex has a very
1873 	low level 135. Therefore we can reserve the latter mutex here without
1874 	a danger of a deadlock of threads. */
1875 
1876 	mutex_enter(&dict_foreign_err_mutex);
1877 
1878 	if (ftell(dict_foreign_err_file) != 0L) {
1879 		fputs("------------------------\n"
1880 		      "LATEST FOREIGN KEY ERROR\n"
1881 		      "------------------------\n", file);
1882 		ut_copy_file(file, dict_foreign_err_file);
1883 	}
1884 
1885 	mutex_exit(&dict_foreign_err_mutex);
1886 
1887 	/* Only if lock_print_info_summary proceeds correctly,
1888 	before we call the lock_print_info_all_transactions
1889 	to print all the lock information. */
1890 	ret = lock_print_info_summary(file, nowait);
1891 
1892 	if (ret) {
1893 		if (trx_start) {
1894 			long	t = ftell(file);
1895 			if (t < 0) {
1896 				*trx_start = ULINT_UNDEFINED;
1897 			} else {
1898 				*trx_start = (ulint) t;
1899 			}
1900 		}
1901 		lock_print_info_all_transactions(file);
1902 		if (trx_end) {
1903 			long	t = ftell(file);
1904 			if (t < 0) {
1905 				*trx_end = ULINT_UNDEFINED;
1906 			} else {
1907 				*trx_end = (ulint) t;
1908 			}
1909 		}
1910 	}
1911 
1912 	fputs("--------\n"
1913 	      "FILE I/O\n"
1914 	      "--------\n", file);
1915 	os_aio_print(file);
1916 
1917 	fputs("-------------------------------------\n"
1918 	      "INSERT BUFFER AND ADAPTIVE HASH INDEX\n"
1919 	      "-------------------------------------\n", file);
1920 	ibuf_print(file);
1921 
1922 	ha_print_info(file, btr_search_sys->hash_index);
1923 
1924 	fprintf(file,
1925 		"%.2f hash searches/s, %.2f non-hash searches/s\n",
1926 		(btr_cur_n_sea - btr_cur_n_sea_old)
1927 		/ time_elapsed,
1928 		(btr_cur_n_non_sea - btr_cur_n_non_sea_old)
1929 		/ time_elapsed);
1930 	btr_cur_n_sea_old = btr_cur_n_sea;
1931 	btr_cur_n_non_sea_old = btr_cur_n_non_sea;
1932 
1933 	fputs("---\n"
1934 	      "LOG\n"
1935 	      "---\n", file);
1936 	log_print(file);
1937 
1938 	fputs("----------------------\n"
1939 	      "BUFFER POOL AND MEMORY\n"
1940 	      "----------------------\n", file);
1941 	fprintf(file,
1942 		"Total memory allocated " ULINTPF
1943 		"; in additional pool allocated " ULINTPF "\n",
1944 		ut_total_allocated_memory,
1945 		mem_pool_get_reserved(mem_comm_pool));
1946 	fprintf(file, "Dictionary memory allocated " ULINTPF "\n",
1947 		dict_sys->size);
1948 
1949 	buf_print_io(file);
1950 
1951 	fputs("--------------\n"
1952 	      "ROW OPERATIONS\n"
1953 	      "--------------\n", file);
1954 	fprintf(file, "%ld queries inside InnoDB, %lu queries in queue\n",
1955 		(long) srv_conc_n_threads,
1956 		(ulong) srv_conc_n_waiting_threads);
1957 
1958 	fprintf(file, "%lu read views open inside InnoDB\n",
1959 		UT_LIST_GET_LEN(trx_sys->view_list));
1960 
1961 	n_reserved = fil_space_get_n_reserved_extents(0);
1962 	if (n_reserved > 0) {
1963 		fprintf(file,
1964 			"%lu tablespace extents now reserved for"
1965 			" B-tree split operations\n",
1966 			(ulong) n_reserved);
1967 	}
1968 
1969 #ifdef UNIV_LINUX
1970 	fprintf(file, "Main thread process no. %lu, id %lu, state: %s\n",
1971 		(ulong) srv_main_thread_process_no,
1972 		(ulong) srv_main_thread_id,
1973 		srv_main_thread_op_info);
1974 #else
1975 	fprintf(file, "Main thread id %lu, state: %s\n",
1976 		(ulong) srv_main_thread_id,
1977 		srv_main_thread_op_info);
1978 #endif
1979 	fprintf(file,
1980 		"Number of rows inserted " ULINTPF
1981 		", updated " ULINTPF ", deleted " ULINTPF
1982 		", read " ULINTPF "\n",
1983 		srv_n_rows_inserted,
1984 		srv_n_rows_updated,
1985 		srv_n_rows_deleted,
1986 		srv_n_rows_read);
1987 	fprintf(file,
1988 		"%.2f inserts/s, %.2f updates/s,"
1989 		" %.2f deletes/s, %.2f reads/s\n",
1990 		(srv_n_rows_inserted - srv_n_rows_inserted_old)
1991 		/ time_elapsed,
1992 		(srv_n_rows_updated - srv_n_rows_updated_old)
1993 		/ time_elapsed,
1994 		(srv_n_rows_deleted - srv_n_rows_deleted_old)
1995 		/ time_elapsed,
1996 		(srv_n_rows_read - srv_n_rows_read_old)
1997 		/ time_elapsed);
1998 
1999 	srv_n_rows_inserted_old = srv_n_rows_inserted;
2000 	srv_n_rows_updated_old = srv_n_rows_updated;
2001 	srv_n_rows_deleted_old = srv_n_rows_deleted;
2002 	srv_n_rows_read_old = srv_n_rows_read;
2003 
2004 	fputs("----------------------------\n"
2005 	      "END OF INNODB MONITOR OUTPUT\n"
2006 	      "============================\n", file);
2007 	mutex_exit(&srv_innodb_monitor_mutex);
2008 	fflush(file);
2009 
2010 	return(ret);
2011 }
2012 
/******************************************************************//**
Function to pass InnoDB status variables to MySQL. Copies a snapshot
of the server counters into the global export_vars struct, from which
the MySQL layer reads the SHOW STATUS values. */
UNIV_INTERN
void
srv_export_innodb_status(void)
/*==========================*/
{
	buf_pool_stat_t		stat;
	buf_pools_list_size_t	buf_pools_list_size;
	ulint			LRU_len;
	ulint			free_len;
	ulint			flush_list_len;

	/* Aggregate the buffer pool statistics before taking the
	monitor mutex; these helpers do their own synchronization. */
	buf_get_total_stat(&stat);
	buf_get_total_list_len(&LRU_len, &free_len, &flush_list_len);
	buf_get_total_list_size_in_bytes(&buf_pools_list_size);

	mutex_enter(&srv_innodb_monitor_mutex);

	/* File I/O counters. */
	export_vars.innodb_data_pending_reads
		= os_n_pending_reads;
	export_vars.innodb_data_pending_writes
		= os_n_pending_writes;
	export_vars.innodb_data_pending_fsyncs
		= fil_n_pending_log_flushes
		+ fil_n_pending_tablespace_flushes;
	export_vars.innodb_data_fsyncs = os_n_fsyncs;
	export_vars.innodb_data_read = srv_data_read;
	export_vars.innodb_data_reads = os_n_file_reads;
	export_vars.innodb_data_writes = os_n_file_writes;
	export_vars.innodb_data_written = srv_data_written;
	/* Buffer pool counters. */
	export_vars.innodb_buffer_pool_read_requests = stat.n_page_gets;
	export_vars.innodb_buffer_pool_write_requests
		= srv_buf_pool_write_requests;
	export_vars.innodb_buffer_pool_wait_free = srv_buf_pool_wait_free;
	export_vars.innodb_buffer_pool_pages_flushed = srv_buf_pool_flushed;
	export_vars.innodb_buffer_pool_reads = srv_buf_pool_reads;
	export_vars.innodb_buffer_pool_read_ahead_rnd
		= stat.n_ra_pages_read_rnd;
	export_vars.innodb_buffer_pool_read_ahead
		= stat.n_ra_pages_read;
	export_vars.innodb_buffer_pool_read_ahead_evicted
		= stat.n_ra_pages_evicted;
	export_vars.innodb_buffer_pool_pages_data = LRU_len;
	export_vars.innodb_buffer_pool_bytes_data =
		buf_pools_list_size.LRU_bytes
		+ buf_pools_list_size.unzip_LRU_bytes;
	export_vars.innodb_buffer_pool_pages_dirty = flush_list_len;
	export_vars.innodb_buffer_pool_bytes_dirty =
		buf_pools_list_size.flush_list_bytes;
	export_vars.innodb_buffer_pool_pages_free = free_len;
#ifdef UNIV_DEBUG
	export_vars.innodb_buffer_pool_pages_latched
		= buf_get_latched_pages_number();
#endif /* UNIV_DEBUG */
	export_vars.innodb_buffer_pool_pages_total = buf_pool_get_n_pages();

	/* "misc" pages = pages that are neither on the LRU list nor
	free (e.g. allocated for internal use). */
	export_vars.innodb_buffer_pool_pages_misc
	       	= buf_pool_get_n_pages() - LRU_len - free_len;
#ifdef HAVE_ATOMIC_BUILTINS
	export_vars.innodb_have_atomic_builtins = 1;
#else
	export_vars.innodb_have_atomic_builtins = 0;
#endif
	export_vars.innodb_page_size = UNIV_PAGE_SIZE;
	/* Redo log and doublewrite buffer counters. */
	export_vars.innodb_log_waits = srv_log_waits;
	export_vars.innodb_os_log_written = srv_os_log_written;
	export_vars.innodb_os_log_fsyncs = fil_n_log_flushes;
	export_vars.innodb_os_log_pending_fsyncs = fil_n_pending_log_flushes;
	export_vars.innodb_os_log_pending_writes = srv_os_log_pending_writes;
	export_vars.innodb_log_write_requests = srv_log_write_requests;
	export_vars.innodb_log_writes = srv_log_writes;
	export_vars.innodb_dblwr_pages_written = srv_dblwr_pages_written;
	export_vars.innodb_dblwr_writes = srv_dblwr_writes;
	export_vars.innodb_pages_created = stat.n_pages_created;
	export_vars.innodb_pages_read = stat.n_pages_read;
	export_vars.innodb_pages_written = stat.n_pages_written;
	/* Row lock wait counters. srv_n_lock_wait_time is accumulated
	in microseconds (see srv_suspend_mysql_thread), so the /1000
	conversions below yield milliseconds. */
	export_vars.innodb_row_lock_waits = srv_n_lock_wait_count;
	export_vars.innodb_row_lock_current_waits
		= srv_n_lock_wait_current_count;
	export_vars.innodb_row_lock_time = srv_n_lock_wait_time / 1000;
	if (srv_n_lock_wait_count > 0) {
		export_vars.innodb_row_lock_time_avg = (ulint)
			(srv_n_lock_wait_time / 1000 / srv_n_lock_wait_count);
	} else {
		/* Avoid division by zero when no waits have occurred. */
		export_vars.innodb_row_lock_time_avg = 0;
	}
	export_vars.innodb_row_lock_time_max
		= srv_n_lock_max_wait_time / 1000;
	export_vars.innodb_rows_read = srv_n_rows_read;
	export_vars.innodb_rows_inserted = srv_n_rows_inserted;
	export_vars.innodb_rows_updated = srv_n_rows_updated;
	export_vars.innodb_rows_deleted = srv_n_rows_deleted;
	export_vars.innodb_truncated_status_writes = srv_truncated_status_writes;

#ifdef UNIV_DEBUG
	/* Debug-only: export how far purge lags behind the newest
	transaction id, both for completed purge and the purge view. */
	{
		trx_id_t	done_trx_no;
		trx_id_t	up_limit_id;

		rw_lock_s_lock(&purge_sys->latch);
		done_trx_no	= purge_sys->done_trx_no;
		up_limit_id	= purge_sys->view
			? purge_sys->view->up_limit_id
			: 0;
		rw_lock_s_unlock(&purge_sys->latch);

		if (trx_sys->max_trx_id < done_trx_no) {
			export_vars.innodb_purge_trx_id_age = 0;
		} else {
			export_vars.innodb_purge_trx_id_age =
				trx_sys->max_trx_id - done_trx_no;
		}

		if (!up_limit_id
		    || trx_sys->max_trx_id < up_limit_id) {
			export_vars.innodb_purge_view_trx_id_age = 0;
		} else {
			export_vars.innodb_purge_view_trx_id_age =
				trx_sys->max_trx_id - up_limit_id;
		}
	}
#endif /* UNIV_DEBUG */

	mutex_exit(&srv_innodb_monitor_mutex);
}
2139 
2140 /*********************************************************************//**
2141 A thread which prints the info output by various InnoDB monitors.
2142 @return	a dummy parameter */
2143 UNIV_INTERN
2144 os_thread_ret_t
srv_monitor_thread(void * arg)2145 srv_monitor_thread(
2146 /*===============*/
2147 	void*	arg __attribute__((unused)))
2148 			/*!< in: a dummy parameter required by
2149 			os_thread_create */
2150 {
2151 	ib_int64_t	sig_count;
2152 	double		time_elapsed;
2153 	time_t		current_time;
2154 	time_t		last_table_monitor_time;
2155 	time_t		last_tablespace_monitor_time;
2156 	time_t		last_monitor_time;
2157 	ulint		mutex_skipped;
2158 	ibool		last_srv_print_monitor;
2159 
2160 #ifdef UNIV_DEBUG_THREAD_CREATION
2161 	fprintf(stderr, "Lock timeout thread starts, id %lu\n",
2162 		os_thread_pf(os_thread_get_curr_id()));
2163 #endif
2164 
2165 #ifdef UNIV_PFS_THREAD
2166 	pfs_register_thread(srv_monitor_thread_key);
2167 #endif
2168 
2169 	UT_NOT_USED(arg);
2170 	srv_last_monitor_time = ut_time();
2171 	last_table_monitor_time = ut_time();
2172 	last_tablespace_monitor_time = ut_time();
2173 	last_monitor_time = ut_time();
2174 	mutex_skipped = 0;
2175 	last_srv_print_monitor = srv_print_innodb_monitor;
2176 loop:
2177 	srv_monitor_active = TRUE;
2178 
2179 	/* Wake up every 5 seconds to see if we need to print
2180 	monitor information or if signalled at shutdown. */
2181 
2182 	sig_count = os_event_reset(srv_monitor_event);
2183 
2184 	os_event_wait_time_low(srv_monitor_event, 5000000, sig_count);
2185 
2186 	current_time = ut_time();
2187 
2188 	time_elapsed = difftime(current_time, last_monitor_time);
2189 
2190 	if (time_elapsed > 15) {
2191 		last_monitor_time = ut_time();
2192 
2193 		if (srv_print_innodb_monitor) {
2194 			/* Reset mutex_skipped counter everytime
2195 			srv_print_innodb_monitor changes. This is to
2196 			ensure we will not be blocked by kernel_mutex
2197 			for short duration information printing,
2198 			such as requested by sync_array_print_long_waits() */
2199 			if (!last_srv_print_monitor) {
2200 				mutex_skipped = 0;
2201 				last_srv_print_monitor = TRUE;
2202 			}
2203 
2204 			if (!srv_printf_innodb_monitor(stderr,
2205 						MUTEX_NOWAIT(mutex_skipped),
2206 						NULL, NULL)) {
2207 				mutex_skipped++;
2208 			} else {
2209 				/* Reset the counter */
2210 				mutex_skipped = 0;
2211 			}
2212 		} else {
2213 			last_srv_print_monitor = FALSE;
2214 		}
2215 
2216 
2217 		if (srv_innodb_status) {
2218 			mutex_enter(&srv_monitor_file_mutex);
2219 			rewind(srv_monitor_file);
2220 			if (!srv_printf_innodb_monitor(srv_monitor_file,
2221 						MUTEX_NOWAIT(mutex_skipped),
2222 						NULL, NULL)) {
2223 				mutex_skipped++;
2224 			} else {
2225 				mutex_skipped = 0;
2226 			}
2227 
2228 			os_file_set_eof(srv_monitor_file);
2229 			mutex_exit(&srv_monitor_file_mutex);
2230 		}
2231 
2232 		if (srv_print_innodb_tablespace_monitor
2233 		    && difftime(current_time,
2234 				last_tablespace_monitor_time) > 60) {
2235 			last_tablespace_monitor_time = ut_time();
2236 
2237 			fputs("========================"
2238 			      "========================\n",
2239 			      stderr);
2240 
2241 			ut_print_timestamp(stderr);
2242 
2243 			fputs(" INNODB TABLESPACE MONITOR OUTPUT\n"
2244 			      "========================"
2245 			      "========================\n",
2246 			      stderr);
2247 
2248 			fsp_print(0);
2249 			fputs("Validating tablespace\n", stderr);
2250 			fsp_validate(0);
2251 			fputs("Validation ok\n"
2252 			      "---------------------------------------\n"
2253 			      "END OF INNODB TABLESPACE MONITOR OUTPUT\n"
2254 			      "=======================================\n",
2255 			      stderr);
2256 		}
2257 
2258 		if (srv_print_innodb_table_monitor
2259 		    && difftime(current_time, last_table_monitor_time) > 60) {
2260 
2261 			last_table_monitor_time = ut_time();
2262 
2263 			fputs("===========================================\n",
2264 			      stderr);
2265 
2266 			ut_print_timestamp(stderr);
2267 
2268 			fputs(" INNODB TABLE MONITOR OUTPUT\n"
2269 			      "===========================================\n",
2270 			      stderr);
2271 			dict_print();
2272 
2273 			fputs("-----------------------------------\n"
2274 			      "END OF INNODB TABLE MONITOR OUTPUT\n"
2275 			      "==================================\n",
2276 			      stderr);
2277 		}
2278 	}
2279 
2280 	if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) {
2281 		goto exit_func;
2282 	}
2283 
2284 	if (srv_print_innodb_monitor
2285 	    || srv_print_innodb_lock_monitor
2286 	    || srv_print_innodb_tablespace_monitor
2287 	    || srv_print_innodb_table_monitor) {
2288 		goto loop;
2289 	}
2290 
2291 	srv_monitor_active = FALSE;
2292 
2293 	goto loop;
2294 
2295 exit_func:
2296 	srv_monitor_active = FALSE;
2297 
2298 	/* We count the number of threads in os_thread_exit(). A created
2299 	thread should always use that to exit and not use return() to exit. */
2300 
2301 	os_thread_exit(NULL);
2302 
2303 	OS_THREAD_DUMMY_RETURN;
2304 }
2305 
/*********************************************************************//**
A thread which wakes up threads whose lock wait may have lasted too long.
@return	a dummy parameter */
UNIV_INTERN
os_thread_ret_t
srv_lock_timeout_thread(
/*====================*/
	void*	arg __attribute__((unused)))
			/* in: a dummy parameter required by
			os_thread_create */
{
	srv_slot_t*	slot;
	ibool		some_waits;
	double		wait_time;
	ulint		i;
	ib_int64_t	sig_count;

#ifdef UNIV_PFS_THREAD
	pfs_register_thread(srv_lock_timeout_thread_key);
#endif

loop:

	/* When someone is waiting for a lock, we wake up every second
	and check if a timeout has passed for a lock wait */

	sig_count = os_event_reset(srv_timeout_event);

	os_event_wait_time_low(srv_timeout_event, 1000000, sig_count);

	srv_lock_timeout_active = TRUE;

	mutex_enter(&kernel_mutex);

	some_waits = FALSE;

	/* Check of all slots if a thread is waiting there, and if it
	has exceeded the time limit */

	for (i = 0; i < OS_THREAD_MAX_N; i++) {

		slot = srv_mysql_table + i;

		if (slot->in_use) {
			trx_t*	trx;
			ulong	lock_wait_timeout;

			some_waits = TRUE;

			wait_time = ut_difftime(ut_time(), slot->suspend_time);

			trx = thr_get_trx(slot->thr);
			/* A timeout value >= 100000000 seconds is treated
			as "no timeout" (the same constant is checked in
			the lock-wait suspend code above). */
			lock_wait_timeout = thd_lock_wait_timeout(
				trx->mysql_thd);

			if (trx_is_interrupted(trx)
			    || (lock_wait_timeout < 100000000
				&& (wait_time > (double) lock_wait_timeout
				    || wait_time < 0))) {

				/* Timeout exceeded or a wrap-around in system
				time counter: cancel the lock request queued
				by the transaction and release possible
				other transactions waiting behind; it is
				possible that the lock has already been
				granted: in that case do nothing */

				if (trx->wait_lock) {
					lock_cancel_waiting_and_release(
						trx->wait_lock);
				}
			}
		}
	}

	os_event_reset(srv_lock_timeout_thread_event);

	mutex_exit(&kernel_mutex);

	if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) {
		goto exit_func;
	}

	/* As long as some thread is waiting, keep polling once a
	second. */
	if (some_waits) {
		goto loop;
	}

	srv_lock_timeout_active = FALSE;

#if 0
	/* The following synchronisation is disabled, since
	the InnoDB monitor output is to be updated every 15 seconds. */
	os_event_wait(srv_lock_timeout_thread_event);
#endif
	goto loop;

exit_func:
	srv_lock_timeout_active = FALSE;

	/* We count the number of threads in os_thread_exit(). A created
	thread should always use that to exit and not use return() to exit. */

	os_thread_exit(NULL);

	OS_THREAD_DUMMY_RETURN;
}
2412 
/*********************************************************************//**
A thread which prints warnings about semaphore waits which have lasted
too long. These can be used to track bugs which cause hangs.
Note: In order to make sync_arr_wake_threads_if_sema_free work as expected,
we should avoid waiting any mutexes in this function!
@return	a dummy parameter */
UNIV_INTERN
os_thread_ret_t
srv_error_monitor_thread(
/*=====================*/
	void*	arg __attribute__((unused)))
			/*!< in: a dummy parameter required by
			os_thread_create */
{
	/* number of successive fatal timeouts observed */
	ulint		fatal_cnt	= 0;
	ib_uint64_t	old_lsn;
	ib_uint64_t	new_lsn;
	ib_int64_t	sig_count;
	/* longest waiting thread for a semaphore */
	os_thread_id_t	waiter		= os_thread_get_curr_id();
	os_thread_id_t	old_waiter	= waiter;
	/* the semaphore that is being waited for */
	const void*	sema		= NULL;
	const void*	old_sema	= NULL;

	old_lsn = srv_start_lsn;

#ifdef UNIV_DEBUG_THREAD_CREATION
	fprintf(stderr, "Error monitor thread starts, id %lu\n",
		os_thread_pf(os_thread_get_curr_id()));
#endif

#ifdef UNIV_PFS_THREAD
	pfs_register_thread(srv_error_monitor_thread_key);
#endif

loop:
	srv_error_monitor_active = TRUE;

	/* Try to track a strange bug reported by Harald Fuchs and others,
	where the lsn seems to decrease at times */
	if (log_peek_lsn(&new_lsn)) {
		if (new_lsn < old_lsn) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: Error: old log sequence number %llu"
				" was greater\n"
				"InnoDB: than the new log sequence number %llu!\n"
				"InnoDB: Please submit a bug report"
				" to http://bugs.mysql.com\n",
				old_lsn, new_lsn);
			ut_ad(0);
		}

		old_lsn = new_lsn;
	}

	if (difftime(time(NULL), srv_last_monitor_time) > 60) {
		/* We refresh InnoDB Monitor values so that averages are
		printed from at most 60 last seconds */

		srv_refresh_innodb_monitor_stats();
	}

	/* Update the statistics collected for deciding LRU
	eviction policy. */
	buf_LRU_stat_update();

	/* Update the statistics collected for flush rate policy. */
	buf_flush_stat_update();

	/* In case mutex_exit is not a memory barrier, it is
	theoretically possible some threads are left waiting though
	the semaphore is already released. Wake up those threads: */

	sync_arr_wake_threads_if_sema_free();

	/* Crash the server only after the same thread has been seen
	stuck on the same semaphore in more than 10 successive
	once-a-second polls of this loop. */
	if (sync_array_print_long_waits(&waiter, &sema)
	    && sema == old_sema && os_thread_eq(waiter, old_waiter)) {
		fatal_cnt++;
		if (fatal_cnt > 10) {

			fprintf(stderr,
				"InnoDB: Error: semaphore wait has lasted"
				" > %lu seconds\n"
				"InnoDB: We intentionally crash the server,"
				" because it appears to be hung.\n",
				(ulong) srv_fatal_semaphore_wait_threshold);

			ut_error;
		}
	} else {
		/* A different waiter or semaphore: restart the count. */
		fatal_cnt = 0;
		old_waiter = waiter;
		old_sema = sema;
	}

	/* Flush stderr so that a database user gets the output
	to possible MySQL error file */

	fflush(stderr);

	sig_count = os_event_reset(srv_error_event);

	os_event_wait_time_low(srv_error_event, 1000000, sig_count);

	if (srv_shutdown_state < SRV_SHUTDOWN_CLEANUP) {

		goto loop;
	}

	srv_error_monitor_active = FALSE;

	/* We count the number of threads in os_thread_exit(). A created
	thread should always use that to exit and not use return() to exit. */

	os_thread_exit(NULL);

	OS_THREAD_DUMMY_RETURN;
}
2534 
2535 /**********************************************************************//**
2536 Check whether any background thread is active. If so return the thread
2537 type
2538 @return ULINT_UNDEFINED if all are suspended or have exited, thread
2539 type if any are still active. */
2540 UNIV_INTERN
2541 ulint
srv_get_active_thread_type(void)2542 srv_get_active_thread_type(void)
2543 /*============================*/
2544 {
2545 	ulint	i;
2546 	ibool	ret = ULINT_UNDEFINED;
2547 
2548 	mutex_enter(&kernel_mutex);
2549 
2550 	for (i = 0; i <= SRV_MASTER; ++i) {
2551 		if (srv_n_threads_active[i] != 0) {
2552 			ret = i;
2553 			break;
2554 		}
2555 	}
2556 
2557 	mutex_exit(&kernel_mutex);
2558 
2559 	return(ret);
2560 }
2561 
2562 /*********************************************************************//**
2563 This function prints progress message every 60 seconds during server
2564 shutdown, for any activities that master thread is pending on. */
2565 static
2566 void
srv_shutdown_print_master_pending(ib_time_t * last_print_time,ulint n_tables_to_drop,ulint n_bytes_merged,ulint n_pages_flushed)2567 srv_shutdown_print_master_pending(
2568 /*==============================*/
2569 	ib_time_t*	last_print_time,	/*!< last time the function
2570 						print the message */
2571 	ulint		n_tables_to_drop,	/*!< number of tables to
2572 						be dropped */
2573 	ulint		n_bytes_merged,		/*!< number of change buffer
2574 						just merged */
2575 	ulint		n_pages_flushed)	/*!< number of pages flushed */
2576 {
2577 	ib_time_t	current_time;
2578 	double		time_elapsed;
2579 
2580 	current_time = ut_time();
2581 	time_elapsed = ut_difftime(current_time, *last_print_time);
2582 
2583 	if (time_elapsed > 60) {
2584 		*last_print_time = ut_time();
2585 
2586 		if (n_tables_to_drop) {
2587 			ut_print_timestamp(stderr);
2588 			fprintf(stderr, "  InnoDB: Waiting for "
2589 				"%lu table(s) to be dropped\n",
2590 				(ulong) n_tables_to_drop);
2591 		}
2592 
2593 		/* Check change buffer merge, we only wait for change buffer
2594 		merge if it is a slow shutdown */
2595 		if (!srv_fast_shutdown && n_bytes_merged) {
2596 			ut_print_timestamp(stderr);
2597 			fprintf(stderr, "  InnoDB: Waiting for change "
2598 				"buffer merge to complete\n"
2599 				"  InnoDB: number of bytes of change buffer "
2600 				"just merged:  %lu\n",
2601 				n_bytes_merged);
2602 		}
2603 
2604 		if (n_pages_flushed) {
2605 			ut_print_timestamp(stderr);
2606 			fprintf(stderr, "  InnoDB: Waiting for "
2607 				"%lu pages to be flushed\n",
2608 				(ulong) n_pages_flushed);
2609 		}
2610         }
2611 }
2612 
2613 /*******************************************************************//**
2614 Tells the InnoDB server that there has been activity in the database
2615 and wakes up the master thread if it is suspended (not sleeping). Used
2616 in the MySQL interface. Note that there is a small chance that the master
2617 thread stays suspended (we do not protect our operation with the
2618 srv_sys_t->mutex, for performance reasons). */
2619 UNIV_INTERN
2620 void
srv_active_wake_master_thread(void)2621 srv_active_wake_master_thread(void)
2622 /*===============================*/
2623 {
2624 	srv_activity_count++;
2625 
2626 	if (srv_n_threads_active[SRV_MASTER] == 0) {
2627 
2628 		mutex_enter(&kernel_mutex);
2629 
2630 		srv_release_threads(SRV_MASTER, 1);
2631 
2632 		mutex_exit(&kernel_mutex);
2633 	}
2634 }
2635 
2636 /*******************************************************************//**
2637 Tells the purge thread that there has been activity in the database
2638 and wakes up the purge thread if it is suspended (not sleeping).  Note
2639 that there is a small chance that the purge thread stays suspended
2640 (we do not protect our operation with the kernel mutex, for
2641 performace reasons). */
2642 UNIV_INTERN
2643 void
srv_wake_purge_thread_if_not_active(void)2644 srv_wake_purge_thread_if_not_active(void)
2645 /*=====================================*/
2646 {
2647 	ut_ad(!mutex_own(&kernel_mutex));
2648 
2649 	if (srv_n_purge_threads > 0
2650 	    && srv_n_threads_active[SRV_WORKER] == 0) {
2651 
2652 		mutex_enter(&kernel_mutex);
2653 
2654 		srv_release_threads(SRV_WORKER, 1);
2655 
2656 		mutex_exit(&kernel_mutex);
2657 	}
2658 }
2659 
2660 /*******************************************************************//**
2661 Wakes up the master thread if it is suspended or being suspended. */
2662 UNIV_INTERN
2663 void
srv_wake_master_thread(void)2664 srv_wake_master_thread(void)
2665 /*========================*/
2666 {
2667 	srv_activity_count++;
2668 
2669 	mutex_enter(&kernel_mutex);
2670 
2671 	srv_release_threads(SRV_MASTER, 1);
2672 
2673 	mutex_exit(&kernel_mutex);
2674 }
2675 
2676 /*******************************************************************//**
2677 Wakes up the purge thread if it's not already awake. */
2678 UNIV_INTERN
2679 void
srv_wake_purge_thread(void)2680 srv_wake_purge_thread(void)
2681 /*=======================*/
2682 {
2683 	ut_ad(!mutex_own(&kernel_mutex));
2684 
2685 	if (srv_n_purge_threads > 0) {
2686 
2687 		mutex_enter(&kernel_mutex);
2688 
2689 		srv_release_threads(SRV_WORKER, 1);
2690 
2691 		mutex_exit(&kernel_mutex);
2692 	}
2693 }
2694 
2695 /**********************************************************************
2696 The master thread is tasked to ensure that flush of log file happens
2697 once every second in the background. This is to ensure that not more
2698 than one second of trxs are lost in case of crash when
2699 innodb_flush_logs_at_trx_commit != 1 */
2700 static
2701 void
srv_sync_log_buffer_in_background(void)2702 srv_sync_log_buffer_in_background(void)
2703 /*===================================*/
2704 {
2705 	time_t	current_time = time(NULL);
2706 
2707 	srv_main_thread_op_info = "flushing log";
2708 	if (difftime(current_time, srv_last_log_flush_time) >= 1) {
2709 		log_buffer_sync_in_background(TRUE);
2710 		srv_last_log_flush_time = current_time;
2711 		srv_log_writes_and_flush++;
2712 	}
2713 }
2714 
2715 /********************************************************************//**
2716 Do a full purge, reconfigure the purge sub-system if a dynamic
2717 change is detected. */
2718 static
2719 void
srv_master_do_purge(void)2720 srv_master_do_purge(void)
2721 /*=====================*/
2722 {
2723 	ulint	n_pages_purged;
2724 
2725 	ut_ad(!mutex_own(&kernel_mutex));
2726 
2727 	ut_a(srv_n_purge_threads == 0);
2728 
2729 	do {
2730 		/* Check for shutdown and change in purge config. */
2731 		if (srv_fast_shutdown && srv_shutdown_state > 0) {
2732 			/* Nothing to purge. */
2733 			n_pages_purged = 0;
2734 		} else {
2735 			n_pages_purged = trx_purge(srv_purge_batch_size);
2736 		}
2737 
2738 		srv_sync_log_buffer_in_background();
2739 
2740 	} while (n_pages_purged > 0);
2741 }
2742 
/*********************************************************************//**
The master thread controlling the server.

Runs three nested activity levels, connected by gotos:
- "loop":            ~1 second iterations while users are active;
- the 10-second tail after the for-loop, for heavier periodic work;
- "background_loop"/"flush_loop": housekeeping when the server is quiet
  or shutting down;
- "suspend_thread":  parks the thread on its slot event until woken.
@return	a dummy parameter */
UNIV_INTERN
os_thread_ret_t
srv_master_thread(
/*==============*/
	void*	arg __attribute__((unused)))
			/*!< in: a dummy parameter required by
			os_thread_create */
{
	buf_pool_stat_t buf_stat;
	srv_slot_t*	slot;
	ulint		old_activity_count;
	ulint		n_pages_purged	= 0;
	ulint		n_bytes_merged;
	ulint		n_pages_flushed;
	ulint		n_bytes_archived;
	ulint		n_tables_to_drop;
	ulint		n_ios;		/* i/o total at current sample point */
	ulint		n_ios_old;	/* i/o total ~1 second ago */
	ulint		n_ios_very_old;	/* i/o total ~10 seconds ago */
	ulint		n_pend_ios;
	ulint		next_itr_time;	/* in milliseconds (ut_time_ms()) */
	ulint		i;
	ib_time_t	last_print_time;

	my_thread_init();
#ifdef UNIV_DEBUG_THREAD_CREATION
	fprintf(stderr, "Master thread starts, id %lu\n",
		os_thread_pf(os_thread_get_curr_id()));
#endif

#ifdef UNIV_PFS_THREAD
	pfs_register_thread(srv_master_thread_key);
#endif

	srv_main_thread_process_no = os_proc_get_number();
	srv_main_thread_id = os_thread_pf(os_thread_get_curr_id());

	mutex_enter(&kernel_mutex);

	slot = srv_table_reserve_slot(SRV_MASTER);

	srv_n_threads_active[SRV_MASTER]++;

	mutex_exit(&kernel_mutex);

	last_print_time = ut_time();
loop:
	/*****************************************************************/
	/* ---- When there is database activity by users, we cycle in this
	loop */

	srv_main_thread_op_info = "reserving kernel mutex";

	buf_get_total_stat(&buf_stat);
	n_ios_very_old = log_sys->n_log_ios + buf_stat.n_pages_read
		+ buf_stat.n_pages_written;
	mutex_enter(&kernel_mutex);

	/* Store the user activity counter at the start of this loop */
	old_activity_count = srv_activity_count;

	mutex_exit(&kernel_mutex);

	if (srv_force_recovery >= SRV_FORCE_NO_BACKGROUND) {

		goto suspend_thread;
	}

	/* ---- We run the following loop approximately once per second
	when there is database activity */

	srv_last_log_flush_time = time(NULL);

	/* Sleep for 1 second on entrying the for loop below the first time. */
	next_itr_time = ut_time_ms() + 1000;

	for (i = 0; i < 10; i++) {
		ulint	cur_time = ut_time_ms();

#ifdef UNIV_DEBUG
		if (btr_cur_limit_optimistic_insert_debug
		    && srv_n_purge_threads == 0) {
			/* If btr_cur_limit_optimistic_insert_debug is enabled
			and no purge_threads, purge opportunity is increased
			by x100 (1purge/100msec), to speed up debug scripts
			which should wait for purged. */
			next_itr_time -= 900;

			srv_main_thread_op_info = "master purging";

			srv_master_do_purge();

			if (srv_fast_shutdown && srv_shutdown_state > 0) {

				goto background_loop;
			}
		}
#endif /* UNIV_DEBUG */

		/* ALTER TABLE in MySQL requires on Unix that the table handler
		can drop tables lazily after there no longer are SELECT
		queries to them. */

		srv_main_thread_op_info = "doing background drop tables";

		row_drop_tables_for_mysql_in_background();

		srv_main_thread_op_info = "";

		if (srv_fast_shutdown && srv_shutdown_state > 0) {

			goto background_loop;
		}

		buf_get_total_stat(&buf_stat);

		/* Sample the i/o total before this iteration's sleep, so
		the delta below measures roughly one second of activity. */
		n_ios_old = log_sys->n_log_ios + buf_stat.n_pages_read
			+ buf_stat.n_pages_written;

		srv_main_thread_op_info = "sleeping";
		srv_main_1_second_loops++;

		if (next_itr_time > cur_time
		    && srv_shutdown_state == SRV_SHUTDOWN_NONE) {

			/* Get sleep interval in micro seconds. We use
			ut_min() to avoid long sleep in case of
			wrap around. */
			os_thread_sleep(ut_min(1000000,
					(next_itr_time - cur_time)
					 * 1000));
			srv_main_sleeps++;
		}

		/* Each iteration should happen at 1 second interval. */
		next_itr_time = ut_time_ms() + 1000;

		/* Flush logs if needed */
		srv_sync_log_buffer_in_background();

		srv_main_thread_op_info = "making checkpoint";
		log_free_check();

		/* If i/os during one second sleep were less than 5% of
		capacity, we assume that there is free disk i/o capacity
		available, and it makes sense to do an insert buffer merge. */

		buf_get_total_stat(&buf_stat);
		n_pend_ios = buf_get_n_pending_ios()
			+ log_sys->n_pending_writes;
		n_ios = log_sys->n_log_ios + buf_stat.n_pages_read
			+ buf_stat.n_pages_written;
		if (n_pend_ios < SRV_PEND_IO_THRESHOLD
		    && (n_ios - n_ios_old < SRV_RECENT_IO_ACTIVITY)) {
			srv_main_thread_op_info = "doing insert buffer merge";
			ibuf_contract_for_n_pages(FALSE, PCT_IO(5));

			/* Flush logs if needed */
			srv_sync_log_buffer_in_background();
		}

		if (UNIV_UNLIKELY(buf_get_modified_ratio_pct()
				  > srv_max_buf_pool_modified_pct)) {

			/* Try to keep the number of modified pages in the
			buffer pool under the limit wished by the user */

			srv_main_thread_op_info =
				"flushing buffer pool pages";
			n_pages_flushed = buf_flush_list(
				PCT_IO(100), IB_ULONGLONG_MAX);

		} else if (srv_adaptive_flushing) {

			/* Try to keep the rate of flushing of dirty
			pages such that redo log generation does not
			produce bursts of IO at checkpoint time. */
			ulint n_flush = buf_flush_get_desired_flush_rate();

			if (n_flush) {
				srv_main_thread_op_info =
					"flushing buffer pool pages";
				n_flush = ut_min(PCT_IO(100), n_flush);
				n_pages_flushed =
					buf_flush_list(
						n_flush,
						IB_ULONGLONG_MAX);
			}
		}

		/* NOTE(review): srv_activity_count is read here without
		holding kernel_mutex; presumably a stale read only delays
		entering the background loop by one iteration — confirm. */
		if (srv_activity_count == old_activity_count) {

			/* There is no user activity at the moment, go to
			the background loop */

			goto background_loop;
		}
	}

	/* ---- We perform the following code approximately once per
	10 seconds when there is database activity */

#ifdef MEM_PERIODIC_CHECK
	/* Check magic numbers of every allocated mem block once in 10
	seconds */
	mem_validate_all_blocks();
#endif
	/* If i/os during the 10 second period were less than 200% of
	capacity, we assume that there is free disk i/o capacity
	available, and it makes sense to flush srv_io_capacity pages.

	Note that this is done regardless of the fraction of dirty
	pages relative to the max requested by the user. The one second
	loop above requests writes for that case. The writes done here
	are not required, and may be disabled. */

	buf_get_total_stat(&buf_stat);
	n_pend_ios = buf_get_n_pending_ios() + log_sys->n_pending_writes;
	n_ios = log_sys->n_log_ios + buf_stat.n_pages_read
		+ buf_stat.n_pages_written;

	srv_main_10_second_loops++;
	if (n_pend_ios < SRV_PEND_IO_THRESHOLD
	    && (n_ios - n_ios_very_old < SRV_PAST_IO_ACTIVITY)) {

		srv_main_thread_op_info = "flushing buffer pool pages";
		buf_flush_list(PCT_IO(100), IB_ULONGLONG_MAX);

		/* Flush logs if needed */
		srv_sync_log_buffer_in_background();
	}

	/* We run a batch of insert buffer merge every 10 seconds,
	even if the server were active */

	srv_main_thread_op_info = "doing insert buffer merge";
	ibuf_contract_for_n_pages(FALSE, PCT_IO(5));

	/* Flush logs if needed */
	srv_sync_log_buffer_in_background();

	/* Purge is done by this thread only when there is no dedicated
	purge thread (innodb_purge_threads == 0). */
	if (srv_n_purge_threads == 0) {
		srv_main_thread_op_info = "master purging";

		srv_master_do_purge();

		if (srv_fast_shutdown && srv_shutdown_state > 0) {

			goto background_loop;
		}
	}

	srv_main_thread_op_info = "flushing buffer pool pages";

	/* Flush a few oldest pages to make a new checkpoint younger */

	if (buf_get_modified_ratio_pct() > 70) {

		/* If there are lots of modified pages in the buffer pool
		(> 70 %), we assume we can afford reserving the disk(s) for
		the time it requires to flush 100 pages */

		n_pages_flushed = buf_flush_list(
			PCT_IO(100), IB_ULONGLONG_MAX);
	} else {
		/* Otherwise, we only flush a small number of pages so that
		we do not unnecessarily use much disk i/o capacity from
		other work */

		n_pages_flushed = buf_flush_list(
			  PCT_IO(10), IB_ULONGLONG_MAX);
	}

	srv_main_thread_op_info = "making checkpoint";

	/* Make a new checkpoint about once in 10 seconds */

	log_checkpoint(TRUE, FALSE);

	srv_main_thread_op_info = "reserving kernel mutex";

	mutex_enter(&kernel_mutex);

	/* ---- When there is database activity, we jump from here back to
	the start of loop */

	if (srv_activity_count != old_activity_count) {
		mutex_exit(&kernel_mutex);
		goto loop;
	}

	mutex_exit(&kernel_mutex);

	/* If the database is quiet, we enter the background loop */

	/*****************************************************************/
background_loop:
	/* ---- In this loop we run background operations when the server
	is quiet from user activity. Also in the case of a shutdown, we
	loop here, flushing the buffer pool to the data files. */

	/* The server has been quiet for a while: start running background
	operations */
	srv_main_background_loops++;
	srv_main_thread_op_info = "doing background drop tables";

	n_tables_to_drop = row_drop_tables_for_mysql_in_background();

	if (n_tables_to_drop > 0) {
		/* Do not monopolize the CPU even if there are tables waiting
		in the background drop queue. (It is essentially a bug if
		MySQL tries to drop a table while there are still open handles
		to it and we had to put it to the background drop queue.) */

		if (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
			os_thread_sleep(100000);
		}
	}

	if (srv_n_purge_threads == 0) {
		srv_main_thread_op_info = "master purging";

		srv_master_do_purge();
	}

	srv_main_thread_op_info = "reserving kernel mutex";

	mutex_enter(&kernel_mutex);
	if (srv_activity_count != old_activity_count) {
		mutex_exit(&kernel_mutex);
		goto loop;
	}
	mutex_exit(&kernel_mutex);

	srv_main_thread_op_info = "doing insert buffer merge";

	if (srv_fast_shutdown && srv_shutdown_state > 0) {
		n_bytes_merged = 0;
	} else {
		/* This should do an amount of IO similar to the number of
		dirty pages that will be flushed in the call to
		buf_flush_list below. Otherwise, the system favors
		clean pages over cleanup throughput. */
		n_bytes_merged = ibuf_contract_for_n_pages(FALSE,
							   PCT_IO(100));
	}

	srv_main_thread_op_info = "reserving kernel mutex";

	mutex_enter(&kernel_mutex);
	if (srv_activity_count != old_activity_count) {
		mutex_exit(&kernel_mutex);
		goto loop;
	}
	mutex_exit(&kernel_mutex);

flush_loop:
	/* Repeatedly flush the buffer pool until the dirty-page ratio
	drops under srv_max_buf_pool_modified_pct (see the goto at the
	bottom of this section). */
	srv_main_thread_op_info = "flushing buffer pool pages";
	srv_main_flush_loops++;
	if (srv_fast_shutdown < 2 || srv_shutdown_state == SRV_SHUTDOWN_NONE) {
		n_pages_flushed = buf_flush_list(
			  PCT_IO(100), IB_ULONGLONG_MAX);
	} else {
		/* In the fastest shutdown we do not flush the buffer pool
		to data files: we set n_pages_flushed to 0 artificially. */
		ut_ad(srv_fast_shutdown == 2);
		ut_ad(srv_shutdown_state > 0);

		n_pages_flushed = 0;

		DBUG_PRINT("master", ("doing very fast shutdown"));
	}

	srv_main_thread_op_info = "reserving kernel mutex";

	mutex_enter(&kernel_mutex);
	if (srv_activity_count != old_activity_count) {
		mutex_exit(&kernel_mutex);
		goto loop;
	}
	mutex_exit(&kernel_mutex);

	srv_main_thread_op_info = "waiting for buffer pool flush to end";
	buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);

	/* Flush logs if needed */
	srv_sync_log_buffer_in_background();

	srv_main_thread_op_info = "making checkpoint";

	log_checkpoint(TRUE, FALSE);

	if (!(srv_fast_shutdown == 2 && srv_shutdown_state > 0)
	    && (buf_get_modified_ratio_pct()
		> srv_max_buf_pool_modified_pct)) {

		/* If the server is doing a very fast shutdown, then
		we will not come here. */

		/* Try to keep the number of modified pages in the
		buffer pool under the limit wished by the user */

		goto flush_loop;
	}

	srv_main_thread_op_info = "reserving kernel mutex";

	mutex_enter(&kernel_mutex);
	if (srv_activity_count != old_activity_count) {
		mutex_exit(&kernel_mutex);
		goto loop;
	}
	mutex_exit(&kernel_mutex);
	/* Log archiving is disabled; the call below is kept for
	reference only. */
	/*
	srv_main_thread_op_info = "archiving log (if log archive is on)";

	log_archive_do(FALSE, &n_bytes_archived);
	*/
	n_bytes_archived = 0;

	/* Print progress message every 60 seconds during shutdown */
	if (srv_shutdown_state > 0 && srv_print_verbose_log) {
		srv_shutdown_print_master_pending(&last_print_time,
						  n_tables_to_drop,
						  n_bytes_merged,
						  n_pages_flushed);
	}

	/* Keep looping in the background loop if still work to do */

	if (srv_fast_shutdown && srv_shutdown_state > 0) {
		if (n_tables_to_drop + n_pages_flushed
		    + n_bytes_archived != 0) {

			/* If we are doing a fast shutdown (= the default)
			we do not do purge or insert buffer merge. But we
			flush the buffer pool completely to disk.
			In a 'very fast' shutdown we do not flush the buffer
			pool to data files: we have set n_pages_flushed to
			0 artificially. */

			goto background_loop;
		}
	} else if (n_tables_to_drop
		   + n_pages_purged + n_bytes_merged + n_pages_flushed
		   + n_bytes_archived != 0) {

		/* In a 'slow' shutdown we run purge and the insert buffer
		merge to completion */

		goto background_loop;
	}

	/* There is no work for background operations either: suspend
	master thread to wait for more server activity */

suspend_thread:
	srv_main_thread_op_info = "suspending";

	mutex_enter(&kernel_mutex);

	/* Do not suspend while there are still tables queued for
	background drop; go service them instead. */
	if (row_get_background_drop_list_len_low() > 0) {
		mutex_exit(&kernel_mutex);

		goto loop;
	}

	srv_suspend_thread(slot);

	mutex_exit(&kernel_mutex);

	/* DO NOT CHANGE THIS STRING. innobase_start_or_create_for_mysql()
	waits for database activity to die down when converting < 4.1.x
	databases, and relies on this string being exactly as it is. InnoDB
	manual also mentions this string in several places. */
	srv_main_thread_op_info = "waiting for server activity";

	os_event_wait(slot->event);

	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
		my_thread_end();
		os_thread_exit(NULL);
	}

	/* When there is user activity, InnoDB will set the event and the
	main thread goes back to loop. */

	goto loop;
}
3235 
/*********************************************************************//**
Asynchronous purge thread.

Loops until shutdown: suspends itself on its slot event when the
history list is short or repeated attempts purged nothing, otherwise
purges in batches of srv_purge_batch_size pages.  The `retries`
counter limits how many consecutive zero-progress iterations are
allowed before the thread goes back to sleep.
@return	a dummy parameter */
UNIV_INTERN
os_thread_ret_t
srv_purge_thread(
/*=============*/
	void*	arg __attribute__((unused)))	/*!< in: a dummy parameter
						required by os_thread_create */
{
	srv_slot_t*	slot;
	ulint		retries = 0;	/* consecutive zero-progress rounds */
	/* ULINT_UNDEFINED marks "no purge attempted yet", so the first
	iteration does not count as zero progress. */
	ulint		n_total_purged = ULINT_UNDEFINED;

	my_thread_init();
	/* This thread only exists in the single-purge-thread
	configuration. */
	ut_a(srv_n_purge_threads == 1);

#ifdef UNIV_PFS_THREAD
	pfs_register_thread(srv_purge_thread_key);
#endif /* UNIV_PFS_THREAD */

#ifdef UNIV_DEBUG_THREAD_CREATION
	fprintf(stderr, "InnoDB: Purge thread running, id %lu\n",
		os_thread_pf(os_thread_get_curr_id()));
#endif /* UNIV_DEBUG_THREAD_CREATION */

	mutex_enter(&kernel_mutex);

	slot = srv_table_reserve_slot(SRV_WORKER);

	++srv_n_threads_active[SRV_WORKER];

	mutex_exit(&kernel_mutex);

	while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) {

		ulint	n_pages_purged = 0;

		/* If there are very few records to purge or the last
		purge didn't purge any records then wait for activity.
	        We peek at the history len without holding any mutex
		because in the worst case we will end up waiting for
		the next purge event. */
		if (trx_sys->rseg_history_len < srv_purge_batch_size
		    || (n_total_purged == 0
			&& retries >= TRX_SYS_N_RSEGS)) {

			mutex_enter(&kernel_mutex);

			srv_suspend_thread(slot);

			mutex_exit(&kernel_mutex);

			/* Sleep until woken by srv_wake_purge_thread()
			or srv_release_threads(). */
			os_event_wait(slot->event);

			retries = 0;
		}

		/* Check for shutdown and whether we should do purge at all. */
		if (srv_force_recovery >= SRV_FORCE_NO_BACKGROUND
		    || srv_shutdown_state != 0
		    || srv_fast_shutdown) {

			break;
		}

		if (n_total_purged == 0 && retries <= TRX_SYS_N_RSEGS) {
			++retries;
		} else if (n_total_purged > 0) {
			/* Made progress last round: reset the counters.
			Note this also leaves the initial ULINT_UNDEFINED
			state and starts real accounting. */
			retries = 0;
			n_total_purged = 0;
		}

		/* Purge until there are no more records to purge and there is
		no change in configuration or server state. */
		do {
			n_pages_purged = trx_purge(srv_purge_batch_size);

			n_total_purged += n_pages_purged;

		} while (n_pages_purged > 0 && !srv_fast_shutdown);

		srv_sync_log_buffer_in_background();
	}

	mutex_enter(&kernel_mutex);

	/* Decrement the active count. */
	srv_suspend_thread(slot);

	/* Release the slot for reuse before exiting. */
	slot->in_use = FALSE;

	mutex_exit(&kernel_mutex);

#ifdef UNIV_DEBUG_THREAD_CREATION
	fprintf(stderr, "InnoDB: Purge thread exiting, id %lu\n",
		os_thread_pf(os_thread_get_curr_id()));
#endif /* UNIV_DEBUG_THREAD_CREATION */

	my_thread_end();

	/* We count the number of threads in os_thread_exit(). A created
	thread should always use that to exit and not use return() to exit. */
	os_thread_exit(NULL);

	OS_THREAD_DUMMY_RETURN;	/* Not reached, avoid compiler warning */
}
3343 
/**********************************************************************//**
Enqueues a task to server task queue and releases a worker thread, if there
is a suspended one.  The queue append and the worker wake-up are done
atomically under kernel_mutex. */
UNIV_INTERN
void
srv_que_task_enqueue_low(
/*=====================*/
	que_thr_t*	thr)	/*!< in: query thread */
{
	ut_ad(thr);

	mutex_enter(&kernel_mutex);

	/* Append to the tail of the server task queue. */
	UT_LIST_ADD_LAST(queue, srv_sys->tasks, thr);

	/* Wake at most one suspended worker to process the task. */
	srv_release_threads(SRV_WORKER, 1);

	mutex_exit(&kernel_mutex);
}
3363