1 /* Copyright (c) 2000, 2021, Oracle and/or its affiliates.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License, version 2.0,
5    as published by the Free Software Foundation.
6 
7    This program is also distributed with certain software (including
8    but not limited to OpenSSL) that is licensed under separate terms,
9    as designated in a particular file or component or in included license
10    documentation.  The authors of MySQL hereby grant you an additional
11    permission to link the program and your derivative works with the
12    separately licensed software that they have included with MySQL.
13 
14    Without limiting anything contained in the foregoing, this file,
15    which is part of C Driver for MySQL (Connector/C), is also subject to the
16    Universal FOSS Exception, version 1.0, a copy of which can be found at
17    http://oss.oracle.com/licenses/universal-foss-exception.
18 
19    This program is distributed in the hope that it will be useful,
20    but WITHOUT ANY WARRANTY; without even the implied warranty of
21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22    GNU General Public License, version 2.0, for more details.
23 
24    You should have received a copy of the GNU General Public License
25    along with this program; if not, write to the Free Software
26    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
27 
28 /**
29   @file
  These functions handle keyblock caching for ISAM and MyISAM tables.
31 
32   One cache can handle many files.
33   It must contain buffers of the same blocksize.
34   init_key_cache() should be used to init cache handler.
35 
36   The free list (free_block_list) is a stack like structure.
37   When a block is freed by free_block(), it is pushed onto the stack.
38   When a new block is required it is first tried to pop one from the stack.
39   If the stack is empty, it is tried to get a never-used block from the pool.
40   If this is empty too, then a block is taken from the LRU ring, flushing it
  to disk, if necessary. This is handled in find_key_block().
42   With the new free list, the blocks can have three temperatures:
43   hot, warm and cold (which is free). This is remembered in the block header
44   by the enum BLOCK_TEMPERATURE temperature variable. Remembering the
  temperature is necessary to correctly count the number of warm blocks,
46   which is required to decide when blocks are allowed to become hot. Whenever
47   a block is inserted to another (sub-)chain, we take the old and new
48   temperature into account to decide if we got one more or less warm block.
49   blocks_unused is the sum of never used blocks in the pool and of currently
50   free blocks. blocks_used is the number of blocks fetched from the pool and
51   as such gives the maximum number of in-use blocks at any time.
52 */
53 
54 /*
55   Key Cache Locking
56   =================
57 
58   All key cache locking is done with a single mutex per key cache:
59   keycache->cache_lock. This mutex is locked almost all the time
60   when executing code in this file (mf_keycache.c).
61   However it is released for I/O and some copy operations.
62 
63   The cache_lock is also released when waiting for some event. Waiting
64   and signalling is done via condition variables. In most cases the
65   thread waits on its thread->suspend condition variable. Every thread
66   has a my_thread_var structure, which contains this variable and a
67   '*next' and '**prev' pointer. These pointers are used to insert the
68   thread into a wait queue.
69 
70   A thread can wait for one block and thus be in one wait queue at a
71   time only.
72 
73   Before starting to wait on its condition variable with
74   mysql_cond_wait(), the thread enters itself to a specific wait queue
75   with link_into_queue() (double linked with '*next' + '**prev') or
76   wait_on_queue() (single linked with '*next').
77 
78   Another thread, when releasing a resource, looks up the waiting thread
79   in the related wait queue. It sends a signal with
80   mysql_cond_signal() to the waiting thread.
81 
82   NOTE: Depending on the particular wait situation, either the sending
83   thread removes the waiting thread from the wait queue with
84   unlink_from_queue() or release_whole_queue() respectively, or the waiting
85   thread removes itself.
86 
87   There is one exception from this locking scheme when one thread wants
88   to reuse a block for some other address. This works by first marking
89   the block reserved (status= BLOCK_IN_SWITCH) and then waiting for all
90   threads that are reading the block to finish. Each block has a
91   reference to a condition variable (condvar). It holds a reference to
92   the thread->suspend condition variable for the waiting thread (if such
93   a thread exists). When that thread is signaled, the reference is
94   cleared. The number of readers of a block is registered in
95   block->hash_link->requests. See wait_for_readers() / remove_reader()
96   for details. This is similar to the above, but it clearly means that
97   only one thread can wait for a particular block. There is no queue in
  this case. Strangely enough block->condvar is used for waiting for the
99   assigned hash_link only. More precisely it is used to wait for all
100   requests to be unregistered from the assigned hash_link.
101 
102   The resize_queue serves two purposes:
103   1. Threads that want to do a resize wait there if in_resize is set.
104      This is not used in the server. The server refuses a second resize
105      request if one is already active. keycache->in_init is used for the
106      synchronization. See set_var.cc.
107   2. Threads that want to access blocks during resize wait here during
108      the re-initialization phase.
109   When the resize is done, all threads on the queue are signalled.
110   Hypothetical resizers can compete for resizing, and read/write
111   requests will restart to request blocks from the freshly resized
112   cache. If the cache has been resized too small, it is disabled and
113   'can_be_used' is false. In this case read/write requests bypass the
114   cache. Since they increment and decrement 'cnt_for_resize_op', the
115   next resizer can wait on the queue 'waiting_for_resize_cnt' until all
116   I/O finished.
117 */
118 
119 #include "mysys_priv.h"
120 #include "mysys_err.h"
121 #include <keycache.h>
122 #include "my_static.h"
123 #include <m_string.h>
124 #include <my_bit.h>
125 #include <errno.h>
126 #include <stdarg.h>
127 #include "probes_mysql.h"
128 #include "my_thread_local.h"
129 
130 #define STRUCT_PTR(TYPE, MEMBER, a)                                           \
131           (TYPE *) ((char *) (a) - offsetof(TYPE, MEMBER))
132 
133 /* types of condition variables */
134 #define  COND_FOR_REQUESTED 0
135 #define  COND_FOR_SAVED     1
136 #define  COND_FOR_READERS   2
137 
138 typedef mysql_cond_t KEYCACHE_CONDVAR;
139 
140 /* descriptor of the page in the key cache block buffer */
/*
  Descriptor of a page in the key cache block buffer: identifies the
  on-disk page (file + offset) that a cached block holds.
*/
struct st_keycache_page
{
  int file;               /* file to which the page belongs       */
  my_off_t filepos;       /* position of the page in the file     */
};
typedef struct st_keycache_page KEYCACHE_PAGE;
147 
148 /* element in the chain of a hash table bucket */
/*
  Element in the chain of a hash table bucket.
  Maps a (file, disk position) pair to the block that caches that page.
*/
struct st_hash_link
{
  struct st_hash_link *next, **prev; /* to connect links in the same bucket  */
  struct st_block_link *block;       /* reference to the block for the page: */
  File file;                         /* from such a file                     */
  my_off_t diskpos;                  /* with such an offset                  */
  uint requests;                     /* number of requests for the page      */
};
157 
158 /* simple states of a block */
#define BLOCK_ERROR           1 /* an error occurred when performing file i/o */
160 #define BLOCK_READ            2 /* file block is in the block buffer         */
161 #define BLOCK_IN_SWITCH       4 /* block is preparing to read new page       */
162 #define BLOCK_REASSIGNED      8 /* blk does not accept requests for old page */
163 #define BLOCK_IN_FLUSH       16 /* block is selected for flush               */
164 #define BLOCK_CHANGED        32 /* block buffer contains a dirty page        */
165 #define BLOCK_IN_USE         64 /* block is not free                         */
166 #define BLOCK_IN_EVICTION   128 /* block is selected for eviction            */
167 #define BLOCK_IN_FLUSHWRITE 256 /* block is in write to file                 */
168 #define BLOCK_FOR_UPDATE    512 /* block is selected for buffer modification */
169 
170 /* page status, returned by find_key_block */
171 #define PAGE_READ               0
172 #define PAGE_TO_BE_READ         1
173 #define PAGE_WAIT_TO_BE_READ    2
174 
175 /* block temperature determines in which (sub-)chain the block currently is */
176 enum BLOCK_TEMPERATURE { BLOCK_COLD /*free*/ , BLOCK_WARM , BLOCK_HOT };
177 
178 /* key cache block */
/*
  Key cache block: one cached page buffer plus the bookkeeping needed to
  place the block in the LRU ring, the per-file dirty/clean chains and
  the waiting-thread queues. See the file header comment for how the
  temperature field drives the warm/hot accounting.
*/
struct st_block_link
{
  struct st_block_link
    *next_used, **prev_used;   /* to connect links in the LRU chain (ring)   */
  struct st_block_link
    *next_changed, **prev_changed; /* for lists of file dirty/clean blocks   */
  struct st_hash_link *hash_link; /* backward ptr to referring hash_link     */
  KEYCACHE_WQUEUE wqueue[2]; /* queues on waiting requests for new/old pages */
  uint requests;          /* number of requests for the block                */
  uchar *buffer;          /* buffer for the block page                       */
  uint offset;            /* beginning of modified data in the buffer        */
  uint length;            /* end of data in the buffer                       */
  uint status;            /* state of the block (BLOCK_* flags above)        */
  enum BLOCK_TEMPERATURE temperature; /* block temperature: cold, warm, hot */
  uint hits_left;         /* number of hits left until promotion             */
  ulonglong last_hit_time; /* timestamp of the last hit                      */
  KEYCACHE_CONDVAR *condvar; /* condition variable for 'no readers' event    */
};
197 
198 KEY_CACHE dflt_key_cache_var;
199 KEY_CACHE *dflt_key_cache= &dflt_key_cache_var;
200 
201 #define FLUSH_CACHE         2000            /* sort this many blocks at once */
202 
203 static void change_key_cache_param(KEY_CACHE *keycache,
204                                    ulonglong division_limit,
205                                    ulonglong age_threshold);
206 static int flush_all_key_blocks(KEY_CACHE *keycache,
207                                 st_keycache_thread_var *thread_var);
208 
209 static void wait_on_queue(KEYCACHE_WQUEUE *wqueue,
210                           mysql_mutex_t *mutex,
211                           st_keycache_thread_var *thread);
212 static void release_whole_queue(KEYCACHE_WQUEUE *wqueue);
213 
214 static void free_block(KEY_CACHE *keycache,
215                        st_keycache_thread_var *thread_var,
216                        BLOCK_LINK *block);
217 
218 #define KEYCACHE_HASH(f, pos)                                                 \
219 (((ulong) ((pos) / keycache->key_cache_block_size) +                          \
220                                      (ulong) (f)) & (keycache->hash_entries-1))
221 #define FILE_HASH(f)                 ((uint) (f) & (CHANGED_BLOCKS_HASH-1))
222 
223 #define BLOCK_NUMBER(b)                                                       \
224   ((uint) (((char*)(b)-(char *) keycache->block_root)/sizeof(BLOCK_LINK)))
225 #define HASH_LINK_NUMBER(h)                                                   \
226   ((uint) (((char*)(h)-(char *) keycache->hash_link_root)/sizeof(HASH_LINK)))
227 
228 #if !defined(NDEBUG)
229 static int fail_block(BLOCK_LINK *block);
230 static int fail_hlink(HASH_LINK *hlink);
231 static int cache_empty(KEY_CACHE *keycache);
232 #endif
233 
next_power(uint value)234 static inline uint next_power(uint value)
235 {
236   return (uint) my_round_up_to_next_power((uint32) value) << 1;
237 }
238 
239 
240 /*
241   Initialize a key cache
242 
243   SYNOPSIS
244     init_key_cache()
245     keycache			pointer to a key cache data structure
246     key_cache_block_size	size of blocks to keep cached data
247     use_mem                 	total memory to use for the key cache
248     division_limit		division limit (may be zero)
249     age_threshold		age threshold (may be zero)
250 
251   RETURN VALUE
252     number of blocks in the key cache, if successful,
253     0 - otherwise.
254 
255   NOTES.
256     if keycache->key_cache_inited != 0 we assume that the key cache
257     is already initialized.  This is for now used by myisamchk, but shouldn't
258     be something that a program should rely on!
259 
260     It's assumed that no two threads call this function simultaneously
261     referring to the same key cache handle.
262 
263 */
264 
int init_key_cache(KEY_CACHE *keycache, ulonglong key_cache_block_size,
                   size_t use_mem, ulonglong division_limit,
                   ulonglong age_threshold)
{
  ulong blocks, hash_links;
  size_t length;
  int error;
  DBUG_ENTER("init_key_cache");
  assert(key_cache_block_size >= 512);

  /* An already enabled cache is left untouched (see NOTES above). */
  if (keycache->key_cache_inited && keycache->disk_blocks > 0)
  {
    DBUG_PRINT("warning",("key cache already in use"));
    DBUG_RETURN(0);
  }

  keycache->global_cache_w_requests= keycache->global_cache_r_requests= 0;
  keycache->global_cache_read= keycache->global_cache_write= 0;
  keycache->disk_blocks= -1;
  if (! keycache->key_cache_inited)
  {
    keycache->key_cache_inited= 1;
    /*
      Initialize these variables once only.
      Their value must survive re-initialization during resizing.
    */
    keycache->in_resize= 0;
    keycache->resize_in_flush= 0;
    keycache->cnt_for_resize_op= 0;
    keycache->waiting_for_resize_cnt.last_thread= NULL;
    keycache->in_init= 0;
    mysql_mutex_init(key_KEY_CACHE_cache_lock,
                     &keycache->cache_lock, MY_MUTEX_INIT_FAST);
    keycache->resize_queue.last_thread= NULL;
  }

  keycache->key_cache_mem_size= use_mem;
  keycache->key_cache_block_size= (uint)key_cache_block_size;
  DBUG_PRINT("info", ("key_cache_block_size: %llu",
		      key_cache_block_size));

  /*
    Estimate the number of blocks: each block costs its page buffer plus
    a BLOCK_LINK, two HASH_LINKs and a 5/4 share of a hash table entry.
  */
  blocks= (ulong) (use_mem / (sizeof(BLOCK_LINK) + 2 * sizeof(HASH_LINK) +
                              sizeof(HASH_LINK*) * 5/4 + key_cache_block_size));
  /* It doesn't make sense to have too few blocks (less than 8) */
  if (blocks >= 8)
  {
    /*
      Shrink-and-retry loop: if an allocation fails, 'blocks' is reduced
      by 25% (see the bottom of the loop) and the allocation is retried.
    */
    for ( ; ; )
    {
      /* Set my_hash_entries to the next bigger 2 power */
      if ((keycache->hash_entries= next_power(blocks)) < blocks * 5/4)
        keycache->hash_entries<<= 1;
      hash_links= 2 * blocks;
      /* Trim 'blocks' until structures plus page buffers fit in use_mem. */
      while ((length= (ALIGN_SIZE(blocks * sizeof(BLOCK_LINK)) +
		       ALIGN_SIZE(hash_links * sizeof(HASH_LINK)) +
		       ALIGN_SIZE(sizeof(HASH_LINK*) *
                                  keycache->hash_entries))) +
	     ((size_t) blocks * keycache->key_cache_block_size) > use_mem)
        blocks--;
      /* Allocate memory for cache page buffers */
      if ((keycache->block_mem=
	   my_large_malloc(key_memory_KEY_CACHE,
                           (size_t) blocks * keycache->key_cache_block_size,
			  MYF(0))))
      {
        /*
	  Allocate memory for blocks, hash_links and hash entries;
	  For each block 2 hash links are allocated
        */
        if ((keycache->block_root= (BLOCK_LINK*) my_malloc(key_memory_KEY_CACHE,
                                                           length,
                                                           MYF(0))))
          break;
        my_large_free(keycache->block_mem);
        keycache->block_mem= 0;
      }
      /* Allocation failed; give up if we already shrank below the minimum. */
      if (blocks < 8)
      {
        set_my_errno(ENOMEM);
        my_error(EE_OUTOFMEMORY, MYF(ME_FATALERROR),
                 blocks * keycache->key_cache_block_size);
        goto err;
      }
      blocks= blocks / 4*3;
    }
    keycache->blocks_unused= blocks;
    keycache->disk_blocks= (int) blocks;
    keycache->hash_links= hash_links;
    /* hash_root and hash_link_root live inside the block_root allocation. */
    keycache->hash_root= (HASH_LINK**) ((char*) keycache->block_root +
				        ALIGN_SIZE(blocks*sizeof(BLOCK_LINK)));
    keycache->hash_link_root= (HASH_LINK*) ((char*) keycache->hash_root +
				            ALIGN_SIZE((sizeof(HASH_LINK*) *
							keycache->hash_entries)));
    memset(keycache->block_root, 0,
	  keycache->disk_blocks * sizeof(BLOCK_LINK));
    memset(keycache->hash_root, 0,
          keycache->hash_entries * sizeof(HASH_LINK*));
    memset(keycache->hash_link_root, 0,
	  keycache->hash_links * sizeof(HASH_LINK));
    keycache->hash_links_used= 0;
    keycache->free_hash_list= NULL;
    keycache->blocks_used= keycache->blocks_changed= 0;

    keycache->global_blocks_changed= 0;
    keycache->blocks_available=0;		/* For debugging */

    /* The LRU chain is empty after initialization */
    keycache->used_last= NULL;
    keycache->used_ins= NULL;
    keycache->free_block_list= NULL;
    keycache->keycache_time= 0;
    keycache->warm_blocks= 0;
    /* division_limit/age_threshold are percentages of the block count. */
    keycache->min_warm_blocks= (division_limit ?
				blocks * division_limit / 100 + 1 :
				blocks);
    keycache->age_threshold= (age_threshold ?
			      blocks * age_threshold / 100 :
			      blocks);

    keycache->can_be_used= 1;

    keycache->waiting_for_hash_link.last_thread= NULL;
    keycache->waiting_for_block.last_thread= NULL;
    DBUG_PRINT("exit",
	       ("disk_blocks: %d  block_root: 0x%lx  hash_entries: %d\
 hash_root: 0x%lx  hash_links: %d  hash_link_root: 0x%lx",
		keycache->disk_blocks,  (long) keycache->block_root,
		keycache->hash_entries, (long) keycache->hash_root,
		keycache->hash_links,   (long) keycache->hash_link_root));
    memset(keycache->changed_blocks, 0,
	  sizeof(keycache->changed_blocks[0]) * CHANGED_BLOCKS_HASH);
    memset(keycache->file_blocks, 0,
	  sizeof(keycache->file_blocks[0]) * CHANGED_BLOCKS_HASH);
  }
  else
  {
    /* key_buffer_size is specified too small. Disable the cache. */
    keycache->can_be_used= 0;
  }

  keycache->blocks= keycache->disk_blocks > 0 ? keycache->disk_blocks : 0;
  DBUG_RETURN((int) keycache->disk_blocks);

err:
  /* Out of memory: release partial allocations and disable the cache. */
  error= my_errno();
  keycache->disk_blocks= 0;
  keycache->blocks=  0;
  if (keycache->block_mem)
  {
    my_large_free((uchar*) keycache->block_mem);
    keycache->block_mem= NULL;
  }
  if (keycache->block_root)
  {
    my_free(keycache->block_root);
    keycache->block_root= NULL;
  }
  set_my_errno(error);
  keycache->can_be_used= 0;
  DBUG_RETURN(0);
}
425 
426 
427 /*
428   Resize a key cache
429 
430   SYNOPSIS
431     resize_key_cache()
432     keycache     	        pointer to a key cache data structure
433     thread_var                  pointer to thread specific variables
434     key_cache_block_size        size of blocks to keep cached data
435     use_mem			total memory to use for the new key cache
436     division_limit		new division limit (if not zero)
437     age_threshold		new age threshold (if not zero)
438 
439   RETURN VALUE
440     number of blocks in the key cache, if successful,
441     0 - otherwise.
442 
443   NOTES.
444     The function first compares the memory size and the block size parameters
445     with the key cache values.
446 
    If they differ the function frees the memory allocated for the
448     old key cache blocks by calling the end_key_cache function and
449     then rebuilds the key cache with new blocks by calling
450     init_key_cache.
451 
    The function starts the operation only when all other threads
    performing operations with the key cache allow it to proceed
    (when cnt_for_resize_op == 0).
455 */
456 
int resize_key_cache(KEY_CACHE *keycache,
                     st_keycache_thread_var *thread_var,
                     ulonglong key_cache_block_size,
                     size_t use_mem, ulonglong division_limit,
                     ulonglong age_threshold)
{
  int blocks;
  DBUG_ENTER("resize_key_cache");

  if (!keycache->key_cache_inited)
    DBUG_RETURN(keycache->disk_blocks);

  /*
    If block size and total memory are unchanged, only the midpoint
    insertion parameters need updating; no flush/rebuild is required.
  */
  if(key_cache_block_size == keycache->key_cache_block_size &&
     use_mem == keycache->key_cache_mem_size)
  {
    change_key_cache_param(keycache, division_limit, age_threshold);
    DBUG_RETURN(keycache->disk_blocks);
  }

  mysql_mutex_lock(&keycache->cache_lock);

  /*
    We may need to wait for another thread which is doing a resize
    already. This cannot happen in the MySQL server though. It allows
    one resizer only. In set_var.cc keycache->in_init is used to block
    multiple attempts.
  */
  while (keycache->in_resize)
  {
    /* purecov: begin inspected */
    wait_on_queue(&keycache->resize_queue, &keycache->cache_lock,
                  thread_var);
    /* purecov: end */
  }

  /*
    Mark the operation in progress. This blocks other threads from doing
    a resize in parallel. It prohibits new blocks to enter the cache.
    Read/write requests can bypass the cache during the flush phase.
  */
  keycache->in_resize= 1;

  /* Need to flush only if keycache is enabled. */
  if (keycache->can_be_used)
  {
    /* Start the flush phase. */
    keycache->resize_in_flush= 1;

    if (flush_all_key_blocks(keycache, thread_var))
    {
      /* TODO: if this happens, we should write a warning in the log file ! */
      keycache->resize_in_flush= 0;
      blocks= 0;
      keycache->can_be_used= 0;
      goto finish;
    }
    assert(cache_empty(keycache));

    /* End the flush phase. */
    keycache->resize_in_flush= 0;
  }

  /*
    Some direct read/write operations (bypassing the cache) may still be
    unfinished. Wait until they are done. If the key cache can be used,
    direct I/O is done in increments of key_cache_block_size. That is,
    every block is checked if it is in the cache. We need to wait for
    pending I/O before re-initializing the cache, because we may change
    the block size. Otherwise they could check for blocks at file
    positions where the new block division has none. We do also want to
    wait for I/O done when (if) the cache was disabled. It must not
    run in parallel with normal cache operation.
  */
  while (keycache->cnt_for_resize_op)
    wait_on_queue(&keycache->waiting_for_resize_cnt, &keycache->cache_lock,
                  thread_var);

  /*
    Free old cache structures, allocate new structures, and initialize
    them. Note that the cache_lock mutex and the resize_queue are left
    untouched. We do not lose the cache_lock and will release it only at
    the end of this function.
  */
  end_key_cache(keycache, 0);			/* Don't free mutex */
  /* The following will work even if use_mem is 0 */
  blocks= init_key_cache(keycache, key_cache_block_size, use_mem,
			 division_limit, age_threshold);

finish:
  /*
    Mark the resize finished. This allows other threads to start a
    resize or to request new cache blocks.
  */
  keycache->in_resize= 0;

  /* Signal waiting threads. */
  release_whole_queue(&keycache->resize_queue);

  mysql_mutex_unlock(&keycache->cache_lock);
  DBUG_RETURN(blocks);
}
558 
559 
560 /*
561   Increment counter blocking resize key cache operation
562 */
/*
  Register one more pending I/O operation that blocks a resize of the
  key cache (counterpart of dec_counter_for_resize_op()).
*/
static inline void inc_counter_for_resize_op(KEY_CACHE *keycache)
{
  ++keycache->cnt_for_resize_op;
}
567 
568 
569 /*
570   Decrement counter blocking resize key cache operation;
571   Signal the operation to proceed when counter becomes equal zero
572 */
/*
  Unregister one pending I/O operation that blocks a resize of the key
  cache. When the counter drops to zero, wake all threads waiting on
  'waiting_for_resize_cnt' so a pending resize can proceed.
*/
static inline void dec_counter_for_resize_op(KEY_CACHE *keycache)
{
  keycache->cnt_for_resize_op--;
  if (keycache->cnt_for_resize_op == 0)
    release_whole_queue(&keycache->waiting_for_resize_cnt);
}
578 
579 /*
580   Change the key cache parameters
581 
582   SYNOPSIS
583     change_key_cache_param()
584     keycache			pointer to a key cache data structure
585     division_limit		new division limit (if not zero)
586     age_threshold		new age threshold (if not zero)
587 
588   RETURN VALUE
589     none
590 
591   NOTES.
592     Presently the function resets the key cache parameters
593     concerning midpoint insertion strategy - division_limit and
594     age_threshold.
595 */
596 
/*
  Update the midpoint insertion parameters of the key cache.

  division_limit and age_threshold are percentages of the block count;
  a zero value leaves the corresponding parameter unchanged. The update
  is done under cache_lock.
*/
static void change_key_cache_param(KEY_CACHE *keycache,
                                   ulonglong division_limit,
                                   ulonglong age_threshold)
{
  DBUG_ENTER("change_key_cache_param");

  mysql_mutex_lock(&keycache->cache_lock);
  if (division_limit != 0)
  {
    keycache->min_warm_blocks=
      keycache->disk_blocks * division_limit / 100 + 1;
  }
  if (age_threshold != 0)
  {
    keycache->age_threshold=
      keycache->disk_blocks * age_threshold / 100;
  }
  mysql_mutex_unlock(&keycache->cache_lock);
  DBUG_VOID_RETURN;
}
613 
614 
615 /*
616   Remove key_cache from memory
617 
618   SYNOPSIS
619     end_key_cache()
620     keycache		key cache handle
621     cleanup		Complete free (Free also mutex for key cache)
622 
623   RETURN VALUE
624     none
625 */
626 
void end_key_cache(KEY_CACHE *keycache, my_bool cleanup)
{
  DBUG_ENTER("end_key_cache");
  DBUG_PRINT("enter", ("key_cache: 0x%lx", (long) keycache));

  /* Nothing to do if the cache was never initialized. */
  if (!keycache->key_cache_inited)
    DBUG_VOID_RETURN;

  if (keycache->disk_blocks > 0)
  {
    /* block_mem and block_root are allocated together in init_key_cache(). */
    if (keycache->block_mem)
    {
      my_large_free((uchar*) keycache->block_mem);
      keycache->block_mem= NULL;
      my_free(keycache->block_root);
      keycache->block_root= NULL;
    }
    keycache->disk_blocks= -1;
    /* Reset blocks_changed to be safe if flush_all_key_blocks is called */
    keycache->blocks_changed= 0;
  }

  DBUG_PRINT("status", ("used: %lu  changed: %lu  w_requests: %lu  "
                        "writes: %lu  r_requests: %lu  reads: %lu",
                        keycache->blocks_used, keycache->global_blocks_changed,
                        (ulong) keycache->global_cache_w_requests,
                        (ulong) keycache->global_cache_write,
                        (ulong) keycache->global_cache_r_requests,
                        (ulong) keycache->global_cache_read));

  /*
    Reset these values to be able to detect a disabled key cache.
    See Bug#44068 (RESTORE can disable the MyISAM Key Cache).
  */
  keycache->blocks_used= 0;
  keycache->blocks_unused= 0;

  /* A full cleanup also destroys the mutex and marks the cache dead. */
  if (cleanup)
  {
    mysql_mutex_destroy(&keycache->cache_lock);
    keycache->key_cache_inited= keycache->can_be_used= 0;
  }
  DBUG_VOID_RETURN;
} /* end_key_cache */
671 
672 
673 /**
674   Link a thread into double-linked queue of waiting threads.
675 
676   @param wqueue   pointer to the queue structure
677   @param thread   pointer to the keycache variables for the
678                   thread to be added to the queue
679 
680   Queue is represented by a circular list of the keycache variable structures.
681   Since each thread has its own keycache variables, this is equal to a list
682   of threads. The list is double-linked of the type (**prev,*next), accessed by
683   a pointer to the last element.
684 */
685 
/**
  Link a thread into the double-linked queue of waiting threads.

  @param wqueue   pointer to the queue structure
  @param thread   pointer to the keycache variables for the
                  thread to be added to the queue

  The queue is a circular list of st_keycache_thread_var structures,
  double-linked in the (**prev, *next) style and accessed through a
  pointer to its last element. The new thread becomes the new tail.
*/
static void link_into_queue(KEYCACHE_WQUEUE *wqueue,
                            st_keycache_thread_var *thread)
{
  st_keycache_thread_var *tail;

  assert(!thread->next && !thread->prev);

  tail= wqueue->last_thread;
  if (tail == NULL)
  {
    /* Empty queue: a ring of one; prev points at the own next field. */
    thread->next= thread;
    thread->prev= &thread->next;
  }
  else
  {
    /* Splice the thread in between the current tail and the head. */
    st_keycache_thread_var *head= tail->next;
    thread->prev= head->prev;
    head->prev= &thread->next;
    thread->next= head;
    tail->next= thread;
  }
  wqueue->last_thread= thread;
}
707 
708 
709 /**
710   Unlink a thread from double-linked queue of waiting threads
711 
712   @param wqueue   pointer to the queue structure
713   @param thread   pointer to the keycache variables for the
714                   thread to be removed to the queue
715 
716   @note See link_into_queue
717 */
718 
/**
  Unlink a thread from the double-linked queue of waiting threads.

  @param wqueue   pointer to the queue structure
  @param thread   pointer to the keycache variables for the
                  thread to be removed from the queue

  @note See link_into_queue
*/
static void unlink_from_queue(KEYCACHE_WQUEUE *wqueue,
                              st_keycache_thread_var *thread)
{
  assert(thread->next && thread->prev);

  if (thread->next != thread)
  {
    /* More than one member: splice the thread out of the ring. */
    thread->next->prev= thread->prev;
    *thread->prev=thread->next;
    /* If the tail is removed, its predecessor becomes the new tail. */
    if (wqueue->last_thread == thread)
      wqueue->last_thread= STRUCT_PTR(st_keycache_thread_var, next,
                                      thread->prev);
  }
  else
  {
    /* The thread was the sole member; the queue becomes empty. */
    wqueue->last_thread= NULL;
  }
  thread->next= NULL;
#if !defined(NDEBUG)
  /*
    This makes it easier to see it's not in a chain during debugging.
    And some assert() rely on it.
  */
  thread->prev= NULL;
#endif
}
743 
744 
745 /*
746   Add a thread to single-linked queue of waiting threads
747 
748   SYNOPSIS
749     wait_on_queue()
750       wqueue            Pointer to the queue structure.
751       mutex             Cache_lock to acquire after awake.
752       thread            Thread to be added
753 
754   RETURN VALUE
755     none
756 
757   NOTES.
758     Queue is represented by a circular list of the thread structures
759     The list is single-linked of the type (*next), accessed by a pointer
760     to the last element.
761 
762     The function protects against stray signals by verifying that the
763     current thread is unlinked from the queue when awaking. However,
764     since several threads can wait for the same event, it might be
765     necessary for the caller of the function to check again if the
766     condition for awake is indeed matched.
767 */
768 
/*
  Add a thread to the single-linked queue of waiting threads and block
  until it is taken out again.

  SYNOPSIS
    wait_on_queue()
      wqueue            Pointer to the queue structure.
      mutex             Cache_lock to acquire after awake.
      thread            Thread to be added

  RETURN VALUE
    none

  NOTES.
    The queue is a circular single-linked (*next) list accessed through
    a pointer to its last element. The signalling thread clears
    thread->next when removing us, which terminates the wait loop; the
    loop therefore also protects against stray signals. Since several
    threads can wait for the same event, the caller may still need to
    re-check the awaited condition after return.
*/
static void wait_on_queue(KEYCACHE_WQUEUE *wqueue,
                          mysql_mutex_t *mutex,
                          st_keycache_thread_var *thread)
{
  st_keycache_thread_var *tail;

  /* The thread must not already sit in any wait queue. */
  assert(!thread->next);
  assert(!thread->prev); /* Not required, but must be true anyway. */

  /* Append as the new tail, keeping the ring closed. */
  tail= wqueue->last_thread;
  if (tail == NULL)
    thread->next= thread;          /* Queue was empty: self-loop. */
  else
  {
    thread->next= tail->next;      /* New tail points at the head. */
    tail->next= thread;
  }
  wqueue->last_thread= thread;

  /* Sleep until the signalling thread has cleared thread->next. */
  do
  {
    mysql_cond_wait(&thread->suspend, mutex);
  } while (thread->next);
}
797 
798 
799 /*
800   Remove all threads from queue signaling them to proceed
801 
802   SYNOPSIS
803     release_whole_queue()
804       wqueue            pointer to the queue structure
805 
806   RETURN VALUE
807     none
808 
809   NOTES.
810     See notes for wait_on_queue().
811     When removed from the queue each thread is signaled via condition
812     variable thread->suspend.
813 */
814 
static void release_whole_queue(KEYCACHE_WQUEUE *wqueue)
{
  st_keycache_thread_var *last= wqueue->last_thread;
  st_keycache_thread_var *thread;

  /* Nothing to do if no thread is waiting. */
  if (last == NULL)
    return;

  /* Walk the circular list starting at its first element. */
  thread= last->next;
  for (;;)
  {
    st_keycache_thread_var *following;
    /* Wake the waiter ... */
    mysql_cond_signal(&thread->suspend);
    /* ... and unlink it; a NULL next ends the waiter's wait loop. */
    following= thread->next;
    thread->next= NULL;
    if (thread == last)
      break;
    thread= following;
  }

  /* Now queue is definitely empty. */
  wqueue->last_thread= NULL;
}
840 
841 
842 /*
843   Unlink a block from the chain of dirty/clean blocks
844 */
845 
static inline void unlink_changed(BLOCK_LINK *block)
{
  /* The block must currently be a member of some dirty/clean chain. */
  assert(block->prev_changed && *block->prev_changed == block);

  /* Bypass the block in the doubly-linked (**prev, *next) chain. */
  if (block->next_changed != NULL)
    block->next_changed->prev_changed= block->prev_changed;
  *block->prev_changed= block->next_changed;

#if !defined(NDEBUG)
  /*
    Clear the links so it is obvious in a debugger that the block is
    not chained. Some assert() rely on this.
  */
  block->next_changed= NULL;
  block->prev_changed= NULL;
#endif
}
862 
863 
864 /*
865   Link a block into the chain of dirty/clean blocks
866 */
867 
static inline void link_changed(BLOCK_LINK *block, BLOCK_LINK **phead)
{
  /* The block must not be linked into any chain yet. */
  assert(!block->next_changed);
  assert(!block->prev_changed);

  /* Push the block onto the front of the chain headed by *phead. */
  block->next_changed= *phead;
  if (block->next_changed != NULL)
    block->next_changed->prev_changed= &block->next_changed;
  block->prev_changed= phead;
  *phead= block;
}
877 
878 
879 /*
880   Link a block in a chain of clean blocks of a file.
881 
882   SYNOPSIS
883     link_to_file_list()
884       keycache		Key cache handle
885       block             Block to relink
886       file              File to be linked to
887       unlink            If to unlink first
888 
889   DESCRIPTION
890     Unlink a block from whichever chain it is linked in, if it's
891     asked for, and link it to the chain of clean blocks of the
892     specified file.
893 
894   NOTE
895     Please do never set/clear BLOCK_CHANGED outside of
896     link_to_file_list() or link_to_changed_list().
897     You would risk to damage correct counting of changed blocks
898     and to find blocks in the wrong hash.
899 
900   RETURN
901     void
902 */
903 
static void link_to_file_list(KEY_CACHE *keycache,
                              BLOCK_LINK *block, int file,
                              my_bool unlink_block)
{
  assert(block->status & BLOCK_IN_USE);
  assert(block->hash_link && block->hash_link->block == block);
  assert(block->hash_link->file == file);

  /* Optionally detach the block from whatever chain it is in now. */
  if (unlink_block)
    unlink_changed(block);

  /* Attach it to the clean-blocks chain of this file. */
  link_changed(block, &keycache->file_blocks[FILE_HASH(file)]);

  /*
    A block in a clean chain must not carry BLOCK_CHANGED; clear it
    and keep the changed-block counters in sync.
  */
  if (block->status & BLOCK_CHANGED)
  {
    block->status&= ~BLOCK_CHANGED;
    keycache->blocks_changed--;
    keycache->global_blocks_changed--;
  }
}
921 
922 
923 /*
924   Re-link a block from the clean chain to the dirty chain of a file.
925 
926   SYNOPSIS
927     link_to_changed_list()
928       keycache		key cache handle
929       block             block to relink
930 
931   DESCRIPTION
932     Unlink a block from the chain of clean blocks of a file
933     and link it to the chain of dirty blocks of the same file.
934 
935   NOTE
936     Please do never set/clear BLOCK_CHANGED outside of
937     link_to_file_list() or link_to_changed_list().
938     You would risk to damage correct counting of changed blocks
939     and to find blocks in the wrong hash.
940 
941   RETURN
942     void
943 */
944 
static void link_to_changed_list(KEY_CACHE *keycache,
                                 BLOCK_LINK *block)
{
  assert(block->status & BLOCK_IN_USE);
  assert(!(block->status & BLOCK_CHANGED));
  assert(block->hash_link && block->hash_link->block == block);

  /* Move the block from the clean chain to the dirty chain of its file. */
  unlink_changed(block);
  link_changed(block,
               &keycache->changed_blocks[FILE_HASH(block->hash_link->file)]);

  /* Mark it dirty and keep the changed-block counters in sync. */
  block->status|= BLOCK_CHANGED;
  keycache->blocks_changed++;
  keycache->global_blocks_changed++;
}
959 
960 
961 /*
962   Link a block to the LRU chain at the beginning or at the end of
963   one of two parts.
964 
965   SYNOPSIS
966     link_block()
967       keycache            pointer to a key cache data structure
968       block               pointer to the block to link to the LRU chain
969       hot                 <-> to link the block into the hot subchain
970       at_end              <-> to link the block at the end of the subchain
971 
972   RETURN VALUE
973     none
974 
975   NOTES.
976     The LRU ring is represented by a circular list of block structures.
977     The list is double-linked of the type (**prev,*next) type.
978     The LRU ring is divided into two parts - hot and warm.
979     There are two pointers to access the last blocks of these two
980     parts. The beginning of the warm part follows right after the
981     end of the hot part.
982     Only blocks of the warm part can be used for eviction.
983     The first block from the beginning of this subchain is always
984     taken for eviction (keycache->last_used->next)
985 
986     LRU chain:       +------+   H O T    +------+
987                 +----| end  |----...<----| beg  |----+
988                 |    +------+last        +------+    |
989                 v<-link in latest hot (new end)      |
990                 |     link in latest warm (new end)->^
991                 |    +------+  W A R M   +------+    |
992                 +----| beg  |---->...----| end  |----+
993                      +------+            +------+ins
994                   first for eviction
995 
996     It is also possible that the block is selected for eviction and thus
997     not linked in the LRU ring.
998 */
999 
static void link_block(KEY_CACHE *keycache, BLOCK_LINK *block, my_bool hot,
                       my_bool at_end)
{
  BLOCK_LINK *ins;
  BLOCK_LINK **pins;

  /* Block must hold valid data, be in use, and be outside the LRU ring. */
  assert((block->status & ~BLOCK_CHANGED) == (BLOCK_READ | BLOCK_IN_USE));
  assert(block->hash_link); /*backptr to block NULL from free_block()*/
  assert(!block->requests);
  assert(block->prev_changed && *block->prev_changed == block);
  assert(!block->next_used);
  assert(!block->prev_used);

  if (!hot && keycache->waiting_for_block.last_thread)
  {
    /* Signal that in the LRU warm sub-chain an available block has appeared */
    st_keycache_thread_var *last_thread=
      keycache->waiting_for_block.last_thread;
    st_keycache_thread_var *first_thread= last_thread->next;
    st_keycache_thread_var *next_thread= first_thread;
    /* All woken threads want the page described by the first waiter. */
    HASH_LINK *hash_link= (HASH_LINK *) first_thread->opt_info;
    st_keycache_thread_var *thread;
    do
    {
      thread= next_thread;
      next_thread= thread->next;
      /*
         We notify about the event all threads that ask
         for the same page as the first thread in the queue
      */
      if ((HASH_LINK *) thread->opt_info == hash_link)
      {
        mysql_cond_signal(&thread->suspend);
        unlink_from_queue(&keycache->waiting_for_block, thread);
        /* Each woken requestor holds a request on the handed-over block. */
        block->requests++;
      }
    }
    while (thread != last_thread);
    hash_link->block= block;
    /*
      NOTE: We assigned the block to the hash_link and signalled the
      requesting thread(s). But it is possible that other threads run
      first. These threads see the hash_link assigned to a block which
      is assigned to another hash_link and not marked BLOCK_IN_SWITCH.
      This can be a problem for functions that do not select the block
      via its hash_link: flush and free. They do only see a block which
      is in a "normal" state and don't know that it will be evicted soon.

      We cannot set BLOCK_IN_SWITCH here because only one of the
      requesting threads must handle the eviction. All others must wait
      for it to complete. If we set the flag here, the threads would not
      know who is in charge of the eviction. Without the flag, the first
      thread takes the stick and sets the flag.

      But we need to note in the block that it has been selected for
      eviction. It must not be freed. The evicting thread will not
      expect the block in the free list. Before freeing we could also
      check if block->requests > 1. But I think including another flag
      in the check of block->status is slightly more efficient and
      probably easier to read.
    */
    block->status|= BLOCK_IN_EVICTION;
    return;
  }

  /*
    Normal case: insert into the circular LRU ring. Hot blocks are
    inserted at used_ins, warm blocks at used_last (see the chain
    diagram in the function comment above).
  */
  pins= hot ? &keycache->used_ins : &keycache->used_last;
  ins= *pins;
  if (ins)
  {
    /* Splice the block in after 'ins'; advance the end marker if at_end. */
    ins->next_used->prev_used= &block->next_used;
    block->next_used= ins->next_used;
    block->prev_used= &ins->next_used;
    ins->next_used= block;
    if (at_end)
      *pins= block;
  }
  else
  {
    /* The LRU ring is empty. Let the block point to itself. */
    keycache->used_last= keycache->used_ins= block->next_used= block;
    block->prev_used= &block->next_used;
  }
  assert((ulong) keycache->blocks_available <=
         keycache->blocks_used);
}
1085 
1086 
1087 /*
1088   Unlink a block from the LRU chain
1089 
1090   SYNOPSIS
1091     unlink_block()
1092       keycache            pointer to a key cache data structure
1093       block               pointer to the block to unlink from the LRU chain
1094 
1095   RETURN VALUE
1096     none
1097 
1098   NOTES.
1099     See NOTES for link_block
1100 */
1101 
static void unlink_block(KEY_CACHE *keycache, BLOCK_LINK *block)
{
  /* Block must hold valid data, be in use, and be linked in the LRU ring. */
  assert((block->status & ~BLOCK_CHANGED) == (BLOCK_READ | BLOCK_IN_USE));
  assert(block->hash_link); /*backptr to block NULL from free_block()*/
  assert(!block->requests);
  assert(block->prev_changed && *block->prev_changed == block);
  assert(block->next_used && block->prev_used &&
         (block->next_used->prev_used == &block->next_used) &&
         (*block->prev_used == block));
  if (block->next_used == block)
    /* The list contains only one member */
    keycache->used_last= keycache->used_ins= NULL;
  else
  {
    block->next_used->prev_used= block->prev_used;
    *block->prev_used= block->next_used;
    /*
      If an end marker pointed at this block, move it back to the
      predecessor. prev_used points at the predecessor's next_used
      field, so STRUCT_PTR recovers the predecessor BLOCK_LINK.
    */
    if (keycache->used_last == block)
      keycache->used_last= STRUCT_PTR(BLOCK_LINK, next_used, block->prev_used);
    if (keycache->used_ins == block)
      keycache->used_ins=STRUCT_PTR(BLOCK_LINK, next_used, block->prev_used);
  }
  block->next_used= NULL;
#if !defined(NDEBUG)
  /*
    This makes it easier to see it's not in a chain during debugging.
    And some assert() rely on it.
  */
  block->prev_used= NULL;
#endif
}
1132 
1133 
1134 /*
1135   Register requests for a block.
1136 
1137   SYNOPSIS
1138     reg_requests()
1139       keycache          Pointer to a key cache data structure.
1140       block             Pointer to the block to register a request on.
1141       count             Number of requests. Always 1.
1142 
1143   NOTE
1144     The first request unlinks the block from the LRU ring. This means
    that it is protected against eviction.
1146 
1147   RETURN
1148     void
1149 */
static void reg_requests(KEY_CACHE *keycache, BLOCK_LINK *block, int count)
{
  assert(block->status & BLOCK_IN_USE);
  assert(block->hash_link);

  /*
    The first request takes the block out of the LRU ring, which
    protects it against eviction while requests are outstanding.
  */
  if (block->requests == 0)
    unlink_block(keycache, block);
  block->requests+= count;
}
1159 
1160 
1161 /*
1162   Unregister request for a block
1163   linking it to the LRU chain if it's the last request
1164 
1165   SYNOPSIS
1166     unreg_request()
1167     keycache            pointer to a key cache data structure
1168     block               pointer to the block to link to the LRU chain
1169     at_end              <-> to link the block at the end of the LRU chain
1170 
1171   RETURN VALUE
1172     none
1173 
1174   NOTES.
1175     Every linking to the LRU ring decrements by one a special block
1176     counter (if it's positive). If the at_end parameter is TRUE the block is
1177     added either at the end of warm sub-chain or at the end of hot sub-chain.
1178     It is added to the hot subchain if its counter is zero and number of
1179     blocks in warm sub-chain is not less than some low limit (determined by
1180     the division_limit parameter). Otherwise the block is added to the warm
1181     sub-chain. If the at_end parameter is FALSE the block is always added
1182     at beginning of the warm sub-chain.
1183     Thus a warm block can be promoted to the hot sub-chain when its counter
1184     becomes zero for the first time.
1185     At the same time  the block at the very beginning of the hot subchain
1186     might be moved to the beginning of the warm subchain if it stays untouched
1187     for a too long time (this time is determined by parameter age_threshold).
1188 
1189     It is also possible that the block is selected for eviction and thus
1190     not linked in the LRU ring.
1191 */
1192 
static void unreg_request(KEY_CACHE *keycache,
                          BLOCK_LINK *block, int at_end)
{
  assert(block->status & (BLOCK_READ | BLOCK_IN_USE));
  assert(block->hash_link); /*backptr to block NULL from free_block()*/
  assert(block->requests);
  assert(block->prev_changed && *block->prev_changed == block);
  assert(!block->next_used);
  assert(!block->prev_used);
  /*
    Unregister the request, but do not link erroneous blocks into the
    LRU ring.
  */
  if (!--block->requests && !(block->status & BLOCK_ERROR))
  {
    my_bool hot;
    if (block->hits_left)
      block->hits_left--;
    /*
      Promote to the hot sub-chain only when the block's hit counter
      has run down, linking at the end was requested, and the warm
      sub-chain is large enough to give a block away.
    */
    hot= !block->hits_left && at_end &&
      keycache->warm_blocks > keycache->min_warm_blocks;
    if (hot)
    {
      if (block->temperature == BLOCK_WARM)
        keycache->warm_blocks--;
      block->temperature= BLOCK_HOT;
    }
    link_block(keycache, block, hot, (my_bool)at_end);
    /* Stamp the block with the logical clock used for aging below. */
    block->last_hit_time= keycache->keycache_time;
    keycache->keycache_time++;
    /*
      At this place, the block might be in the LRU ring or not. If an
      evicter was waiting for a block, it was selected for eviction and
      not linked in the LRU ring.
    */

    /*
      Check if we should link a hot block to the warm block sub-chain.
      It is possible that we select the same block as above. But it can
      also be another block. In any case a block from the LRU ring is
      selected. In other words it works even if the above block was
      selected for eviction and not linked in the LRU ring. Since this
      happens only if the LRU ring is empty, the block selected below
      would be NULL and the rest of the function skipped.
    */
    block= keycache->used_ins;
    if (block && keycache->keycache_time - block->last_hit_time >
	keycache->age_threshold)
    {
      /* Demote the aged block from the hot to the warm sub-chain. */
      unlink_block(keycache, block);
      link_block(keycache, block, 0, 0);
      if (block->temperature != BLOCK_WARM)
      {
        keycache->warm_blocks++;
        block->temperature= BLOCK_WARM;
      }
    }
  }
}
1251 
1252 /*
1253   Remove a reader of the page in block
1254 */
1255 
static void remove_reader(BLOCK_LINK *block)
{
  assert(block->status & (BLOCK_READ | BLOCK_IN_USE));
  assert(block->hash_link && block->hash_link->block == block);
  assert(block->prev_changed && *block->prev_changed == block);
  assert(!block->next_used);
  assert(!block->prev_used);
  assert(block->hash_link->requests);

  /*
    Drop our request on the page. If we were the last reader and a
    thread is waiting for the readers to finish, wake it up.
  */
  block->hash_link->requests--;
  if (block->hash_link->requests == 0 && block->condvar != NULL)
    mysql_cond_signal(block->condvar);
}
1268 
1269 
1270 /*
1271   Wait until the last reader of the page in block
1272   signals on its termination
1273 */
1274 
static void wait_for_readers(KEY_CACHE *keycache,
                             BLOCK_LINK *block,
                             st_keycache_thread_var *thread)
{
  assert(block->status & (BLOCK_READ | BLOCK_IN_USE));
  assert(!(block->status & (BLOCK_IN_FLUSH | BLOCK_CHANGED)));
  assert(block->hash_link);
  assert(block->hash_link->block == block);
  /* Linked in file_blocks or changed_blocks hash. */
  assert(block->prev_changed && *block->prev_changed == block);
  /* Not linked in LRU ring. */
  assert(!block->next_used);
  assert(!block->prev_used);
  /* Loop until every reader has released its request on the page. */
  while (block->hash_link->requests)
  {
    /* There must be no other waiter. We have no queue here. */
    assert(!block->condvar);
    /*
      Publish our condition variable on the block so the last reader
      (remove_reader()) knows whom to signal, then sleep on it.
    */
    block->condvar= &thread->suspend;
    mysql_cond_wait(&thread->suspend, &keycache->cache_lock);
    block->condvar= NULL;
  }
}
1297 
1298 
1299 /*
1300   Add a hash link to a bucket in the hash_table
1301 */
1302 
static inline void link_hash(HASH_LINK **start, HASH_LINK *hash_link)
{
  /* Push the link onto the front of the bucket's doubly-linked list. */
  HASH_LINK *old_head= *start;
  if (old_head != NULL)
    old_head->prev= &hash_link->next;
  hash_link->next= old_head;
  hash_link->prev= start;
  *start= hash_link;
}
1311 
1312 
1313 /*
1314   Remove a hash link from the hash table
1315 */
1316 
static void unlink_hash(KEY_CACHE *keycache, HASH_LINK *hash_link)
{
  assert(hash_link->requests == 0);
  /* Bypass the link in its bucket chain and drop its block reference. */
  if ((*hash_link->prev= hash_link->next))
    hash_link->next->prev= hash_link->prev;
  hash_link->block= NULL;

  if (keycache->waiting_for_hash_link.last_thread)
  {
    /* Signal that a free hash link has appeared */
    st_keycache_thread_var *last_thread=
                               keycache->waiting_for_hash_link.last_thread;
    st_keycache_thread_var *first_thread= last_thread->next;
    st_keycache_thread_var *next_thread= first_thread;
    /* Hand the link to the page requested by the first waiter. */
    KEYCACHE_PAGE *first_page= (KEYCACHE_PAGE *) (first_thread->opt_info);
    st_keycache_thread_var *thread;

    hash_link->file= first_page->file;
    hash_link->diskpos= first_page->filepos;
    do
    {
      KEYCACHE_PAGE *page;
      thread= next_thread;
      page= (KEYCACHE_PAGE *) thread->opt_info;
      next_thread= thread->next;
      /*
         We notify about the event all threads that ask
         for the same page as the first thread in the queue
      */
      if (page->file == hash_link->file && page->filepos == hash_link->diskpos)
      {
        mysql_cond_signal(&thread->suspend);
        unlink_from_queue(&keycache->waiting_for_hash_link, thread);
      }
    }
    while (thread != last_thread);
    /* Re-insert the link into the bucket for its new (file, pos) pair. */
    link_hash(&keycache->hash_root[KEYCACHE_HASH(hash_link->file,
					         hash_link->diskpos)],
              hash_link);
    return;
  }
  /* Nobody is waiting: return the link to the free list. */
  hash_link->next= keycache->free_hash_list;
  keycache->free_hash_list= hash_link;
}
1361 
1362 
1363 /*
1364   Get the hash link for a page
1365 */
1366 
static HASH_LINK *get_hash_link(KEY_CACHE *keycache,
                                int file, my_off_t filepos,
                                st_keycache_thread_var *thread)
{
  HASH_LINK *hash_link, **start;
#ifndef NDEBUG
  int cnt;
#endif

restart:
  /*
     Find the bucket in the hash table for the pair (file, filepos);
     start contains the head of the bucket list,
     hash_link points to the first member of the list
  */
  hash_link= *(start= &keycache->hash_root[KEYCACHE_HASH(file, filepos)]);
#ifndef NDEBUG
  cnt= 0;
#endif
  /* Look for an element for the pair (file, filepos) in the bucket chain */
  while (hash_link &&
         (hash_link->diskpos != filepos || hash_link->file != file))
  {
    hash_link= hash_link->next;
#ifndef NDEBUG
    /* Guard against a corrupted (cyclic) bucket chain in debug builds. */
    cnt++;
    assert(cnt <= keycache->hash_links_used);
#endif
  }
  if (! hash_link)
  {
    /* There is no hash link in the hash table for the pair (file, filepos) */
    if (keycache->free_hash_list)
    {
      /* Reuse a previously freed link. */
      hash_link= keycache->free_hash_list;
      keycache->free_hash_list= hash_link->next;
    }
    else if (keycache->hash_links_used < keycache->hash_links)
    {
      /* Take a never-used link from the preallocated pool. */
      hash_link= &keycache->hash_link_root[keycache->hash_links_used++];
    }
    else
    {
      /* Wait for a free hash link */
      KEYCACHE_PAGE page;
      page.file= file;
      page.filepos= filepos;
      /*
        Publish the wanted page via opt_info so unlink_hash() can hand
        a freed link directly to us, then sleep until signalled.
      */
      thread->opt_info= (void *) &page;
      link_into_queue(&keycache->waiting_for_hash_link, thread);
      mysql_cond_wait(&thread->suspend,
                                 &keycache->cache_lock);
      thread->opt_info= NULL;
      /* The situation may have changed while sleeping; re-search. */
      goto restart;
    }
    hash_link->file= file;
    hash_link->diskpos= filepos;
    link_hash(start, hash_link);
  }
  /* Register the request for the page */
  hash_link->requests++;

  return hash_link;
}
1430 
1431 
1432 /*
1433   Get a block for the file page requested by a keycache read/write operation;
1434   If the page is not in the cache return a free block, if there is none
1435   return the lru block after saving its buffer if the page is dirty.
1436 
1437   SYNOPSIS
1438 
1439     find_key_block()
1440       keycache            pointer to a key cache data structure
1441       thread              pointer to thread specific variables
1442       file                handler for the file to read page from
1443       filepos             position of the page in the file
1444       init_hits_left      how initialize the block counter for the page
1445       wrmode              <-> get for writing
1446       page_st        out  {PAGE_READ,PAGE_TO_BE_READ,PAGE_WAIT_TO_BE_READ}
1447 
1448   RETURN VALUE
1449     Pointer to the found block if successful, 0 - otherwise
1450 
1451   NOTES.
1452     For the page from file positioned at filepos the function checks whether
1453     the page is in the key cache specified by the first parameter.
1454     If this is the case it immediately returns the block.
1455     If not, the function first chooses  a block for this page. If there is
1456     no not used blocks in the key cache yet, the function takes the block
1457     at the very beginning of the warm sub-chain. It saves the page in that
1458     block if it's dirty before returning the pointer to it.
1459     The function returns in the page_st parameter the following values:
1460       PAGE_READ         - if page already in the block,
1461       PAGE_TO_BE_READ   - if it is to be read yet by the current thread
1462       WAIT_TO_BE_READ   - if it is to be read by another thread
1463     If an error occurs THE BLOCK_ERROR bit is set in the block status.
1464     It might happen that there are no blocks in LRU chain (in warm part) -
1465     all blocks  are unlinked for some read/write operations. Then the function
1466     waits until first of this operations links any block back.
1467 */
1468 
find_key_block(KEY_CACHE * keycache,st_keycache_thread_var * thread,File file,my_off_t filepos,int init_hits_left,int wrmode,int * page_st)1469 static BLOCK_LINK *find_key_block(KEY_CACHE *keycache,
1470                                   st_keycache_thread_var *thread,
1471                                   File file, my_off_t filepos,
1472                                   int init_hits_left,
1473                                   int wrmode, int *page_st)
1474 {
1475   HASH_LINK *hash_link;
1476   BLOCK_LINK *block;
1477   int error= 0;
1478   int page_status;
1479 
1480   DBUG_ENTER("find_key_block");
1481   DBUG_PRINT("enter", ("fd: %d  pos: %lu  wrmode: %d",
1482                        file, (ulong) filepos, wrmode));
1483 
1484 restart:
1485   /*
1486     If the flush phase of a resize operation fails, the cache is left
1487     unusable. This will be detected only after "goto restart".
1488   */
1489   if (!keycache->can_be_used)
1490     DBUG_RETURN(0);
1491 
1492   /*
1493     Find the hash_link for the requested file block (file, filepos). We
1494     do always get a hash_link here. It has registered our request so
1495     that no other thread can use it for another file block until we
1496     release the request (which is done by remove_reader() usually). The
1497     hash_link can have a block assigned to it or not. If there is a
1498     block, it may be assigned to this hash_link or not. In cases where a
1499     block is evicted from the cache, it is taken from the LRU ring and
1500     referenced by the new hash_link. But the block can still be assigned
1501     to its old hash_link for some time if it needs to be flushed first,
1502     or if there are other threads still reading it.
1503 
1504     Summary:
1505       hash_link is always returned.
1506       hash_link->block can be:
1507       - NULL or
1508       - not assigned to this hash_link or
1509       - assigned to this hash_link. If assigned, the block can have
1510         - invalid data (when freshly assigned) or
1511         - valid data. Valid data can be
1512           - changed over the file contents (dirty) or
1513           - not changed (clean).
1514   */
1515   hash_link= get_hash_link(keycache, file, filepos, thread);
1516   assert((hash_link->file == file) && (hash_link->diskpos == filepos));
1517 
1518   page_status= -1;
1519   if ((block= hash_link->block) &&
1520       block->hash_link == hash_link && (block->status & BLOCK_READ))
1521   {
1522     /* Assigned block with valid (changed or unchanged) contents. */
1523     page_status= PAGE_READ;
1524   }
1525   /*
1526     else (page_status == -1)
1527       - block == NULL or
1528       - block not assigned to this hash_link or
1529       - block assigned but not yet read from file (invalid data).
1530   */
1531 
1532   if (keycache->in_resize)
1533   {
1534     /* This is a request during a resize operation */
1535 
1536     if (!block)
1537     {
1538       /*
1539         The file block is not in the cache. We don't need it in the
1540         cache: we are going to read or write directly to file. Cancel
1541         the request. We can simply decrement hash_link->requests because
1542         we did not release cache_lock since increasing it. So no other
1543         thread can wait for our request to become released.
1544       */
1545       if (hash_link->requests == 1)
1546       {
1547         /*
1548           We are the only one to request this hash_link (this file/pos).
1549           Free the hash_link.
1550         */
1551         hash_link->requests--;
1552         unlink_hash(keycache, hash_link);
1553         DBUG_RETURN(0);
1554       }
1555 
1556       /*
1557         More requests on the hash_link. Someone tries to evict a block
1558         for this hash_link (could have started before resizing started).
1559         This means that the LRU ring is empty. Otherwise a block could
1560         be assigned immediately. Behave like a thread that wants to
1561         evict a block for this file/pos. Add to the queue of threads
1562         waiting for a block. Wait until there is one assigned.
1563 
1564         Refresh the request on the hash-link so that it cannot be reused
1565         for another file/pos.
1566       */
1567       thread->opt_info= (void *) hash_link;
1568       link_into_queue(&keycache->waiting_for_block, thread);
1569       do
1570       {
1571         mysql_cond_wait(&thread->suspend,
1572                                    &keycache->cache_lock);
1573       } while (thread->next);
1574       thread->opt_info= NULL;
1575       /*
1576         A block should now be assigned to the hash_link. But it may
1577         still need to be evicted. Anyway, we should re-check the
1578         situation. page_status must be set correctly.
1579       */
1580       hash_link->requests--;
1581       goto restart;
1582     } /* end of if (!block) */
1583 
1584     /*
1585       There is a block for this file/pos in the cache. Register a
1586       request on it. This unlinks it from the LRU ring (if it is there)
1587       and hence protects it against eviction (if not already in
1588       eviction). We need this for returning the block to the caller, for
1589       calling remove_reader() (for debugging purposes), and for calling
1590       free_block(). The only case where we don't need the request is if
1591       the block is in eviction. In that case we have to unregister the
1592       request later.
1593     */
1594     reg_requests(keycache, block, 1);
1595 
1596     if (page_status != PAGE_READ)
1597     {
1598       /*
1599         - block not assigned to this hash_link or
1600         - block assigned but not yet read from file (invalid data).
1601 
1602         This must be a block in eviction. It will be read soon. We need
1603         to wait here until this happened. Otherwise the caller could
1604         access a wrong block or a block which is in read. While waiting
1605         we cannot lose hash_link nor block. We have registered a request
1606         on the hash_link. Everything can happen to the block but changes
1607         in the hash_link -> block relationship. In other words:
1608         everything can happen to the block but free or another completed
1609         eviction.
1610 
        Note that we behave like a secondary requestor here. We just
1612         cannot return with PAGE_WAIT_TO_BE_READ. This would work for
1613         read requests and writes on dirty blocks that are not in flush
1614         only. Waiting here on COND_FOR_REQUESTED works in all
1615         situations.
1616       */
1617       assert(((block->hash_link != hash_link) &&
1618               (block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH))) ||
1619              ((block->hash_link == hash_link) &&
1620               !(block->status & BLOCK_READ)));
1621       wait_on_queue(&block->wqueue[COND_FOR_REQUESTED], &keycache->cache_lock,
1622                     thread);
1623       /*
1624         Here we can trust that the block has been assigned to this
1625         hash_link (block->hash_link == hash_link) and read into the
1626         buffer (BLOCK_READ). The worst things possible here are that the
1627         block is in free (BLOCK_REASSIGNED). But the block is still
1628         assigned to the hash_link. The freeing thread waits until we
1629         release our request on the hash_link. The block must not be
        again in eviction because we registered a request on it before
1631         starting to wait.
1632       */
1633       assert(block->hash_link == hash_link);
1634       assert(block->status & (BLOCK_READ | BLOCK_IN_USE));
1635       assert(!(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH)));
1636     }
1637     /*
1638       The block is in the cache. Assigned to the hash_link. Valid data.
1639       Note that in case of page_st == PAGE_READ, the block can be marked
1640       for eviction. In any case it can be marked for freeing.
1641     */
1642 
1643     if (!wrmode)
1644     {
1645       /* A reader can just read the block. */
1646       *page_st= PAGE_READ;
1647       assert((hash_link->file == file) &&
1648              (hash_link->diskpos == filepos) &&
1649              (block->hash_link == hash_link));
1650       DBUG_RETURN(block);
1651     }
1652 
1653     /*
1654       This is a writer. No two writers for the same block can exist.
1655       This must be assured by locks outside of the key cache.
1656     */
1657     assert(!(block->status & BLOCK_FOR_UPDATE) || fail_block(block));
1658 
1659     while (block->status & BLOCK_IN_FLUSH)
1660     {
1661       /*
1662         Wait until the block is flushed to file. Do not release the
1663         request on the hash_link yet to prevent that the block is freed
1664         or reassigned while we wait. While we wait, several things can
1665         happen to the block, including another flush. But the block
1666         cannot be reassigned to another hash_link until we release our
1667         request on it. But it can be marked BLOCK_REASSIGNED from free
1668         or eviction, while they wait for us to release the hash_link.
1669       */
1670       wait_on_queue(&block->wqueue[COND_FOR_SAVED], &keycache->cache_lock,
1671                     thread);
1672       /*
1673         If the flush phase failed, the resize could have finished while
1674         we waited here.
1675       */
1676       if (!keycache->in_resize)
1677       {
1678         remove_reader(block);
1679         unreg_request(keycache, block, 1);
1680         goto restart;
1681       }
1682       assert(block->status & (BLOCK_READ | BLOCK_IN_USE));
1683       assert(!(block->status & BLOCK_FOR_UPDATE) || fail_block(block));
1684       assert(block->hash_link == hash_link);
1685     }
1686 
1687     if (block->status & BLOCK_CHANGED)
1688     {
1689       /*
1690         We want to write a block with changed contents. If the cache
1691         block size is bigger than the callers block size (e.g. MyISAM),
1692         the caller may replace part of the block only. Changes of the
1693         other part of the block must be preserved. Since the block has
1694         not yet been selected for flush, we can still add our changes.
1695       */
1696       *page_st= PAGE_READ;
1697       assert((hash_link->file == file) &&
1698              (hash_link->diskpos == filepos) &&
1699              (block->hash_link == hash_link));
1700       DBUG_RETURN(block);
1701     }
1702 
1703     /*
1704       This is a write request for a clean block. We do not want to have
1705       new dirty blocks in the cache while resizing. We will free the
1706       block and write directly to file. If the block is in eviction or
1707       in free, we just let it go.
1708 
1709       Unregister from the hash_link. This must be done before freeing
1710       the block. And it must be done if not freeing the block. Because
1711       we could have waited above, we need to call remove_reader(). Other
1712       threads could wait for us to release our request on the hash_link.
1713     */
1714     remove_reader(block);
1715 
1716     /* If the block is not in eviction and not in free, we can free it. */
1717     if (!(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH |
1718                            BLOCK_REASSIGNED)))
1719     {
1720       /*
1721         Free block as we are going to write directly to file.
        Although we have an exclusive lock for the updated key part,
1723         the control can be yielded by the current thread as we might
1724         have unfinished readers of other key parts in the block
1725         buffer. Still we are guaranteed not to have any readers
1726         of the key part we are writing into until the block is
1727         removed from the cache as we set the BLOCK_REASSIGNED
1728         flag (see the code below that handles reading requests).
1729       */
1730       free_block(keycache, thread, block);
1731     }
1732     else
1733     {
1734       /*
1735         The block will be evicted/freed soon. Don't touch it in any way.
1736         Unregister the request that we registered above.
1737       */
1738       unreg_request(keycache, block, 1);
1739 
1740       /*
1741         The block is still assigned to the hash_link (the file/pos that
1742         we are going to write to). Wait until the eviction/free is
1743         complete. Otherwise the direct write could complete before all
1744         readers are done with the block. So they could read outdated
1745         data.
1746 
1747         Since we released our request on the hash_link, it can be reused
1748         for another file/pos. Hence we cannot just check for
1749         block->hash_link == hash_link. As long as the resize is
1750         proceeding the block cannot be reassigned to the same file/pos
1751         again. So we can terminate the loop when the block is no longer
1752         assigned to this file/pos.
1753       */
1754       do
1755       {
1756         wait_on_queue(&block->wqueue[COND_FOR_SAVED],
1757                       &keycache->cache_lock, thread);
1758         /*
1759           If the flush phase failed, the resize could have finished
1760           while we waited here.
1761         */
1762         if (!keycache->in_resize)
1763           goto restart;
1764       } while (block->hash_link &&
1765                (block->hash_link->file == file) &&
1766                (block->hash_link->diskpos == filepos));
1767     }
1768     DBUG_RETURN(0);
1769   }
1770 
1771   if (page_status == PAGE_READ &&
1772       (block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH |
1773                         BLOCK_REASSIGNED)))
1774   {
1775     /*
1776       This is a request for a block to be removed from cache. The block
1777       is assigned to this hash_link and contains valid data, but is
1778       marked for eviction or to be freed. Possible reasons why it has
1779       not yet been evicted/freed can be a flush before reassignment
1780       (BLOCK_IN_SWITCH), readers of the block have not finished yet
1781       (BLOCK_REASSIGNED), or the evicting thread did not yet awake after
1782       the block has been selected for it (BLOCK_IN_EVICTION).
1783     */
1784 
1785     /*
1786        Only reading requests can proceed until the old dirty page is flushed,
1787        all others are to be suspended, then resubmitted
1788     */
1789     if (!wrmode && !(block->status & BLOCK_REASSIGNED))
1790     {
1791       /*
1792         This is a read request and the block not yet reassigned. We can
1793         register our request and proceed. This unlinks the block from
1794         the LRU ring and protects it against eviction.
1795       */
1796       reg_requests(keycache, block, 1);
1797     }
1798     else
1799     {
1800       /*
1801         Either this is a write request for a block that is in eviction
1802         or in free. We must not use it any more. Instead we must evict
1803         another block. But we cannot do this before the eviction/free is
1804         done. Otherwise we would find the same hash_link + block again
1805         and again.
1806 
1807         Or this is a read request for a block in eviction/free that does
1808         not require a flush, but waits for readers to finish with the
1809         block. We do not read this block to let the eviction/free happen
1810         as soon as possible. Again we must wait so that we don't find
1811         the same hash_link + block again and again.
1812       */
1813       assert(hash_link->requests);
1814       hash_link->requests--;
1815       wait_on_queue(&block->wqueue[COND_FOR_SAVED], &keycache->cache_lock,
1816                     thread);
1817       /*
1818         The block is no longer assigned to this hash_link.
1819         Get another one.
1820       */
1821       goto restart;
1822     }
1823   }
1824   else
1825   {
1826     /*
1827       This is a request for a new block or for a block not to be removed.
1828       Either
1829       - block == NULL or
1830       - block not assigned to this hash_link or
1831       - block assigned but not yet read from file,
1832       or
1833       - block assigned with valid (changed or unchanged) data and
1834       - it will not be reassigned/freed.
1835     */
1836     if (! block)
1837     {
1838       /* No block is assigned to the hash_link yet. */
1839       if (keycache->blocks_unused)
1840       {
1841         if (keycache->free_block_list)
1842         {
1843           /* There is a block in the free list. */
1844           block= keycache->free_block_list;
1845           keycache->free_block_list= block->next_used;
1846           block->next_used= NULL;
1847         }
1848         else
1849         {
1850           size_t block_mem_offset;
1851           /* There are some never used blocks, take first of them */
1852           assert(keycache->blocks_used <
1853                  (ulong) keycache->disk_blocks);
1854           block= &keycache->block_root[keycache->blocks_used];
1855           block_mem_offset=
1856            ((size_t) keycache->blocks_used) * keycache->key_cache_block_size;
1857           block->buffer= ADD_TO_PTR(keycache->block_mem,
1858                                     block_mem_offset,
1859                                     uchar*);
1860           keycache->blocks_used++;
1861           assert(!block->next_used);
1862         }
1863         assert(!block->prev_used);
1864         assert(!block->next_changed);
1865         assert(!block->prev_changed);
1866         assert(!block->hash_link);
1867         assert(!block->status);
1868         assert(!block->requests);
1869         keycache->blocks_unused--;
1870         block->status= BLOCK_IN_USE;
1871         block->length= 0;
1872         block->offset= keycache->key_cache_block_size;
1873         block->requests= 1;
1874         block->temperature= BLOCK_COLD;
1875         block->hits_left= init_hits_left;
1876         block->last_hit_time= 0;
1877         block->hash_link= hash_link;
1878         hash_link->block= block;
1879         link_to_file_list(keycache, block, file, 0);
1880         page_status= PAGE_TO_BE_READ;
1881       }
1882       else
1883       {
1884 	/*
1885           There are no free blocks and no never used blocks, use a block
1886           from the LRU ring.
1887         */
1888 
1889         if (! keycache->used_last)
1890         {
1891           /*
1892             The LRU ring is empty. Wait until a new block is added to
1893             it. Several threads might wait here for the same hash_link,
1894             all of them must get the same block. While waiting for a
1895             block, after a block is selected for this hash_link, other
1896             threads can run first before this one awakes. During this
1897             time interval other threads find this hash_link pointing to
1898             the block, which is still assigned to another hash_link. In
1899             this case the block is not marked BLOCK_IN_SWITCH yet, but
1900             it is marked BLOCK_IN_EVICTION.
1901           */
1902 
1903           thread->opt_info= (void *) hash_link;
1904           link_into_queue(&keycache->waiting_for_block, thread);
1905           do
1906           {
1907             mysql_cond_wait(&thread->suspend,
1908                                        &keycache->cache_lock);
1909           }
1910           while (thread->next);
1911           thread->opt_info= NULL;
1912           /* Assert that block has a request registered. */
1913           assert(hash_link->block->requests);
1914           /* Assert that block is not in LRU ring. */
1915           assert(!hash_link->block->next_used);
1916           assert(!hash_link->block->prev_used);
1917         }
1918 
1919         /*
1920           If we waited above, hash_link->block has been assigned by
1921           link_block(). Otherwise it is still NULL. In the latter case
1922           we need to grab a block from the LRU ring ourselves.
1923         */
1924         block= hash_link->block;
1925         if (! block)
1926         {
1927           /* Select the last block from the LRU ring. */
1928           block= keycache->used_last->next_used;
1929           block->hits_left= init_hits_left;
1930           block->last_hit_time= 0;
1931           hash_link->block= block;
1932           /*
1933             Register a request on the block. This unlinks it from the
1934             LRU ring and protects it against eviction.
1935           */
1936           assert(!block->requests);
1937           reg_requests(keycache, block,1);
1938           /*
1939             We do not need to set block->status|= BLOCK_IN_EVICTION here
1940             because we will set block->status|= BLOCK_IN_SWITCH
1941             immediately without releasing the lock in between. This does
1942             also support debugging. When looking at the block, one can
1943             see if the block has been selected by link_block() after the
1944             LRU ring was empty, or if it was grabbed directly from the
1945             LRU ring in this branch.
1946           */
1947         }
1948 
1949         /*
1950           If we had to wait above, there is a small chance that another
1951           thread grabbed this block for the same file block already. But
1952           in most cases the first condition is true.
1953         */
1954         if (block->hash_link != hash_link &&
1955 	    ! (block->status & BLOCK_IN_SWITCH) )
1956         {
1957 	  /* this is a primary request for a new page */
1958           block->status|= BLOCK_IN_SWITCH;
1959 
1960           if (block->status & BLOCK_CHANGED)
1961           {
1962 	    /* The block contains a dirty page - push it out of the cache */
1963 
1964             if (block->status & BLOCK_IN_FLUSH)
1965             {
1966               /*
1967                 The block is marked for flush. If we do not wait here,
1968                 it could happen that we write the block, reassign it to
1969                 another file block, then, before the new owner can read
1970                 the new file block, the flusher writes the cache block
1971                 (which still has the old contents) to the new file block!
1972               */
1973               wait_on_queue(&block->wqueue[COND_FOR_SAVED],
1974                             &keycache->cache_lock, thread);
1975               /*
1976                 The block is marked BLOCK_IN_SWITCH. It should be left
1977                 alone except for reading. No free, no write.
1978               */
1979               assert(block->status & (BLOCK_READ | BLOCK_IN_USE));
1980               assert(!(block->status & (BLOCK_REASSIGNED |
1981                                         BLOCK_CHANGED |
1982                                         BLOCK_FOR_UPDATE)));
1983             }
1984             else
1985             {
1986               block->status|= BLOCK_IN_FLUSH | BLOCK_IN_FLUSHWRITE;
1987               /*
1988                 BLOCK_IN_EVICTION may be true or not. Other flags must
1989                 have a fixed value.
1990               */
1991               assert((block->status & ~BLOCK_IN_EVICTION) ==
1992                      (BLOCK_READ | BLOCK_IN_SWITCH |
1993                       BLOCK_IN_FLUSH | BLOCK_IN_FLUSHWRITE |
1994                       BLOCK_CHANGED | BLOCK_IN_USE));
1995               assert(block->hash_link);
1996 
1997               mysql_mutex_unlock(&keycache->cache_lock);
1998               /*
1999                 The call is thread safe because only the current
2000                 thread might change the block->hash_link value
2001               */
2002               error= (int)my_pwrite(block->hash_link->file,
2003                                     block->buffer + block->offset,
2004                                     block->length - block->offset,
2005                                     block->hash_link->diskpos + block->offset,
2006                                     MYF(MY_NABP | MY_WAIT_IF_FULL));
2007               mysql_mutex_lock(&keycache->cache_lock);
2008 
2009               /* Block status must not have changed. */
2010               assert((block->status & ~BLOCK_IN_EVICTION) ==
2011                      (BLOCK_READ | BLOCK_IN_SWITCH |
2012                       BLOCK_IN_FLUSH | BLOCK_IN_FLUSHWRITE |
2013                       BLOCK_CHANGED | BLOCK_IN_USE) || fail_block(block));
2014               keycache->global_cache_write++;
2015             }
2016           }
2017 
2018           block->status|= BLOCK_REASSIGNED;
2019           /*
2020             The block comes from the LRU ring. It must have a hash_link
2021             assigned.
2022           */
2023           assert(block->hash_link);
2024           if (block->hash_link)
2025           {
2026             /*
2027               All pending requests for this page must be resubmitted.
2028               This must be done before waiting for readers. They could
2029               wait for the flush to complete. And we must also do it
2030               after the wait. Flushers might try to free the block while
2031               we wait. They would wait until the reassignment is
2032               complete. Also the block status must reflect the correct
2033               situation: The block is not changed nor in flush any more.
2034               Note that we must not change the BLOCK_CHANGED flag
2035               outside of link_to_file_list() so that it is always in the
2036               correct queue and the *blocks_changed counters are
2037               correct.
2038             */
2039             block->status&= ~(BLOCK_IN_FLUSH | BLOCK_IN_FLUSHWRITE);
2040             link_to_file_list(keycache, block, block->hash_link->file, 1);
2041             release_whole_queue(&block->wqueue[COND_FOR_SAVED]);
2042             /*
2043               The block is still assigned to its old hash_link.
2044 	      Wait until all pending read requests
2045 	      for this page are executed
2046 	      (we could have avoided this waiting, if we had read
2047 	      a page in the cache in a sweep, without yielding control)
2048             */
2049             wait_for_readers(keycache, block, thread);
2050             assert(block->hash_link && block->hash_link->block == block &&
2051                    block->prev_changed);
2052             /* The reader must not have been a writer. */
2053             assert(!(block->status & BLOCK_CHANGED));
2054 
2055             /* Wake flushers that might have found the block in between. */
2056             release_whole_queue(&block->wqueue[COND_FOR_SAVED]);
2057 
2058             /* Remove the hash link for the old file block from the hash. */
2059             unlink_hash(keycache, block->hash_link);
2060 
2061             /*
2062               For sanity checks link_to_file_list() asserts that block
2063               and hash_link refer to each other. Hence we need to assign
2064               the hash_link first, but then we would not know if it was
2065               linked before. Hence we would not know if to unlink it. So
2066               unlink it here and call link_to_file_list(..., FALSE).
2067             */
2068             unlink_changed(block);
2069           }
2070           block->status= error ? BLOCK_ERROR : BLOCK_IN_USE ;
2071           block->length= 0;
2072           block->offset= keycache->key_cache_block_size;
2073           block->hash_link= hash_link;
2074           link_to_file_list(keycache, block, file, 0);
2075           page_status= PAGE_TO_BE_READ;
2076 
2077           assert(block->hash_link->block == block);
2078           assert(hash_link->block->hash_link == hash_link);
2079         }
2080         else
2081         {
2082           /*
2083             Either (block->hash_link == hash_link),
2084 	    or     (block->status & BLOCK_IN_SWITCH).
2085 
2086             This is for secondary requests for a new file block only.
2087             Either it is already assigned to the new hash_link meanwhile
2088             (if we had to wait due to empty LRU), or it is already in
2089             eviction by another thread. Since this block has been
2090             grabbed from the LRU ring and attached to this hash_link,
2091             another thread cannot grab the same block from the LRU ring
2092             anymore. If the block is in eviction already, it must become
2093             attached to the same hash_link and as such destined for the
2094             same file block.
2095           */
2096           page_status= (((block->hash_link == hash_link) &&
2097                          (block->status & BLOCK_READ)) ?
2098                         PAGE_READ : PAGE_WAIT_TO_BE_READ);
2099         }
2100       }
2101     }
2102     else
2103     {
2104       /*
2105         Block is not NULL. This hash_link points to a block.
2106         Either
2107         - block not assigned to this hash_link (yet) or
2108         - block assigned but not yet read from file,
2109         or
2110         - block assigned with valid (changed or unchanged) data and
2111         - it will not be reassigned/freed.
2112 
2113         The first condition means hash_link points to a block in
2114         eviction. This is not necessarily marked by BLOCK_IN_SWITCH yet.
2115         But then it is marked BLOCK_IN_EVICTION. See the NOTE in
2116         link_block(). In both cases it is destined for this hash_link
2117         and its file block address. When this hash_link got its block
2118         address, the block was removed from the LRU ring and cannot be
2119         selected for eviction (for another hash_link) again.
2120 
2121         Register a request on the block. This is another protection
2122         against eviction.
2123       */
2124       assert(((block->hash_link != hash_link) &&
2125               (block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH))) ||
2126              ((block->hash_link == hash_link) &&
2127               !(block->status & BLOCK_READ)) ||
2128              ((block->status & BLOCK_READ) &&
2129               !(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH))));
2130       reg_requests(keycache, block, 1);
2131       page_status= (((block->hash_link == hash_link) &&
2132                      (block->status & BLOCK_READ)) ?
2133                     PAGE_READ : PAGE_WAIT_TO_BE_READ);
2134     }
2135   }
2136 
2137   assert(page_status != -1);
2138   /* Same assert basically, but be very sure. */
2139   assert(block);
2140   /* Assert that block has a request and is not in LRU ring. */
2141   assert(block->requests);
2142   assert(!block->next_used);
2143   assert(!block->prev_used);
2144   /* Assert that we return the correct block. */
2145   assert((page_status == PAGE_WAIT_TO_BE_READ) ||
2146          ((block->hash_link->file == file) &&
2147           (block->hash_link->diskpos == filepos)));
2148   *page_st=page_status;
2149   DBUG_RETURN(block);
2150 }
2151 
2152 
/*
  Read into a key cache block buffer from disk.

  SYNOPSIS

    read_block()
      keycache            pointer to a key cache data structure
      thread_var          pointer to thread specific variables
      block               block to which buffer the data is to be read
      read_length         size of data to be read
      min_length          at least so much data must be read
      primary             <-> the current thread will read the data

  RETURN VALUE
    None

  NOTES.
    The function either reads page data from file into the block buffer,
    or waits until another thread reads it. What page to read is determined
    by a block parameter - reference to a hash link for this page.
    If an error occurs, the BLOCK_ERROR bit is set in the block status.
    We do not report an error when the size of the successfully read
    portion is less than read_length, but not less than min_length.

    Must be called with keycache->cache_lock held; the lock is released
    around the actual file I/O and re-acquired before returning.
*/

static void read_block(KEY_CACHE *keycache,
                       st_keycache_thread_var *thread_var,
                       BLOCK_LINK *block, uint read_length,
                       uint min_length, my_bool primary)
{
  size_t got_length;

  /* On entry cache_lock is locked */

  if (primary)
  {
    /*
      This code is executed only by threads that submitted primary
      requests. Until block->status contains BLOCK_READ, all other
      requests for the block become secondary requests. For a primary
      request the block must be properly initialized: in use, empty
      (length 0, offset at block size), and with at least one request
      registered on it.
    */
    assert(((block->status & ~BLOCK_FOR_UPDATE) == BLOCK_IN_USE) ||
           fail_block(block));
    assert((block->length == 0) || fail_block(block));
    assert((block->offset == keycache->key_cache_block_size) ||
           fail_block(block));
    assert((block->requests > 0) || fail_block(block));

    keycache->global_cache_read++;
    /* Page is not in buffer yet, is to be read from disk */
    mysql_mutex_unlock(&keycache->cache_lock);
    /*
      Here other threads may step in and register as secondary readers.
      They will register in block->wqueue[COND_FOR_REQUESTED].
    */
    got_length= my_pread(block->hash_link->file, block->buffer,
                         read_length, block->hash_link->diskpos, MYF(0));
    mysql_mutex_lock(&keycache->cache_lock);
    /*
      The block can now have been marked for free (in case of
      FLUSH_RELEASE). Otherwise the state must be unchanged: while we
      held no lock, only the BLOCK_REASSIGNED and BLOCK_FOR_UPDATE bits
      may have been toggled by other threads.
    */
    assert(((block->status & ~(BLOCK_REASSIGNED |
                               BLOCK_FOR_UPDATE)) == BLOCK_IN_USE) ||
           fail_block(block));
    assert((block->length == 0) || fail_block(block));
    assert((block->offset == keycache->key_cache_block_size) ||
           fail_block(block));
    assert((block->requests > 0) || fail_block(block));

    /*
      NOTE(review): if my_pread() signals a hard error by returning
      MY_FILE_ERROR ((size_t) -1), that value is not less than
      min_length, so the check below would NOT set BLOCK_ERROR.
      Confirm my_pread()'s error-return convention with MYF(0) against
      mysys before relying on this path for hard I/O failures.
    */
    if (got_length < min_length)
      block->status|= BLOCK_ERROR;
    else
    {
      block->status|= BLOCK_READ;
      block->length= (int)got_length;
      /*
        Do not set block->offset here. If this block is marked
        BLOCK_CHANGED later, we want to flush only the modified part. So
        only a writer may set block->offset down from
        keycache->key_cache_block_size.
      */
    }
    /* Signal that all pending requests for this page now can be processed */
    release_whole_queue(&block->wqueue[COND_FOR_REQUESTED]);
  }
  else
  {
    /*
      This code is executed only by threads that submitted secondary
      requests. At this point it could happen that the cache block is
      not yet assigned to the hash_link for the requested file block.
      But at awake from the wait this should be the case. Unfortunately
      we cannot assert this here because we do not know the hash_link
      for the requested file block nor the file and position. So we have
      to assert this in the caller.
    */
    wait_on_queue(&block->wqueue[COND_FOR_REQUESTED], &keycache->cache_lock,
                  thread_var);
  }
}
2255 
2256 
2257 /*
2258   Read a block of data from a cached file into a buffer;
2259 
2260   SYNOPSIS
2261 
2262     key_cache_read()
2263       keycache            pointer to a key cache data structure
2264       thread_var          pointer to thread specific variables
2265       file                handler for the file for the block of data to be read
2266       filepos             position of the block of data in the file
2267       level               determines the weight of the data
2268       buff                buffer to where the data must be placed
2269       length              length of the buffer
2270       block_length        length of the block in the key cache buffer
2271       return_buffer       return pointer to the key cache buffer with the data
2272 
2273   RETURN VALUE
    Returns address from where the data is placed if successful, 0 - otherwise.
2275 
2276   NOTES.
2277     The function ensures that a block of data of size length from file
2278     positioned at filepos is in the buffers for some key cache blocks.
2279     Then the function either copies the data into the buffer buff, or,
2280     if return_buffer is TRUE, it just returns the pointer to the key cache
2281     buffer with the data.
2282     Filepos must be a multiple of 'block_length', but it doesn't
2283     have to be a multiple of key_cache_block_size;
2284 */
2285 
/*
  Read a block of data from a cached file into a buffer.

  RETURN VALUE
    Returns address from where the data is placed if successful,
    0 - otherwise.
*/

uchar *key_cache_read(KEY_CACHE *keycache,
                      st_keycache_thread_var *thread_var,
                      File file, my_off_t filepos, int level,
                      uchar *buff, uint length,
                      uint block_length MY_ATTRIBUTE((unused)),
                      int return_buffer MY_ATTRIBUTE((unused)))
{
  /* TRUE while cache_lock is held and 'cnt_for_resize_op' was incremented. */
  my_bool locked_and_incremented= FALSE;
  int error=0;
  uchar *start= buff;   /* Original buffer start; returned on success. */
  DBUG_ENTER("key_cache_read");
  DBUG_PRINT("enter", ("fd: %u  pos: %lu  length: %u",
               (uint) file, (ulong) filepos, length));

  if (keycache->key_cache_inited)
  {
    /* Key cache is used */
    BLOCK_LINK *block;
    uint read_length;   /* Bytes handled in the current loop iteration. */
    uint offset;        /* Offset of filepos within its cache block. */
    int page_st;        /* Page status reported by find_key_block(). */

    if (MYSQL_KEYCACHE_READ_START_ENABLED())
    {
      MYSQL_KEYCACHE_READ_START(my_filename(file), length,
                                (ulong) (keycache->blocks_used *
                                         keycache->key_cache_block_size),
                                (ulong) (keycache->blocks_unused *
                                         keycache->key_cache_block_size));
    }

    /*
      When the key cache is once initialized, we use the cache_lock to
      reliably distinguish the cases of normal operation, resizing, and
      disabled cache. We always increment and decrement
      'cnt_for_resize_op' so that a resizer can wait for pending I/O.
    */
    mysql_mutex_lock(&keycache->cache_lock);
    /*
      Cache resizing has two phases: Flushing and re-initializing. In
      the flush phase read requests are allowed to bypass the cache for
      blocks not in the cache. find_key_block() returns NULL in this
      case.

      After the flush phase new I/O requests must wait until the
      re-initialization is done. The re-initialization can be done only
      if no I/O request is in progress. The reason is that
      key_cache_block_size can change. With enabled cache, I/O is done
      in chunks of key_cache_block_size. Every chunk tries to use a
      cache block first. If the block size changes in the middle, a
      block could be missed and old data could be read.
    */
    while (keycache->in_resize && !keycache->resize_in_flush)
      wait_on_queue(&keycache->resize_queue, &keycache->cache_lock,
                    thread_var);
    /* Register the I/O for the next resize. */
    inc_counter_for_resize_op(keycache);
    locked_and_incremented= TRUE;
    /* Requested data may not always be aligned to cache blocks. */
    offset= (uint) (filepos % keycache->key_cache_block_size);
    /* Read data in key_cache_block_size increments */
    do
    {
      /* Cache could be disabled in a later iteration. */
      if (!keycache->can_be_used)
      {
        /* Read the remaining data directly from file. */
        goto no_key_cache;
      }
      /* Start reading at the beginning of the cache block. */
      filepos-= offset;
      /* Do not read beyond the end of the cache block. */
      read_length= length;
      set_if_smaller(read_length, keycache->key_cache_block_size-offset);
      assert(read_length > 0);

      /* Returning the cache buffer directly is impossible for reads that
         are not aligned to a whole cache block. */
      if (block_length > keycache->key_cache_block_size || offset)
        return_buffer=0;

      /* Request the cache block that matches file/pos. */
      keycache->global_cache_r_requests++;

      MYSQL_KEYCACHE_READ_BLOCK(keycache->key_cache_block_size);

      block= find_key_block(keycache, thread_var, file, filepos, level, 0,
                            &page_st);
      if (!block)
      {
        /*
          This happens only for requests submitted during key cache
          resize. The block is not in the cache and shall not go in.
          Read directly from file.
        */
        keycache->global_cache_read++;
        mysql_mutex_unlock(&keycache->cache_lock);
        error= (my_pread(file, (uchar*) buff, read_length,
                         filepos + offset, MYF(MY_NABP)) != 0);
        mysql_mutex_lock(&keycache->cache_lock);
        goto next_block;
      }
      if (!(block->status & BLOCK_ERROR))
      {
        if (page_st != PAGE_READ)
        {
          MYSQL_KEYCACHE_READ_MISS();
          /* The requested page is to be read into the block buffer */
          read_block(keycache, thread_var, block,
                     keycache->key_cache_block_size, read_length+offset,
                     (my_bool)(page_st == PAGE_TO_BE_READ));
          /*
            A secondary request must now have the block assigned to the
            requested file block. It does not hurt to check it for
            primary requests too.
          */
          assert(keycache->can_be_used);
          assert(block->hash_link->file == file);
          assert(block->hash_link->diskpos == filepos);
          assert(block->status & (BLOCK_READ | BLOCK_IN_USE));
        }
        else if (block->length < read_length + offset)
        {
          /*
            Impossible if nothing goes wrong:
            this could only happen if we are using a file with
            small key blocks and are trying to read outside the file
          */
          set_my_errno(-1);
          block->status|= BLOCK_ERROR;
        }
        else
        {
          MYSQL_KEYCACHE_READ_HIT();
        }
      }

      /* block status may have added BLOCK_ERROR in the above 'if'. */
      if (!(block->status & BLOCK_ERROR))
      {
        {
          assert(block->status & (BLOCK_READ | BLOCK_IN_USE));
          /* Release the lock while copying; our registered request on the
             block keeps it from being evicted meanwhile. */
          mysql_mutex_unlock(&keycache->cache_lock);

          /* Copy data from the cache buffer */
          memcpy(buff, block->buffer+offset, (size_t) read_length);

          mysql_mutex_lock(&keycache->cache_lock);
          assert(block->status & (BLOCK_READ | BLOCK_IN_USE));
        }
      }

      remove_reader(block);

      /* Error injection for coverage testing. */
      DBUG_EXECUTE_IF("key_cache_read_block_error",
                      block->status|= BLOCK_ERROR;);

      /* Do not link erroneous blocks into the LRU ring, but free them. */
      if (!(block->status & BLOCK_ERROR))
      {
        /*
          Link the block into the LRU ring if it's the last submitted
          request for the block. This enables eviction for the block.
        */
        unreg_request(keycache, block, 1);
      }
      else
      {
        free_block(keycache, thread_var, block);
        error= 1;
        break;
      }

    next_block:
      /* Advance to the start of the next cache block. */
      buff+= read_length;
      filepos+= read_length+offset;
      offset= 0;

    } while ((length-= read_length));
    if (MYSQL_KEYCACHE_READ_DONE_ENABLED())
    {
      MYSQL_KEYCACHE_READ_DONE((ulong) (keycache->blocks_used *
                                        keycache->key_cache_block_size),
                               (ulong) (keycache->blocks_unused *
                                        keycache->key_cache_block_size));
    }
    goto end;
  }

no_key_cache:
  /* Key cache is not used */

  keycache->global_cache_r_requests++;
  keycache->global_cache_read++;

  /* Release the lock around the direct file read; re-acquire before 'end:'. */
  if (locked_and_incremented)
    mysql_mutex_unlock(&keycache->cache_lock);
  if (my_pread(file, (uchar*) buff, length, filepos, MYF(MY_NABP)))
    error= 1;
  if (locked_and_incremented)
    mysql_mutex_lock(&keycache->cache_lock);

end:
  if (locked_and_incremented)
  {
    dec_counter_for_resize_op(keycache);
    mysql_mutex_unlock(&keycache->cache_lock);
  }
  DBUG_PRINT("exit", ("error: %d", error ));
  DBUG_RETURN(error ? (uchar*) 0 : start);
}
2495 
2496 
2497 /*
2498   Insert a block of file data from a buffer into key cache
2499 
2500   SYNOPSIS
2501     key_cache_insert()
2502     keycache            pointer to a key cache data structure
2503     thread_var          pointer to thread specific variables
2504     file                handler for the file to insert data from
2505     filepos             position of the block of data in the file to insert
2506     level               determines the weight of the data
2507     buff                buffer to read data from
2508     length              length of the data in the buffer
2509 
2510   NOTES
2511     This is used by MyISAM to move all blocks from a index file to the key
2512     cache
2513 
2514   RETURN VALUE
2515     0 if a success, 1 - otherwise.
2516 */
2517 
int key_cache_insert(KEY_CACHE *keycache,
                     st_keycache_thread_var *thread_var,
                     File file, my_off_t filepos, int level,
                     uchar *buff, uint length)
{
  int error= 0;
  DBUG_ENTER("key_cache_insert");
  DBUG_PRINT("enter", ("fd: %u  pos: %lu  length: %u",
               (uint) file,(ulong) filepos, length));

  if (keycache->key_cache_inited)
  {
    /* Key cache is used */
    BLOCK_LINK *block;
    uint read_length;   /* Bytes handled in the current loop iteration. */
    uint offset;        /* Offset of filepos within its cache block. */
    int page_st;        /* Page status reported by find_key_block(). */
    my_bool locked_and_incremented= FALSE;

    /*
      When the keycache is once initialized, we use the cache_lock to
      reliably distinguish the cases of normal operation, resizing, and
      disabled cache. We always increment and decrement
      'cnt_for_resize_op' so that a resizer can wait for pending I/O.
    */
    mysql_mutex_lock(&keycache->cache_lock);
    /*
      We do not load index data into a disabled cache nor into an
      ongoing resize. Stopping the preload is not an error ('error'
      stays 0).
    */
    if (!keycache->can_be_used || keycache->in_resize)
        goto no_key_cache;
    /* Register the pseudo I/O for the next resize. */
    inc_counter_for_resize_op(keycache);
    locked_and_incremented= TRUE;
    /* Loaded data may not always be aligned to cache blocks. */
    offset= (uint) (filepos % keycache->key_cache_block_size);
    /* Load data in key_cache_block_size increments. */
    do
    {
      /* Cache could be disabled or resizing in a later iteration. */
      if (!keycache->can_be_used || keycache->in_resize)
        goto no_key_cache;
      /* Start loading at the beginning of the cache block. */
      filepos-= offset;
      /* Do not load beyond the end of the cache block. */
      read_length= length;
      set_if_smaller(read_length, keycache->key_cache_block_size-offset);
      assert(read_length > 0);

      /* The block has been read by the caller already. */
      keycache->global_cache_read++;
      /* Request the cache block that matches file/pos. */
      keycache->global_cache_r_requests++;
      block= find_key_block(keycache, thread_var, file, filepos, level, 0,
                            &page_st);
      if (!block)
      {
        /*
          This happens only for requests submitted during key cache
          resize. The block is not in the cache and shall not go in.
          Stop loading index data.
        */
        goto no_key_cache;
      }
      if (!(block->status & BLOCK_ERROR))
      {
        if ((page_st == PAGE_WAIT_TO_BE_READ) ||
            ((page_st == PAGE_TO_BE_READ) &&
             (offset || (read_length < keycache->key_cache_block_size))))
        {
          /*
            Either

            this is a secondary request for a block to be read into the
            cache. The block is in eviction. It is not yet assigned to
            the requested file block (It does not point to the right
            hash_link). So we cannot call remove_reader() on the block.
            And we cannot access the hash_link directly here. We need to
            wait until the assignment is complete. read_block() executes
            the correct wait when called with primary == FALSE.

            Or

            this is a primary request for a block to be read into the
            cache and the supplied data does not fill the whole block.

            This function is called on behalf of a LOAD INDEX INTO CACHE
            statement, which is a read-only task and allows other
            readers. It is possible that a parallel running reader tries
            to access this block. If it needs more data than has been
            supplied here, it would report an error. To be sure that we
            have all data in the block that is available in the file, we
            read the block ourselves.

            Though reading again what the caller did read already is an
            expensive operation, we need to do this for correctness.
          */
          read_block(keycache, thread_var, block,
                     keycache->key_cache_block_size,
                     read_length + offset, (page_st == PAGE_TO_BE_READ));
          /*
            A secondary request must now have the block assigned to the
            requested file block. It does not hurt to check it for
            primary requests too.
          */
          assert(keycache->can_be_used);
          assert(block->hash_link->file == file);
          assert(block->hash_link->diskpos == filepos);
          assert(block->status & (BLOCK_READ | BLOCK_IN_USE));
        }
        else if (page_st == PAGE_TO_BE_READ)
        {
          /*
            This is a new block in the cache. If we come here, we have
            data for the whole block.
          */
          assert(block->hash_link->requests);
          assert(block->status & BLOCK_IN_USE);
          assert((page_st == PAGE_TO_BE_READ) ||
                 (block->status & BLOCK_READ));

          mysql_mutex_unlock(&keycache->cache_lock);
          /*
            Here other threads may step in and register as secondary readers.
            They will register in block->wqueue[COND_FOR_REQUESTED].
          */

          /* Copy data from buff */
          memcpy(block->buffer+offset, buff, (size_t) read_length);

          mysql_mutex_lock(&keycache->cache_lock);
          assert(block->status & BLOCK_IN_USE);
          assert((page_st == PAGE_TO_BE_READ) ||
                 (block->status & BLOCK_READ));
          /*
            After the data is in the buffer, we can declare the block
            valid. Now other threads do not need to register as
            secondary readers any more. They can immediately access the
            block.
          */
          block->status|= BLOCK_READ;
          block->length= read_length+offset;
          /*
            Do not set block->offset here. If this block is marked
            BLOCK_CHANGED later, we want to flush only the modified part. So
            only a writer may set block->offset down from
            keycache->key_cache_block_size.
          */
          /* Signal all pending requests. */
          release_whole_queue(&block->wqueue[COND_FOR_REQUESTED]);
        }
        else
        {
          /*
            page_st == PAGE_READ. The block is in the buffer. All data
            must already be present. Blocks are always read with all
            data available on file. Assert that the block does not have
            less contents than the preloader supplies. If the caller has
            data beyond block->length, it means that a file write has
            been done while this block was in cache and not extended
            with the new data. If the condition is met, we can simply
            ignore the block.
          */
          assert((page_st == PAGE_READ) &&
                 (read_length + offset <= block->length));
        }

        /*
          A secondary request must now have the block assigned to the
          requested file block. It does not hurt to check it for primary
          requests too.
        */
        assert(block->hash_link->file == file);
        assert(block->hash_link->diskpos == filepos);
        assert(block->status & (BLOCK_READ | BLOCK_IN_USE));
      } /* end of if (!(block->status & BLOCK_ERROR)) */

      remove_reader(block);

      /* Error injection for coverage testing. */
      DBUG_EXECUTE_IF("key_cache_insert_block_error",
                      block->status|= BLOCK_ERROR; errno=EIO;);

      /* Do not link erroneous blocks into the LRU ring, but free them. */
      if (!(block->status & BLOCK_ERROR))
      {
        /*
          Link the block into the LRU ring if it's the last submitted
          request for the block. This enables eviction for the block.
        */
        unreg_request(keycache, block, 1);
      }
      else
      {
        free_block(keycache, thread_var, block);
        error= 1;
        break;
      }

      /* Advance to the start of the next cache block. */
      buff+= read_length;
      filepos+= read_length+offset;
      offset= 0;

    } while ((length-= read_length));

  no_key_cache:
    /* The cache_lock is always held when we arrive here. */
    if (locked_and_incremented)
      dec_counter_for_resize_op(keycache);
    mysql_mutex_unlock(&keycache->cache_lock);
  }
  DBUG_RETURN(error);
}
2731 
2732 
2733 /*
2734   Write a buffer into a cached file.
2735 
2736   SYNOPSIS
2737 
2738     key_cache_write()
2739       keycache            pointer to a key cache data structure
2740       thread_var          pointer to thread specific variables
2741       file                handler for the file to write data to
2742       filepos             position in the file to write data to
2743       level               determines the weight of the data
2744       buff                buffer with the data
2745       length              length of the buffer
      dont_write          if it is 0 then all dirty pages involved in writing
                          should have been flushed from key cache
2748 
2749   RETURN VALUE
2750     0 if a success, 1 - otherwise.
2751 
2752   NOTES.
    The function copies the data of size length from buff into buffers
    for key cache blocks that are assigned to contain the portion of
    the file starting with position filepos.
    It ensures that this data is flushed to the file if dont_write is FALSE.
    Filepos must be a multiple of 'block_length', but it doesn't
    have to be a multiple of key_cache_block_size.
2759 
2760     dont_write is always TRUE in the server (info->lock_type is never F_UNLCK).
2761 */
2762 
int key_cache_write(KEY_CACHE *keycache,
                    st_keycache_thread_var *thread_var,
                    File file, my_off_t filepos, int level,
                    uchar *buff, uint length,
                    uint block_length  MY_ATTRIBUTE((unused)),
                    int dont_write)
{
  /* TRUE while cache_lock is held and 'cnt_for_resize_op' was incremented. */
  my_bool locked_and_incremented= FALSE;
  int error=0;
  DBUG_ENTER("key_cache_write");
  DBUG_PRINT("enter",
             ("fd: %u  pos: %lu  length: %u  block_length: %u"
              "  key_block_length: %u",
              (uint) file, (ulong) filepos, length, block_length,
              keycache ? keycache->key_cache_block_size : 0));

  if (!dont_write)
  {
    /* purecov: begin inspected */
    /* Not used in the server. */
    /* Force writing from buff into disk. */
    keycache->global_cache_w_requests++;
    keycache->global_cache_write++;
    if (my_pwrite(file, buff, length, filepos, MYF(MY_NABP | MY_WAIT_IF_FULL)))
      DBUG_RETURN(1);
    /* purecov: end */
  }

  if (keycache->key_cache_inited)
  {
    /* Key cache is used */
    BLOCK_LINK *block;
    uint read_length;   /* Bytes handled in the current loop iteration. */
    uint offset;        /* Offset of filepos within its cache block. */
    int page_st;        /* Page status reported by find_key_block(). */

    if (MYSQL_KEYCACHE_WRITE_START_ENABLED())
    {
      MYSQL_KEYCACHE_WRITE_START(my_filename(file), length,
                                 (ulong) (keycache->blocks_used *
                                          keycache->key_cache_block_size),
                                 (ulong) (keycache->blocks_unused *
                                          keycache->key_cache_block_size));
    }

    /*
      When the key cache is once initialized, we use the cache_lock to
      reliably distinguish the cases of normal operation, resizing, and
      disabled cache. We always increment and decrement
      'cnt_for_resize_op' so that a resizer can wait for pending I/O.
    */
    mysql_mutex_lock(&keycache->cache_lock);
    /*
      Cache resizing has two phases: Flushing and re-initializing. In
      the flush phase write requests can modify dirty blocks that are
      not yet in flush. Otherwise they are allowed to bypass the cache.
      find_key_block() returns NULL in both cases (clean blocks and
      non-cached blocks).

      After the flush phase new I/O requests must wait until the
      re-initialization is done. The re-initialization can be done only
      if no I/O request is in progress. The reason is that
      key_cache_block_size can change. With enabled cache I/O is done in
      chunks of key_cache_block_size. Every chunk tries to use a cache
      block first. If the block size changes in the middle, a block
      could be missed and data could be written below a cached block.
    */
    while (keycache->in_resize && !keycache->resize_in_flush)
      wait_on_queue(&keycache->resize_queue, &keycache->cache_lock,
                    thread_var);
    /* Register the I/O for the next resize. */
    inc_counter_for_resize_op(keycache);
    locked_and_incremented= TRUE;
    /* Requested data may not always be aligned to cache blocks. */
    offset= (uint) (filepos % keycache->key_cache_block_size);
    /* Write data in key_cache_block_size increments. */
    do
    {
      /* Cache could be disabled in a later iteration. */
      if (!keycache->can_be_used)
        goto no_key_cache;

      MYSQL_KEYCACHE_WRITE_BLOCK(keycache->key_cache_block_size);
      /* Start writing at the beginning of the cache block. */
      filepos-= offset;
      /* Do not write beyond the end of the cache block. */
      read_length= length;
      set_if_smaller(read_length, keycache->key_cache_block_size-offset);
      assert(read_length > 0);

      /* Request the cache block that matches file/pos. */
      keycache->global_cache_w_requests++;
      block= find_key_block(keycache, thread_var, file, filepos, level, 1,
                            &page_st);
      if (!block)
      {
        /*
          This happens only for requests submitted during key cache
          resize. The block is not in the cache and shall not go in.
          Write directly to file.
        */
        if (dont_write)
        {
          /* Used in the server. */
          keycache->global_cache_write++;
          mysql_mutex_unlock(&keycache->cache_lock);
          if (my_pwrite(file, (uchar*) buff, read_length, filepos + offset,
                        MYF(MY_NABP | MY_WAIT_IF_FULL)))
            error=1;
          mysql_mutex_lock(&keycache->cache_lock);
        }
        goto next_block;
      }
      /*
        Prevent block from flushing and from being selected for to be
        freed. This must be set when we release the cache_lock.
        However, we must not set the status of the block before it is
        assigned to this file/pos.
      */
      if (page_st != PAGE_WAIT_TO_BE_READ)
        block->status|= BLOCK_FOR_UPDATE;
      /*
        We must read the file block first if it is not yet in the cache
        and we do not replace all of its contents.

        In cases where the cache block is big enough to contain (parts
        of) index blocks of different indexes, our request can be
        secondary (PAGE_WAIT_TO_BE_READ). In this case another thread is
        reading the file block. If the read completes after us, it
        overwrites our new contents with the old contents. So we have to
        wait for the other thread to complete the read of this block.
        read_block() takes care for the wait.
      */
      if (!(block->status & BLOCK_ERROR) &&
          ((page_st == PAGE_TO_BE_READ &&
            (offset || read_length < keycache->key_cache_block_size)) ||
           (page_st == PAGE_WAIT_TO_BE_READ)))
      {
        read_block(keycache, thread_var, block,
                   offset + read_length >= keycache->key_cache_block_size?
                   offset : keycache->key_cache_block_size,
                   offset, (page_st == PAGE_TO_BE_READ));
        assert(keycache->can_be_used);
        assert(block->status & (BLOCK_READ | BLOCK_IN_USE));
        /*
          Prevent block from flushing and from being selected for to be
          freed. This must be set when we release the cache_lock.
          Here we set it in case we could not set it above.
        */
        block->status|= BLOCK_FOR_UPDATE;
      }
      /*
        The block should always be assigned to the requested file block
        here. It need not be BLOCK_READ when overwriting the whole block.
      */
      assert(block->hash_link->file == file);
      assert(block->hash_link->diskpos == filepos);
      assert(block->status & BLOCK_IN_USE);
      assert((page_st == PAGE_TO_BE_READ) || (block->status & BLOCK_READ));
      /*
        The block to be written must not be marked BLOCK_REASSIGNED.
        Otherwise it could be freed in dirty state or reused without
        another flush during eviction. It must also not be in flush.
        Otherwise the old contents may have been flushed already and
        the flusher could clear BLOCK_CHANGED without flushing the
        new changes again.
      */
      assert(!(block->status & BLOCK_REASSIGNED));

      while (block->status & BLOCK_IN_FLUSHWRITE)
      {
        /*
          Another thread is flushing the block. It was dirty already.
          Wait until the block is flushed to file. Otherwise we could
          modify the buffer contents just while it is written to file.
          An unpredictable file block contents would be the result.
          While we wait, several things can happen to the block,
          including another flush. But the block cannot be reassigned to
          another hash_link until we release our request on it.
        */
        wait_on_queue(&block->wqueue[COND_FOR_SAVED], &keycache->cache_lock,
                      thread_var);
        assert(keycache->can_be_used);
        assert(block->status & (BLOCK_READ | BLOCK_IN_USE));
        /* Still must not be marked for free. */
        assert(!(block->status & BLOCK_REASSIGNED));
        assert(block->hash_link && (block->hash_link->block == block));
      }

      /*
        We could perhaps release the cache_lock during access of the
        data like in the other functions. Locks outside of the key cache
        assure that readers and a writer do not access the same range of
        data. Parallel accesses should happen only if the cache block
        contains multiple index block(fragment)s. So different parts of
        the buffer would be read/written. An attempt to flush during
        memcpy() is prevented with BLOCK_FOR_UPDATE.
      */
      if (!(block->status & BLOCK_ERROR))
      {
        mysql_mutex_unlock(&keycache->cache_lock);
        memcpy(block->buffer+offset, buff, (size_t) read_length);

        mysql_mutex_lock(&keycache->cache_lock);
      }

      if (!dont_write)
      {
        /* Not used in the server. buff has been written to disk at start. */
        if ((block->status & BLOCK_CHANGED) &&
            (!offset && read_length >= keycache->key_cache_block_size))
             link_to_file_list(keycache, block, block->hash_link->file, 1);
      }
      else if (! (block->status & BLOCK_CHANGED))
        link_to_changed_list(keycache, block);
      block->status|=BLOCK_READ;
      /*
        Allow block to be selected for to be freed. Since it is marked
        BLOCK_CHANGED too, it won't be selected for to be freed without
        a flush.
      */
      block->status&= ~BLOCK_FOR_UPDATE;
      set_if_smaller(block->offset, offset);
      set_if_bigger(block->length, read_length+offset);

      /* Threads may be waiting for the changes to be complete. */
      release_whole_queue(&block->wqueue[COND_FOR_REQUESTED]);

      /*
        If only a part of the cache block is to be replaced, and the
        rest has been read from file, then the cache lock has been
        released for I/O and it could be possible that another thread
        wants to evict or free the block and waits for it to be
        released. So we must not just decrement hash_link->requests, but
        also wake a waiting thread.
      */
      remove_reader(block);

      /* Error injection for coverage testing. */
      DBUG_EXECUTE_IF("key_cache_write_block_error",
                      block->status|= BLOCK_ERROR;);

      /* Do not link erroneous blocks into the LRU ring, but free them. */
      if (!(block->status & BLOCK_ERROR))
      {
        /*
          Link the block into the LRU ring if it's the last submitted
          request for the block. This enables eviction for the block.
        */
        unreg_request(keycache, block, 1);
      }
      else
      {
        /* Pretend a "clean" block to avoid complications. */
        block->status&= ~(BLOCK_CHANGED);
        free_block(keycache, thread_var, block);
        error= 1;
        break;
      }

    next_block:
      /* Advance to the start of the next cache block. */
      buff+= read_length;
      filepos+= read_length+offset;
      offset= 0;

    } while ((length-= read_length));
    goto end;
  }

no_key_cache:
  /* Key cache is not used */
  if (dont_write)
  {
    /* Used in the server. */
    keycache->global_cache_w_requests++;
    keycache->global_cache_write++;
    /* Release the lock around the direct file write. */
    if (locked_and_incremented)
      mysql_mutex_unlock(&keycache->cache_lock);
    if (my_pwrite(file, (uchar*) buff, length, filepos,
                  MYF(MY_NABP | MY_WAIT_IF_FULL)))
      error=1;
    if (locked_and_incremented)
      mysql_mutex_lock(&keycache->cache_lock);
  }

end:
  if (locked_and_incremented)
  {
    dec_counter_for_resize_op(keycache);
    mysql_mutex_unlock(&keycache->cache_lock);
  }

  if (MYSQL_KEYCACHE_WRITE_DONE_ENABLED())
  {
    MYSQL_KEYCACHE_WRITE_DONE((ulong) (keycache->blocks_used *
                                       keycache->key_cache_block_size),
                              (ulong) (keycache->blocks_unused *
                                       keycache->key_cache_block_size));
  }

  DBUG_RETURN(error);
}
3065 
3066 
3067 /*
3068   Free block.
3069 
3070   SYNOPSIS
3071     free_block()
3072       keycache          Pointer to a key cache data structure
3073       thread_var        Pointer to thread specific variables
3074       block             Pointer to the block to free
3075 
3076   DESCRIPTION
3077     Remove reference to block from hash table.
3078     Remove block from the chain of clean blocks.
3079     Add block to the free list.
3080 
3081   NOTE
3082     Block must not be free (status == 0).
3083     Block must not be in free_block_list.
3084     Block must not be in the LRU ring.
3085     Block must not be in eviction (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH).
3086     Block must not be in free (BLOCK_REASSIGNED).
3087     Block must not be in flush (BLOCK_IN_FLUSH).
3088     Block must not be dirty (BLOCK_CHANGED).
3089     Block must not be in changed_blocks (dirty) hash.
3090     Block must be in file_blocks (clean) hash.
3091     Block must refer to a hash_link.
3092     Block must have a request registered on it.
3093 */
3094 
static void free_block(KEY_CACHE *keycache,
                       st_keycache_thread_var *thread_var,
                       BLOCK_LINK *block)
{
  /*
    Assert that the block is not free already. And that it is in a clean
    state. Note that the block might just be assigned to a hash_link and
    not yet read (BLOCK_READ may not be set here). In this case a reader
    is registered in the hash_link and free_block() will wait for it
    below.
  */
  assert((block->status & BLOCK_IN_USE) &&
         !(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH |
                            BLOCK_REASSIGNED | BLOCK_IN_FLUSH |
                            BLOCK_CHANGED | BLOCK_FOR_UPDATE)));
  /* Assert that the block is in a file_blocks chain. */
  assert(block->prev_changed && *block->prev_changed == block);
  /* Assert that the block is not in the LRU ring. */
  assert(!block->next_used && !block->prev_used);
  /*
    IMHO the below condition (if()) makes no sense. I can't see how it
    could be possible that free_block() is entered with a NULL hash_link
    pointer. The only place where it can become NULL is in free_block()
    (or before its first use ever, but for those blocks free_block() is
    not called). I don't remove the conditional as it cannot harm, but
    place an assert to confirm my hypothesis. Eventually the
    condition (if()) can be removed.
  */
  assert(block->hash_link && block->hash_link->block == block);
  if (block->hash_link)
  {
    /*
      While waiting for readers to finish, new readers might request the
      block. But since we set block->status|= BLOCK_REASSIGNED, they
      will wait on block->wqueue[COND_FOR_SAVED]. They must be signalled
      later.
    */
    block->status|= BLOCK_REASSIGNED;
    /* May release and re-acquire keycache->cache_lock while waiting. */
    wait_for_readers(keycache, block, thread_var);
    /*
      The block must not have been freed by another thread. Repeat some
      checks. An additional requirement is that it must be read now
      (BLOCK_READ).
    */
    assert(block->hash_link && block->hash_link->block == block);
    assert((block->status & (BLOCK_READ | BLOCK_IN_USE |
                             BLOCK_REASSIGNED)) &&
           !(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH |
                              BLOCK_IN_FLUSH | BLOCK_CHANGED |
                              BLOCK_FOR_UPDATE)));
    assert(block->prev_changed && *block->prev_changed == block);
    assert(!block->prev_used);
    /*
      Unset BLOCK_REASSIGNED again. If we hand the block to an evicting
      thread (through unreg_request() below), other threads must not see
      this flag. They could become confused.
    */
    block->status&= ~BLOCK_REASSIGNED;
    /*
      Do not release the hash_link until the block is off all lists.
      At least not if we hand it over for eviction in unreg_request().
    */
  }

  /*
    Unregister the block request and link the block into the LRU ring.
    This enables eviction for the block. If the LRU ring was empty and
    threads are waiting for a block, then the block will be handed over
    for eviction immediately. Otherwise we will unlink it from the LRU
    ring again, without releasing the lock in between. So decrementing
    the request counter and updating statistics are the only relevant
    operation in this case. Assert that there are no other requests
    registered.
  */
  assert(block->requests == 1);
  unreg_request(keycache, block, 0);
  /*
    Note that even without releasing the cache lock it is possible that
    the block is immediately selected for eviction by link_block() and
    thus not added to the LRU ring. In this case we must not touch the
    block any more.
  */
  if (block->status & BLOCK_IN_EVICTION)
    return;

  /* Error blocks are not put into the LRU ring. */
  if (!(block->status & BLOCK_ERROR))
  {
    /* Here the block must be in the LRU ring. Unlink it again. */
    assert(block->next_used && block->prev_used &&
           *block->prev_used == block);
    unlink_block(keycache, block);
  }
  /* Freed blocks no longer count as warm; update the statistics. */
  if (block->temperature == BLOCK_WARM)
    keycache->warm_blocks--;
  block->temperature= BLOCK_COLD;

  /* Remove from file_blocks hash. */
  unlink_changed(block);

  /* Remove reference to block from hash table. */
  unlink_hash(keycache, block->hash_link);
  block->hash_link= NULL;

  /* Reset the block to its pristine "free" state. */
  block->status= 0;
  block->length= 0;
  block->offset= keycache->key_cache_block_size;

  /* Enforced by unlink_changed(), but just to be sure. */
  assert(!block->next_changed && !block->prev_changed);
  /* Enforced by unlink_block(): not in LRU ring nor in free_block_list. */
  assert(!block->next_used && !block->prev_used);
  /* Insert the free block in the free list (stack: push on head). */
  block->next_used= keycache->free_block_list;
  keycache->free_block_list= block;
  /* Keep track of the number of currently unused blocks. */
  keycache->blocks_unused++;

  /* All pending requests for this page must be resubmitted. */
  release_whole_queue(&block->wqueue[COND_FOR_SAVED]);
}
3216 
3217 
/*
  qsort comparator: order blocks by the disk position of their pages.
  Used to sort a write burst so blocks are flushed with minimal seeks.
*/
static int cmp_sec_link(BLOCK_LINK **a, BLOCK_LINK **b)
{
  my_off_t pos_a= (*a)->hash_link->diskpos;
  my_off_t pos_b= (*b)->hash_link->diskpos;

  if (pos_a < pos_b)
    return -1;
  if (pos_a > pos_b)
    return 1;
  return 0;
}
3223 
3224 
3225 /*
3226   Flush a portion of changed blocks to disk,
3227   free used blocks if requested
3228 */
3229 
static int flush_cached_blocks(KEY_CACHE *keycache,
                               st_keycache_thread_var *thread_var,
                               File file, BLOCK_LINK **cache,
                               BLOCK_LINK **end,
                               enum flush_type type)
{
  int error;
  int last_errno= 0;
  /* Number of blocks in the [cache, end) write burst array. */
  uint count= (uint) (end-cache);

  /* Don't lock the cache during the flush */
  mysql_mutex_unlock(&keycache->cache_lock);
  /*
     As all blocks referred in 'cache' are marked by BLOCK_IN_FLUSH
     we are guaranteed no thread will change them
  */
  my_qsort((uchar*) cache, count, sizeof(*cache), (qsort_cmp) cmp_sec_link);

  mysql_mutex_lock(&keycache->cache_lock);
  /*
    Note: Do not break the loop. We have registered a request on every
    block in 'cache'. These must be unregistered by free_block() or
    unreg_request().
  */
  for ( ; cache != end ; cache++)
  {
    BLOCK_LINK *block= *cache;

    /*
      If the block contents is going to be changed, we abandon the flush
      for this block. flush_key_blocks_int() will restart its search and
      handle the block properly.
    */
    if (!(block->status & BLOCK_FOR_UPDATE))
    {
      /* Blocks coming here must have a certain status. */
      assert(block->hash_link);
      assert(block->hash_link->block == block);
      assert(block->hash_link->file == file);
      assert((block->status & ~BLOCK_IN_EVICTION) ==
             (BLOCK_READ | BLOCK_IN_FLUSH | BLOCK_CHANGED | BLOCK_IN_USE));
      block->status|= BLOCK_IN_FLUSHWRITE;
      /* Write the dirty part of the block without holding the lock. */
      mysql_mutex_unlock(&keycache->cache_lock);
      error= (int)my_pwrite(file, block->buffer+block->offset,
                            block->length - block->offset,
                            block->hash_link->diskpos+ block->offset,
                            MYF(MY_NABP | MY_WAIT_IF_FULL));
      mysql_mutex_lock(&keycache->cache_lock);
      keycache->global_cache_write++;
      if (error)
      {
        /* Mark the block failed; remember the first error for the caller. */
        block->status|= BLOCK_ERROR;
        if (!last_errno)
          last_errno= errno ? errno : -1;
      }
      block->status&= ~BLOCK_IN_FLUSHWRITE;
      /* Block must not have changed status except BLOCK_FOR_UPDATE. */
      assert(block->hash_link);
      assert(block->hash_link->block == block);
      assert(block->hash_link->file == file);
      assert((block->status & ~(BLOCK_FOR_UPDATE | BLOCK_IN_EVICTION)) ==
             (BLOCK_READ | BLOCK_IN_FLUSH | BLOCK_CHANGED | BLOCK_IN_USE));
      /*
        Set correct status and link in right queue for free or later use.
        free_block() must not see BLOCK_CHANGED and it may need to wait
        for readers of the block. These should not see the block in the
        wrong hash. If not freeing the block, we need to have it in the
        right queue anyway.
      */
      link_to_file_list(keycache, block, file, 1);
    }
    block->status&= ~BLOCK_IN_FLUSH;
    /*
      Let to proceed for possible waiting requests to write to the block page.
      It might happen only during an operation to resize the key cache.
    */
    release_whole_queue(&block->wqueue[COND_FOR_SAVED]);
    /* type will never be FLUSH_IGNORE_CHANGED here */
    if (!(type == FLUSH_KEEP || type == FLUSH_FORCE_WRITE) &&
        !(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH |
                           BLOCK_FOR_UPDATE)))
    {
      /*
        Note that a request has been registered against the block in
        flush_key_blocks_int().
      */
      free_block(keycache, thread_var, block);
    }
    else
    {
      /*
        Link the block into the LRU ring if it's the last submitted
        request for the block. This enables eviction for the block.
        Note that a request has been registered against the block in
        flush_key_blocks_int().
      */
      unreg_request(keycache, block, 1);
    }

  } /* end of for ( ; cache != end ; cache++) */
  return last_errno;
}
3332 
3333 
3334 /*
3335   Flush all key blocks for a file to disk, but don't do any mutex locks.
3336 
3337   SYNOPSIS
3338     flush_key_blocks_int()
3339       keycache            pointer to a key cache data structure
3340       thread_var          pointer to thread specific variables
3341       file                handler for the file to flush to
3342       flush_type          type of the flush
3343 
3344   NOTES
3345     This function doesn't do any mutex locks because it needs to be called both
3346     from flush_key_blocks and flush_all_key_blocks (the later one does the
3347     mutex lock in the resize_key_cache() function).
3348 
3349     We do only care about changed blocks that exist when the function is
3350     entered. We do not guarantee that all changed blocks of the file are
3351     flushed if more blocks change while this function is running.
3352 
3353   RETURN
3354     0   ok
3355     1  error
3356 */
3357 
static int flush_key_blocks_int(KEY_CACHE *keycache,
                                st_keycache_thread_var *thread_var,
                                File file, enum flush_type type)
{
  BLOCK_LINK *cache_buff[FLUSH_CACHE],**cache;
  int last_errno= 0;
  int last_errcnt= 0;
  DBUG_ENTER("flush_key_blocks_int");
  DBUG_PRINT("enter",("file: %d  blocks_used: %lu  blocks_changed: %lu",
              file, keycache->blocks_used, keycache->blocks_changed));

  cache= cache_buff;
  if (keycache->disk_blocks > 0)
  {
    /* Key cache exists and flush is not disabled */
    int error= 0;
    uint count= FLUSH_CACHE;
    BLOCK_LINK **pos,**end;
    BLOCK_LINK *first_in_switch= NULL;
    BLOCK_LINK *last_in_flush;
    BLOCK_LINK *last_for_update;
    BLOCK_LINK *block, *next;
#ifndef NDEBUG
    /*
      NOTE(review): debug-only scan counter. It is NOT reset at the
      'restart' label, so after several restarts the assert below is a
      weaker bound than it appears -- confirm intent against upstream
      before tightening it.
    */
    uint cnt=0;
#endif

    if (type != FLUSH_IGNORE_CHANGED)
    {
      /*
         Count how many key blocks we have to cache to be able
         to flush all dirty pages with minimum seek moves
      */
      count= 0;
      for (block= keycache->changed_blocks[FILE_HASH(file)] ;
           block ;
           block= block->next_changed)
      {
        if ((block->hash_link->file == file) &&
            !(block->status & BLOCK_IN_FLUSH))
        {
          count++;
          assert(count<= keycache->blocks_used);
        }
      }
      /*
        Allocate a new buffer only if its bigger than the one we have.
        Assure that we always have some entries for the case that new
        changed blocks appear while we need to wait for something.
      */
      if ((count > FLUSH_CACHE) &&
          !(cache= (BLOCK_LINK**) my_malloc(key_memory_KEY_CACHE,
                                            sizeof(BLOCK_LINK*)*count,
                                            MYF(0))))
        cache= cache_buff;
      /*
        After a restart there could be more changed blocks than now.
        So we should not let count become smaller than the fixed buffer.
      */
      if (cache == cache_buff)
        count= FLUSH_CACHE;
    }

    /* Retrieve the blocks and write them to a buffer to be flushed */
restart:
    last_in_flush= NULL;
    last_for_update= NULL;
    end= (pos= cache)+count;
    for (block= keycache->changed_blocks[FILE_HASH(file)] ;
         block ;
         block= next)
    {
#ifndef NDEBUG
      cnt++;
      assert(cnt <= keycache->blocks_used);
#endif
      /* Remember the successor now; the chain may change under waits. */
      next= block->next_changed;
      if (block->hash_link->file == file)
      {
        if (!(block->status & (BLOCK_IN_FLUSH | BLOCK_FOR_UPDATE)))
        {
          /*
            Note: The special handling of BLOCK_IN_SWITCH is obsolete
            since we set BLOCK_IN_FLUSH if the eviction includes a
            flush. It can be removed in a later version.
          */
          if (!(block->status & BLOCK_IN_SWITCH))
          {
            /*
              We care only for the blocks for which flushing was not
              initiated by another thread and which are not in eviction.
              Registering a request on the block unlinks it from the LRU
              ring and protects against eviction.
            */
            reg_requests(keycache, block, 1);
            if (type != FLUSH_IGNORE_CHANGED)
            {
              /* It's not a temporary file */
              if (pos == end)
              {
                /*
                  This should happen relatively seldom. Remove the
                  request because we won't do anything with the block
                  but restart and pick it again in the next iteration.
                */
                unreg_request(keycache, block, 0);
                /*
                  This happens only if there is not enough
                  memory for the big block
                */
                if ((error= flush_cached_blocks(keycache, thread_var, file,
                                                cache, end, type)))
                {
                  /* Do not loop infinitely trying to flush in vain. */
                  if ((last_errno == error) && (++last_errcnt > 5))
                    goto err;
                  last_errno= error;
                }
                /*
                  Restart the scan as some other thread might have changed
                  the changed blocks chain: the blocks that were in switch
                  state before the flush started have to be excluded
                */
                goto restart;
              }
              /*
                Mark the block with BLOCK_IN_FLUSH in order not to let
                other threads to use it for new pages and interfere with
                our sequence of flushing dirty file pages. We must not
                set this flag before actually putting the block on the
                write burst array called 'cache'.
              */
              block->status|= BLOCK_IN_FLUSH;
              /* Add block to the array for a write burst. */
              *pos++= block;
            }
            else
            {
              /* It's a temporary file */
              assert(!(block->status & BLOCK_REASSIGNED));
              /*
                free_block() must not be called with BLOCK_CHANGED. Note
                that we must not change the BLOCK_CHANGED flag outside of
                link_to_file_list() so that it is always in the correct
                queue and the *blocks_changed counters are correct.
              */
              link_to_file_list(keycache, block, file, 1);
              if (!(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH)))
              {
                /* A request has been registered against the block above. */
                free_block(keycache, thread_var, block);
              }
              else
              {
                /*
                  Link the block into the LRU ring if it's the last
                  submitted request for the block. This enables eviction
                  for the block. A request has been registered against
                  the block above.
                */
                unreg_request(keycache, block, 1);
              }
            }
          }
          else
          {
            /*
              Link the block into a list of blocks 'in switch'.

              WARNING: Here we introduce a place where a changed block
              is not in the changed_blocks hash! This is acceptable for
              a BLOCK_IN_SWITCH. Never try this for another situation.
              Other parts of the key cache code rely on changed blocks
              being in the changed_blocks hash.
            */
            unlink_changed(block);
            link_changed(block, &first_in_switch);
          }
        }
        else if (type != FLUSH_KEEP)
        {
          /*
            During the normal flush at end of statement (FLUSH_KEEP) we
            do not need to ensure that blocks in flush or update by
            other threads are flushed. They will be flushed by them
            later. In all other cases we must assure that we do not have
            any changed block of this file in the cache when this
            function returns.
          */
          if (block->status & BLOCK_IN_FLUSH)
          {
            /* Remember the last block found to be in flush. */
            last_in_flush= block;
          }
          else
          {
            /* Remember the last block found to be selected for update. */
            last_for_update= block;
          }
        }
      }
    }
    if (pos != cache)
    {
      /* Flush the collected write burst. */
      if ((error=
           flush_cached_blocks(keycache, thread_var, file, cache, pos, type)))
      {
        /* Do not loop infinitely trying to flush in vain. */
        if ((last_errno == error) && (++last_errcnt > 5))
          goto err;
        last_errno= error;
      }
      /*
        Do not restart here during the normal flush at end of statement
        (FLUSH_KEEP). We have now flushed at least all blocks that were
        changed when entering this function. In all other cases we must
        assure that we do not have any changed block of this file in the
        cache when this function returns.
      */
      if (type != FLUSH_KEEP)
        goto restart;
    }
    if (last_in_flush)
    {
      /*
        There are no blocks to be flushed by this thread, but blocks in
        flush by other threads. Wait until one of the blocks is flushed.
        Re-check the condition for last_in_flush. We may have unlocked
        the cache_lock in flush_cached_blocks(). The state of the block
        could have changed.
      */
      if (last_in_flush->status & BLOCK_IN_FLUSH)
        wait_on_queue(&last_in_flush->wqueue[COND_FOR_SAVED],
                      &keycache->cache_lock, thread_var);
      /* Be sure not to lose a block. They may be flushed in random order. */
      goto restart;
    }
    if (last_for_update)
    {
      /*
        There are no blocks to be flushed by this thread, but blocks for
        update by other threads. Wait until one of the blocks is updated.
        Re-check the condition for last_for_update. We may have unlocked
        the cache_lock in flush_cached_blocks(). The state of the block
        could have changed.
      */
      if (last_for_update->status & BLOCK_FOR_UPDATE)
        wait_on_queue(&last_for_update->wqueue[COND_FOR_REQUESTED],
                      &keycache->cache_lock, thread_var);
      /* The block is now changed. Flush it. */
      goto restart;
    }

    /*
      Wait until the list of blocks in switch is empty. The threads that
      are switching these blocks will relink them to clean file chains
      while we wait and thus empty the 'first_in_switch' chain.
    */
    while (first_in_switch)
    {
#ifndef NDEBUG
      cnt= 0;
#endif
      wait_on_queue(&first_in_switch->wqueue[COND_FOR_SAVED],
                    &keycache->cache_lock, thread_var);
#ifndef NDEBUG
      cnt++;
      assert(cnt <= keycache->blocks_used);
#endif
      /*
        Do not restart here. We have flushed all blocks that were
        changed when entering this function and were not marked for
        eviction. Other threads have now flushed all remaining blocks in
        the course of their eviction.
      */
    }

    if (! (type == FLUSH_KEEP || type == FLUSH_FORCE_WRITE))
    {
      BLOCK_LINK *last_for_update= NULL;
      BLOCK_LINK *last_in_switch= NULL;
      uint total_found= 0;
      uint found;

      /*
        Finally free all clean blocks for this file.
        During resize this may be run by two threads in parallel.
      */
      do
      {
        found= 0;
        for (block= keycache->file_blocks[FILE_HASH(file)] ;
             block ;
             block= next)
        {
          /* Remember the next block. After freeing we cannot get at it. */
          next= block->next_changed;

          /* Changed blocks cannot appear in the file_blocks hash. */
          assert(!(block->status & BLOCK_CHANGED));
          if (block->hash_link->file == file)
          {
            /* We must skip blocks that will be changed. */
            if (block->status & BLOCK_FOR_UPDATE)
            {
              last_for_update= block;
              continue;
            }

            /*
              We must not free blocks in eviction (BLOCK_IN_EVICTION |
              BLOCK_IN_SWITCH) or blocks intended to be freed
              (BLOCK_REASSIGNED).
            */
            if (!(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH |
                                   BLOCK_REASSIGNED)))
            {
              /* Snapshot of the 'next' block, to detect changes across a
                 potential wait inside free_block(). */
              struct st_hash_link *next_hash_link= NULL;
              my_off_t next_diskpos= 0;
              File next_file= 0;
              uint next_status= 0;
              uint hash_requests= 0;

              total_found++;
              found++;
              assert(found <= keycache->blocks_used);

              /*
                Register a request. This unlinks the block from the LRU
                ring and protects it against eviction. This is required
                by free_block().
              */
              reg_requests(keycache, block, 1);

              /*
                free_block() may need to wait for readers of the block.
                This is the moment where the other thread can move the
                'next' block from the chain. free_block() needs to wait
                if there are requests for the block pending.
              */
              if (next && (hash_requests= block->hash_link->requests))
              {
                /* Copy values from the 'next' block and its hash_link. */
                next_status=    next->status;
                next_hash_link= next->hash_link;
                next_diskpos=   next_hash_link->diskpos;
                next_file=      next_hash_link->file;
                assert(next == next_hash_link->block);
              }

              free_block(keycache, thread_var, block);
              /*
                If we had to wait and the state of the 'next' block
                changed, break the inner loop. 'next' may no longer be
                part of the current chain.

                We do not want to break the loop after every free_block(),
                not even only after waits. The chain might be quite long
                and contain blocks for many files. Traversing it again and
                again to find more blocks for this file could become quite
                inefficient.
              */
              if (next && hash_requests &&
                  ((next_status    != next->status) ||
                   (next_hash_link != next->hash_link) ||
                   (next_file      != next_hash_link->file) ||
                   (next_diskpos   != next_hash_link->diskpos) ||
                   (next           != next_hash_link->block)))
                break;
            }
            else
            {
              last_in_switch= block;
            }
          }
        } /* end for block in file_blocks */
      } while (found);

      /*
        If any clean block has been found, we may have waited for it to
        become free. In this case it could be possible that another clean
        block became dirty. This is possible if the write request existed
        before the flush started (BLOCK_FOR_UPDATE). Re-check the hashes.
      */
      if (total_found)
        goto restart;

      /*
        To avoid an infinite loop, wait until one of the blocks marked
        for update is updated.
      */
      if (last_for_update)
      {
        /* We did not wait. Block must not have changed status. */
        assert(last_for_update->status & BLOCK_FOR_UPDATE);
        wait_on_queue(&last_for_update->wqueue[COND_FOR_REQUESTED],
                      &keycache->cache_lock, thread_var);
        goto restart;
      }

      /*
        To avoid an infinite loop wait until one of the blocks marked
        for eviction is switched.
      */
      if (last_in_switch)
      {
        /* We did not wait. Block must not have changed status. */
        assert(last_in_switch->status & (BLOCK_IN_EVICTION |
                                         BLOCK_IN_SWITCH |
                                         BLOCK_REASSIGNED));
        wait_on_queue(&last_in_switch->wqueue[COND_FOR_SAVED],
                      &keycache->cache_lock, thread_var);
        goto restart;
      }

    } /* if (! (type == FLUSH_KEEP || type == FLUSH_FORCE_WRITE)) */

  } /* if (keycache->disk_blocks > 0 */

err:
  if (cache != cache_buff)
    my_free(cache);
  if (last_errno)
    errno=last_errno;                /* Return first error */
  DBUG_RETURN(last_errno != 0);
}
3783 
3784 
3785 /*
3786   Flush all blocks for a file to disk
3787 
3788   SYNOPSIS
3789 
3790     flush_key_blocks()
3791       keycache            pointer to a key cache data structure
3792       thread_var          pointer to thread specific variables
3793       file                handler for the file to flush to
3794       flush_type          type of the flush
3795 
3796   RETURN
3797     0   ok
3798     1  error
3799 */
3800 
/*
  Public entry point: flush all key blocks for one file.

  Takes the cache lock, protects the operation against a concurrent
  resize via the resize counter, and delegates the real work to
  flush_key_blocks_int(). Returns 0 on success, 1 on error.
*/
int flush_key_blocks(KEY_CACHE *keycache,
                     st_keycache_thread_var *thread_var,
                     File file, enum flush_type type)
{
  int result= 0;
  DBUG_ENTER("flush_key_blocks");
  DBUG_PRINT("enter", ("keycache: 0x%lx", (long) keycache));

  /* Nothing to do for a cache that was never initialized. */
  if (!keycache->key_cache_inited)
    DBUG_RETURN(0);

  mysql_mutex_lock(&keycache->cache_lock);
  /* While we waited for the lock, the key cache could have been ended. */
  if (keycache->disk_blocks > 0)
  {
    inc_counter_for_resize_op(keycache);
    result= flush_key_blocks_int(keycache, thread_var, file, type);
    dec_counter_for_resize_op(keycache);
  }
  mysql_mutex_unlock(&keycache->cache_lock);

  DBUG_RETURN(result);
}
3823 
3824 
3825 /*
3826   Flush all blocks in the key cache to disk.
3827 
3828   SYNOPSIS
3829     flush_all_key_blocks()
3830       keycache                  pointer to key cache root structure
3831       thread_var                pointer to thread specific variables
3832 
3833   DESCRIPTION
3834 
3835     Flushing of the whole key cache is done in two phases.
3836 
3837     1. Flush all changed blocks, waiting for them if necessary. Loop
3838     until there is no changed block left in the cache.
3839 
3840     2. Free all clean blocks. Normally this means free all blocks. The
3841     changed blocks were flushed in phase 1 and became clean. However we
3842     may need to wait for blocks that are read by other threads. While we
3843     wait, a clean block could become changed if that operation started
3844     before the resize operation started. To be safe we must restart at
3845     phase 1.
3846 
3847     When we can run through the changed_blocks and file_blocks hashes
3848     without finding a block any more, then we are done.
3849 
3850     Note that we hold keycache->cache_lock all the time unless we need
3851     to wait for something.
3852 
3853   RETURN
3854     0           OK
3855     != 0        Error
3856 */
3857 
static int flush_all_key_blocks(KEY_CACHE *keycache,
                                st_keycache_thread_var *thread_var)
{
  BLOCK_LINK *block;
  uint n_freed;   /* blocks found in phase 2; non-zero forces another pass */
  uint n_found;   /* blocks found in the current sweep of a hash array */
  uint i;
  DBUG_ENTER("flush_all_key_blocks");

  do
  {
    /* The cache lock must be held except while a wait releases it. */
    mysql_mutex_assert_owner(&keycache->cache_lock);
    n_freed= 0;

    /*
      Phase 1: write out every changed block, waiting where necessary,
      until a full sweep of the changed_blocks hash finds nothing.
      Each iteration picks the first block of a non-empty bucket and
      flushes all changed blocks of that block's file, so that bucket
      entry cannot reappear for the same file. New writes are not
      accepted during the flush; if several files share a bucket, one
      file is handled per sweep.
    */
    do
    {
      n_found= 0;
      for (i= 0; i < CHANGED_BLOCKS_HASH; i++)
      {
        block= keycache->changed_blocks[i];
        if (!block)
          continue;
        n_found++;
        /*
          Write dirty blocks but keep them in the cache; they stay
          readable until every other block has been flushed too.
        */
        if (flush_key_blocks_int(keycache, thread_var,
                                 block->hash_link->file,
                                 FLUSH_FORCE_WRITE))
          DBUG_RETURN(1);
      }
    } while (n_found);

    /*
      Phase 2: release every clean block. After phase 1 this is normally
      all blocks, but waiting for blocks still read by other threads may
      release the lock; a clean block can turn dirty again if its write
      request predates the resize. Finding anything here therefore
      forces a restart at phase 1.
    */
    do
    {
      n_found= 0;
      for (i= 0; i < CHANGED_BLOCKS_HASH; i++)
      {
        block= keycache->file_blocks[i];
        if (!block)
          continue;
        n_freed++;
        n_found++;
        if (flush_key_blocks_int(keycache, thread_var,
                                 block->hash_link->file,
                                 FLUSH_RELEASE))
          DBUG_RETURN(1);
      }
    } while (n_found);

    /*
      If phase 2 freed anything we may have waited, and another clean
      block could have become dirty meanwhile (BLOCK_FOR_UPDATE), so
      re-check both hashes from the top.
    */
  } while (n_freed);

#ifndef NDEBUG
  /* Both hash arrays must be completely empty now. */
  for (i= 0; i < CHANGED_BLOCKS_HASH; i++)
  {
    assert(!keycache->changed_blocks[i]);
    assert(!keycache->file_blocks[i]);
  }
#endif

  DBUG_RETURN(0);
}
3961 
3962 
3963 /*
3964   Reset the counters of a key cache.
3965 
3966   SYNOPSIS
3967     reset_key_cache_counters()
3968     name       the name of a key cache
    key_cache  pointer to the key cache to be reset
3970 
3971   DESCRIPTION
3972    This procedure is used by process_key_caches() to reset the counters of all
3973    currently used key caches, both the default one and the named ones.
3974 
3975   RETURN
3976     0 on success (always because it can't fail)
3977 */
3978 
int reset_key_cache_counters(const char *name MY_ATTRIBUTE((unused)),
                             KEY_CACHE *key_cache)
{
  DBUG_ENTER("reset_key_cache_counters");
  if (key_cache->key_cache_inited)
  {
    DBUG_PRINT("info", ("Resetting counters for key cache %s.", name));
    /* Zero the statistics exported as server status variables. */
    key_cache->global_blocks_changed= 0;   /* Key_blocks_not_flushed */
    key_cache->global_cache_r_requests= 0; /* Key_read_requests */
    key_cache->global_cache_read= 0;       /* Key_reads */
    key_cache->global_cache_w_requests= 0; /* Key_write_requests */
    key_cache->global_cache_write= 0;      /* Key_writes */
  }
  else
  {
    /* An uninitialized cache has no counters worth resetting. */
    DBUG_PRINT("info", ("Key cache %s not initialized.", name));
  }
  /* Always succeeds. */
  DBUG_RETURN(0);
}
3997 
3998 
3999 #if !defined(NDEBUG)
4000 #define F_B_PRT(_f_, _v_) DBUG_PRINT("assert_fail", (_f_, _v_))
4001 
/*
  Dump the state of a block before a consistency assert fires.
  Debug builds only. Always returns 0 so it can be used inside an
  assert() expression and let the assert fail after printing.
*/
static int fail_block(BLOCK_LINK *block)
{
  /*
    Print pointers with %p: the previous (ulong)/%lx pair truncates
    pointers on LLP64 platforms (64-bit Windows), where long is 32 bits.
  */
  F_B_PRT("block->next_used:    %p\n", (void*) block->next_used);
  F_B_PRT("block->prev_used:    %p\n", (void*) block->prev_used);
  F_B_PRT("block->next_changed: %p\n", (void*) block->next_changed);
  F_B_PRT("block->prev_changed: %p\n", (void*) block->prev_changed);
  F_B_PRT("block->hash_link:    %p\n", (void*) block->hash_link);
  F_B_PRT("block->status:       %u\n", block->status);
  F_B_PRT("block->length:       %u\n", block->length);
  F_B_PRT("block->offset:       %u\n", block->offset);
  F_B_PRT("block->requests:     %u\n", block->requests);
  F_B_PRT("block->temperature:  %u\n", block->temperature);
  return 0; /* Let the assert fail. */
}
4016 
/*
  Dump the state of a hash link before a consistency assert fires.
  Debug builds only. Always returns 0 so it can be used inside an
  assert() expression and let the assert fail after printing.
*/
static int fail_hlink(HASH_LINK *hlink)
{
  /*
    Print pointers with %p: the previous (ulong)/%lx pair truncates
    pointers on LLP64 platforms (64-bit Windows), where long is 32 bits.
  */
  F_B_PRT("hlink->next:    %p\n", (void*) hlink->next);
  F_B_PRT("hlink->prev:    %p\n", (void*) hlink->prev);
  F_B_PRT("hlink->block:   %p\n", (void*) hlink->block);
  /* NOTE(review): (ulong) may truncate diskpos for files > 4GB where
     long is 32 bits; debug output only, so left as-is — confirm. */
  F_B_PRT("hlink->diskpos: %lu\n", (ulong) hlink->diskpos);
  F_B_PRT("hlink->file:    %d\n", hlink->file);
  return 0; /* Let the assert fail. */
}
4026 
/*
  Check that a key cache contains no used blocks or hash links.
  Debug builds only; used in asserts after a full flush/resize.

  RETURN
    1   cache is empty (or was never populated: disk_blocks <= 0)
    0   leftover blocks or hash links were found (details logged)
*/
static int cache_empty(KEY_CACHE *keycache)
{
  int errcnt= 0;
  int idx;
  if (keycache->disk_blocks <= 0)
    return 1;
  for (idx= 0; idx < keycache->disk_blocks; idx++)
  {
    BLOCK_LINK *block= keycache->block_root + idx;
    if (block->status || block->requests || block->hash_link)
    {
      /* %d matches the signed idx (the old %u was a specifier mismatch). */
      my_message_local(INFORMATION_LEVEL, "block index: %d", idx);
      fail_block(block);
      errcnt++;
    }
  }
  for (idx= 0; idx < keycache->hash_links; idx++)
  {
    HASH_LINK *hash_link= keycache->hash_link_root + idx;
    if (hash_link->requests || hash_link->block)
    {
      my_message_local(INFORMATION_LEVEL, "hash_link index: %d", idx);
      fail_hlink(hash_link);
      errcnt++;
    }
  }
  if (errcnt)
  {
    /* Summarize totals so the log shows how much was left behind. */
    my_message_local(INFORMATION_LEVEL, "blocks: %d  used: %lu",
                     keycache->disk_blocks, keycache->blocks_used);
    my_message_local(INFORMATION_LEVEL, "hash_links: %d  used: %d",
                     keycache->hash_links, keycache->hash_links_used);
  }
  return !errcnt;
}
4062 #endif
4063 
4064