1 /* Copyright (c) 2000, 2013, Oracle and/or its affiliates.
2    Copyright (c) 2017, MariaDB Corporation.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; version 2 of the License.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335  USA */
16 
17 
18 
19 /**
20   @file
21   The file contains the following modules:
22 
23     Simple Key Cache Module
24 
25     Partitioned Key Cache Module
26 
27     Key Cache Interface Module
28 
29 */
30 
31 #include "mysys_priv.h"
32 #include "mysys_err.h"
33 #include <keycache.h>
34 #include "my_static.h"
35 #include <m_string.h>
36 #include <my_bit.h>
37 #include <errno.h>
38 #include <stdarg.h>
39 #include "probes_mysql.h"
40 
41 /******************************************************************************
42   Simple Key Cache Module
43 
44   The module contains implementations of all key cache interface functions
45   employed by partitioned key caches.
46 
47 ******************************************************************************/
48 
49 /*
50   These functions handle key block caching for ISAM and MyISAM tables.
51 
52   One cache can handle many files.
53   It must contain buffers of the same blocksize.
54 
55   init_key_cache() should be used to init cache handler.
56 
57   The free list (free_block_list) is a stack like structure.
58   When a block is freed by free_block(), it is pushed onto the stack.
59   When a new block is required, we first try to pop one from the stack.
60   If the stack is empty, we try to get a never-used block from the pool.
61   If this is empty too, then a block is taken from the LRU ring, flushing it
62   to disk, if necessary. This is handled in find_key_block().
63   With the new free list, the blocks can have three temperatures:
64   hot, warm and cold (which is free). This is remembered in the block header
65   by the enum BLOCK_TEMPERATURE temperature variable. Remembering the
66   temperature is necessary to correctly count the number of warm blocks,
67   which is required to decide when blocks are allowed to become hot. Whenever
68   a block is inserted to another (sub-)chain, we take the old and new
69   temperature into account to decide if we got one more or less warm block.
70   blocks_unused is the sum of never used blocks in the pool and of currently
71   free blocks. blocks_used is the number of blocks fetched from the pool and
72   as such gives the maximum number of in-use blocks at any time.
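
  As an illustration only (get_any_block() is a hypothetical helper, not
  part of this file; the real logic lives in find_key_block()), the
  fallback order described above can be sketched as:

    static BLOCK_LINK *get_any_block(SIMPLE_KEY_CACHE_CB *keycache)
    {
      BLOCK_LINK *block;
      if ((block= keycache->free_block_list))
        keycache->free_block_list= block->next_used;    // 1. free list
      else if (keycache->blocks_unused)
        block= &keycache->block_root[keycache->blocks_used];  // 2. pool
      else
        block= keycache->used_last->next_used;  // 3. LRU ring (flush if dirty)
      return block;
    }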
73 
74   Key Cache Locking
75   =================
76 
77   All key cache locking is done with a single mutex per key cache:
78   keycache->cache_lock. This mutex is locked almost all the time
79   when executing code in this file (mf_keycache.c).
80   However it is released for I/O and some copy operations.
81 
82   The cache_lock is also released when waiting for some event. Waiting
83   and signalling is done via condition variables. In most cases the
84   thread waits on its thread->suspend condition variable. Every thread
85   has a my_thread_var structure, which contains this variable and a
86   '*next' and '**prev' pointer. These pointers are used to insert the
87   thread into a wait queue.
88 
89   A thread can wait for one block and thus be in one wait queue at a
90   time only.
91 
92   Before starting to wait on its condition variable with
93   mysql_cond_wait(), the thread enters itself to a specific wait queue
94   with link_into_queue() (double linked with '*next' + '**prev') or
95   wait_on_queue() (single linked with '*next').
96 
97   Another thread, when releasing a resource, looks up the waiting thread
98   in the related wait queue. It sends a signal with
99   mysql_cond_signal() to the waiting thread.
100 
101   NOTE: Depending on the particular wait situation, either the sending
102   thread removes the waiting thread from the wait queue with
103   unlink_from_queue() or release_whole_queue() respectively, or the waiting
104   thread removes itself.
105 
106   There is one exception from this locking scheme when one thread wants
107   to reuse a block for some other address. This works by first marking
108   the block reserved (status= BLOCK_IN_SWITCH) and then waiting for all
109   threads that are reading the block to finish. Each block has a
110   reference to a condition variable (condvar). It holds a reference to
111   the thread->suspend condition variable for the waiting thread (if such
112   a thread exists). When that thread is signaled, the reference is
113   cleared. The number of readers of a block is registered in
114   block->hash_link->requests. See wait_for_readers() / remove_reader()
115   for details. This is similar to the above, but it clearly means that
116   only one thread can wait for a particular block. There is no queue in
117   this case. Strangely enough block->condvar is used for waiting for the
118   assigned hash_link only. More precisely it is used to wait for all
119   requests to be unregistered from the assigned hash_link.
120 
121   The resize_queue serves two purposes:
122   1. Threads that want to do a resize wait there if in_resize is set.
123      This is not used in the server. The server refuses a second resize
124      request if one is already active. keycache->in_init is used for the
125      synchronization. See set_var.cc.
126   2. Threads that want to access blocks during resize wait here during
127      the re-initialization phase.
128   When the resize is done, all threads on the queue are signalled.
129   Hypothetical resizers can compete for resizing, and read/write
130   requests will restart to request blocks from the freshly resized
131   cache. If the cache has been resized too small, it is disabled and
132   'can_be_used' is false. In this case read/write requests bypass the
133   cache. Since they increment and decrement 'cnt_for_resize_op', the
134   next resizer can wait on the queue 'waiting_for_resize_cnt' until all
135   I/O finished.
136 */
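/*
  Illustrative sketch only (not a function in this file): the caller-side
  pattern implied by the locking scheme above. A thread takes cache_lock and,
  whenever it must wait for an event, parks itself on the relevant wait
  queue in a loop; wait_on_queue() releases cache_lock while waiting and
  re-acquires it before returning. The names some_queue and
  event_has_happened() are placeholders.

    keycache_pthread_mutex_lock(&keycache->cache_lock);
    while (!event_has_happened(keycache))
      wait_on_queue(&keycache->some_queue, &keycache->cache_lock);
    ... operate on the cache, still holding cache_lock ...
    keycache_pthread_mutex_unlock(&keycache->cache_lock);
*/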
137 
138 /* declare structures that are used by st_key_cache */
139 
140 struct st_block_link;
141 typedef struct st_block_link BLOCK_LINK;
142 struct st_keycache_page;
143 typedef struct st_keycache_page KEYCACHE_PAGE;
144 struct st_hash_link;
145 typedef struct st_hash_link HASH_LINK;
146 
147 /* info about requests in a waiting queue */
148 typedef struct st_keycache_wqueue
149 {
150   struct st_my_thread_var *last_thread;  /* circular list of waiting threads */
151 } KEYCACHE_WQUEUE;
152 
153 /* Default size of hash for changed files */
154 #define MIN_CHANGED_BLOCKS_HASH_SIZE 128
155 
156 /* Control block for a simple (non-partitioned) key cache */
157 
158 typedef struct st_simple_key_cache_cb
159 {
160   my_bool key_cache_inited;      /* <=> control block is allocated           */
161   my_bool in_resize;             /* true during resize operation             */
162   my_bool resize_in_flush;       /* true during flush of resize operation    */
163   my_bool can_be_used;           /* usage of cache for read/write is allowed */
164   size_t key_cache_mem_size;     /* specified size of the cache memory       */
165   uint key_cache_block_size;     /* size of the page buffer of a cache block */
166   ulong min_warm_blocks;         /* min number of warm blocks;               */
167   ulong age_threshold;           /* age threshold for hot blocks             */
168   ulonglong keycache_time;       /* total number of block link operations    */
169   uint hash_entries;             /* max number of entries in the hash table  */
170   uint changed_blocks_hash_size;	 /* Number of hash buckets for file blocks   */
171   int hash_links;                /* max number of hash links                 */
172   int hash_links_used;           /* number of hash links currently used      */
173   int disk_blocks;               /* max number of blocks in the cache        */
174   ulong blocks_used;           /* maximum number of concurrently used blocks */
175   ulong blocks_unused;           /* number of currently unused blocks        */
176   ulong blocks_changed;          /* number of currently dirty blocks         */
177   ulong warm_blocks;             /* number of blocks in warm sub-chain       */
178   ulong cnt_for_resize_op;       /* counter to block resize operation        */
179   long blocks_available;      /* number of blocks available in the LRU chain */
180   HASH_LINK **hash_root;         /* arr. of entries into hash table buckets  */
181   HASH_LINK *hash_link_root;     /* memory for hash table links              */
182   HASH_LINK *free_hash_list;     /* list of free hash links                  */
183   BLOCK_LINK *free_block_list;   /* list of free blocks                      */
184   BLOCK_LINK *block_root;        /* memory for block links                   */
185   uchar *block_mem;              /* memory for block buffers                 */
186   BLOCK_LINK *used_last;         /* ptr to the last block of the LRU chain   */
187   BLOCK_LINK *used_ins;          /* ptr to the insertion block in LRU chain  */
188   mysql_mutex_t cache_lock;      /* to lock access to the cache structure    */
189   KEYCACHE_WQUEUE resize_queue;  /* threads waiting during resize operation  */
190   /*
191     Waiting for a zero resize count. Using a queue for symmetry though
192     only one thread can wait here.
193   */
194   KEYCACHE_WQUEUE waiting_for_resize_cnt;
195   KEYCACHE_WQUEUE waiting_for_hash_link; /* waiting for a free hash link     */
196   KEYCACHE_WQUEUE waiting_for_block;    /* requests waiting for a free block */
197   BLOCK_LINK **changed_blocks; 		/* hash for dirty file bl.*/
198   BLOCK_LINK **file_blocks;  		/* hash for other file bl.*/
199 
200   /* Statistics variables. These are reset in reset_key_cache_counters(). */
201   ulong global_blocks_changed;      /* number of currently dirty blocks      */
202   ulonglong global_cache_w_requests;/* number of write requests (write hits) */
203   ulonglong global_cache_write;     /* number of writes from cache to files  */
204   ulonglong global_cache_r_requests;/* number of read requests (read hits)   */
205   ulonglong global_cache_read;      /* number of reads from files to cache   */
206 
207   int blocks;                   /* max number of blocks in the cache        */
208   uint hash_factor;             /* factor used to calculate hash function   */
209   my_bool in_init;		/* Set to 1 in MySQL during init/resize     */
210 } SIMPLE_KEY_CACHE_CB;
211 
212 /*
213   Some compilation flags have been added specifically for this module
214   to control the following:
215   - not to let a thread yield control when reading directly
216     from the key cache, which might improve performance in many cases;
217     to enable this add:
218     #define SERIALIZED_READ_FROM_CACHE
219   - to set an upper bound for number of threads simultaneously
220     using the key cache; this setting helps to determine an optimal
221     size for hash table and improve performance when the number of
222     blocks in the key cache is much less than the number of threads
223     accessing it;
224     to set this number equal to <N> add
225       #define MAX_THREADS <N>
226   - to replace calls of mysql_cond_wait with calls of
227     mysql_cond_timedwait (wait with a timeout set up);
228     this setting should be used only when you want to trap a deadlock
229     situation, which theoretically should not happen;
230     to set timeout equal to <T> seconds add
231       #define KEYCACHE_TIMEOUT <T>
232   - to enable the module traps and to send debug information from
233     key cache module to a special debug log add:
234       #define KEYCACHE_DEBUG
235     the name of this debug log file <LOG NAME> can be set through:
236       #define KEYCACHE_DEBUG_LOG  <LOG NAME>
237     if the name is not defined, it's set by default;
238     if the KEYCACHE_DEBUG flag is not set up and we are in a debug
239     mode, i.e. when ! defined(DBUG_OFF), the debug information from the
240     module is sent to the regular debug log.
241 
242   Example of the settings:
243     #define SERIALIZED_READ_FROM_CACHE
244     #define MAX_THREADS   100
245     #define KEYCACHE_TIMEOUT  1
246     #define KEYCACHE_DEBUG
247     #define KEYCACHE_DEBUG_LOG  "my_key_cache_debug.log"
248 */
249 
250 #define STRUCT_PTR(TYPE, MEMBER, a)                                           \
251           (TYPE *) ((char *) (a) - offsetof(TYPE, MEMBER))
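/*
  STRUCT_PTR() is the usual "container_of" idiom: given the address of a
  member, it recovers the address of the enclosing structure. For example,
  unlink_from_queue() below uses
    STRUCT_PTR(struct st_my_thread_var, next, thread->prev)
  to obtain the thread whose 'next' field is pointed to by thread->prev.
*/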
252 
253 /* types of condition variables */
254 #define  COND_FOR_REQUESTED 0
255 #define  COND_FOR_SAVED     1
256 #define  COND_FOR_READERS   2
257 
258 typedef mysql_cond_t KEYCACHE_CONDVAR;
259 
260 /* descriptor of the page in the key cache block buffer */
261 struct st_keycache_page
262 {
263   int file;               /* file to which the page belongs     */
264   my_off_t filepos;       /* position of the page in the file   */
265 };
266 
267 /* element in the chain of a hash table bucket */
268 struct st_hash_link
269 {
270   struct st_hash_link *next, **prev; /* to connect links in the same bucket  */
271   struct st_block_link *block;       /* reference to the block for the page: */
272   File file;                         /* from such a file                     */
273   my_off_t diskpos;                  /* with such an offset                  */
274   uint requests;                     /* number of requests for the page      */
275 };
276 
277 /* simple states of a block */
278 #define BLOCK_ERROR           1U/* an error occurred when performing file i/o */
279 #define BLOCK_READ            2U/* file block is in the block buffer         */
280 #define BLOCK_IN_SWITCH       4U/* block is preparing to read new page       */
281 #define BLOCK_REASSIGNED      8U/* blk does not accept requests for old page */
282 #define BLOCK_IN_FLUSH       16U/* block is selected for flush               */
283 #define BLOCK_CHANGED        32U/* block buffer contains a dirty page        */
284 #define BLOCK_IN_USE         64U/* block is not free                         */
285 #define BLOCK_IN_EVICTION   128U/* block is selected for eviction            */
286 #define BLOCK_IN_FLUSHWRITE 256U/* block is in write to file                 */
287 #define BLOCK_FOR_UPDATE    512U/* block is selected for buffer modification */
288 
289 /* page status, returned by find_key_block */
290 #define PAGE_READ               0
291 #define PAGE_TO_BE_READ         1
292 #define PAGE_WAIT_TO_BE_READ    2
293 
294 /* block temperature determines in which (sub-)chain the block currently is */
295 enum BLOCK_TEMPERATURE { BLOCK_COLD /*free*/ , BLOCK_WARM , BLOCK_HOT };
296 
297 /* key cache block */
298 struct st_block_link
299 {
300   struct st_block_link
301     *next_used, **prev_used;   /* to connect links in the LRU chain (ring)   */
302   struct st_block_link
303     *next_changed, **prev_changed; /* for lists of file dirty/clean blocks   */
304   struct st_hash_link *hash_link; /* backward ptr to referring hash_link     */
305   KEYCACHE_WQUEUE wqueue[2]; /* queues on waiting requests for new/old pages */
306   uint requests;          /* number of requests for the block                */
307   uchar *buffer;           /* buffer for the block page                       */
308   uint offset;            /* beginning of modified data in the buffer        */
309   uint length;            /* end of data in the buffer                       */
310   uint status;            /* state of the block                              */
311   enum BLOCK_TEMPERATURE temperature; /* block temperature: cold, warm, hot */
312   uint hits_left;         /* number of hits left until promotion             */
313   ulonglong last_hit_time; /* timestamp of the last hit                      */
314   KEYCACHE_CONDVAR *condvar; /* condition variable for 'no readers' event    */
315 };
316 
317 KEY_CACHE dflt_key_cache_var;
318 KEY_CACHE *dflt_key_cache= &dflt_key_cache_var;
319 
320 #define FLUSH_CACHE         2000            /* sort this many blocks at once */
321 
322 static int flush_all_key_blocks(SIMPLE_KEY_CACHE_CB *keycache);
323 static void end_simple_key_cache(SIMPLE_KEY_CACHE_CB *keycache, my_bool cleanup);
324 static void wait_on_queue(KEYCACHE_WQUEUE *wqueue,
325                           mysql_mutex_t *mutex);
326 static void release_whole_queue(KEYCACHE_WQUEUE *wqueue);
327 static void free_block(SIMPLE_KEY_CACHE_CB *keycache, BLOCK_LINK *block);
328 #ifndef DBUG_OFF
329 static void test_key_cache(SIMPLE_KEY_CACHE_CB *keycache,
330                            const char *where, my_bool lock);
331 #endif
332 #define KEYCACHE_BASE_EXPR(f, pos)                                            \
333   ((ulong) ((pos) / keycache->key_cache_block_size) +	 (ulong) (f))
334 #define KEYCACHE_HASH(f, pos)                                                 \
335   ((KEYCACHE_BASE_EXPR(f, pos) / keycache->hash_factor) &                     \
336       (keycache->hash_entries-1))
337 #define FILE_HASH(f, cache)  ((uint) (f) & (cache->changed_blocks_hash_size-1))
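/*
  Worked example with illustrative values: for key_cache_block_size= 1024,
  hash_factor= 1 and hash_entries= 512, the page of file descriptor 7 at
  offset 16384 maps to bucket
    KEYCACHE_HASH(7, 16384) == ((16384 / 1024 + 7) / 1) & (512 - 1) == 23
*/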
338 
339 #define DEFAULT_KEYCACHE_DEBUG_LOG  "keycache_debug.log"
340 
341 #if defined(KEYCACHE_DEBUG) && ! defined(KEYCACHE_DEBUG_LOG)
342 #define KEYCACHE_DEBUG_LOG  DEFAULT_KEYCACHE_DEBUG_LOG
343 #endif
344 
345 #if defined(KEYCACHE_DEBUG_LOG)
346 static FILE *keycache_debug_log=NULL;
347 static void keycache_debug_print(const char *fmt,...);
348 #define KEYCACHE_DEBUG_OPEN                                                   \
349           if (!keycache_debug_log)                                            \
350           {                                                                   \
351             keycache_debug_log= fopen(KEYCACHE_DEBUG_LOG, "w");               \
352             (void) setvbuf(keycache_debug_log, NULL, _IOLBF, BUFSIZ);         \
353           }
354 
355 #define KEYCACHE_DEBUG_CLOSE                                                  \
356           if (keycache_debug_log)                                             \
357           {                                                                   \
358             fclose(keycache_debug_log);                                       \
359             keycache_debug_log= 0;                                            \
360           }
361 #else
362 #define KEYCACHE_DEBUG_OPEN
363 #define KEYCACHE_DEBUG_CLOSE
364 #endif /* defined(KEYCACHE_DEBUG_LOG) */
365 
366 #if defined(KEYCACHE_DEBUG_LOG) && defined(KEYCACHE_DEBUG)
367 #define KEYCACHE_DBUG_PRINT(l, m)                                             \
368             { if (keycache_debug_log) fprintf(keycache_debug_log, "%s: ", l); \
369               keycache_debug_print m; }
370 
371 #define KEYCACHE_DBUG_ASSERT(a)                                               \
372             { if (! (a) && keycache_debug_log) fclose(keycache_debug_log);    \
373               assert(a); }
374 #else
375 #define KEYCACHE_DBUG_PRINT(l, m)  DBUG_PRINT(l, m)
376 #define KEYCACHE_DBUG_ASSERT(a)    DBUG_ASSERT(a)
377 #endif /* defined(KEYCACHE_DEBUG_LOG) && defined(KEYCACHE_DEBUG) */
378 
379 #if defined(KEYCACHE_DEBUG) || !defined(DBUG_OFF)
380 static long keycache_thread_id;
381 #define KEYCACHE_THREAD_TRACE(l)                                              \
382              KEYCACHE_DBUG_PRINT(l,("|thread %ld",keycache_thread_id))
383 
384 #define KEYCACHE_THREAD_TRACE_BEGIN(l)                                        \
385             { struct st_my_thread_var *thread_var= my_thread_var;             \
386               keycache_thread_id= thread_var->id;                             \
387               KEYCACHE_DBUG_PRINT(l,("[thread %ld",keycache_thread_id)) }
388 
389 #define KEYCACHE_THREAD_TRACE_END(l)                                          \
390             KEYCACHE_DBUG_PRINT(l,("]thread %ld",keycache_thread_id))
391 #else
392 #define KEYCACHE_THREAD_TRACE_BEGIN(l)
393 #define KEYCACHE_THREAD_TRACE_END(l)
394 #define KEYCACHE_THREAD_TRACE(l)
395 #endif /* defined(KEYCACHE_DEBUG) || !defined(DBUG_OFF) */
396 
397 #define BLOCK_NUMBER(b)                                                       \
398   ((uint) (((char*)(b)-(char *) keycache->block_root)/sizeof(BLOCK_LINK)))
399 #define HASH_LINK_NUMBER(h)                                                   \
400   ((uint) (((char*)(h)-(char *) keycache->hash_link_root)/sizeof(HASH_LINK)))
401 
402 #if (defined(KEYCACHE_TIMEOUT) && !defined(__WIN__)) || defined(KEYCACHE_DEBUG)
403 static int keycache_pthread_cond_wait(mysql_cond_t *cond,
404                                       mysql_mutex_t *mutex);
405 #else
406 #define keycache_pthread_cond_wait(C, M) mysql_cond_wait(C, M)
407 #endif
408 
409 #if defined(KEYCACHE_DEBUG)
410 static int keycache_pthread_mutex_lock(mysql_mutex_t *mutex);
411 static void keycache_pthread_mutex_unlock(mysql_mutex_t *mutex);
412 static int keycache_pthread_cond_signal(mysql_cond_t *cond);
413 #else
414 #define keycache_pthread_mutex_lock(M) mysql_mutex_lock(M)
415 #define keycache_pthread_mutex_unlock(M) mysql_mutex_unlock(M)
416 #define keycache_pthread_cond_signal(C) mysql_cond_signal(C)
417 #endif /* defined(KEYCACHE_DEBUG) */
418 
419 #if !defined(DBUG_OFF)
420 #if defined(inline)
421 #undef inline
422 #endif
423 #define inline  /* disabled inline for easier debugging */
424 static int fail_hlink(HASH_LINK *hlink);
425 static int cache_empty(SIMPLE_KEY_CACHE_CB *keycache);
426 #endif
427 #ifdef DBUG_ASSERT_EXISTS
428 static int fail_block(BLOCK_LINK *block);
429 #endif
430 
431 static inline uint next_power(uint value)
432 {
433   return (uint) my_round_up_to_next_power((uint32) value) << 1;
434 }
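/*
  Assuming my_round_up_to_next_power() rounds its argument up to the nearest
  power of two, next_power() returns twice that value, e.g.
  next_power(5) == 16 and next_power(8) == 16.
*/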
435 
436 
437 /*
438   Initialize a simple key cache
439 
440   SYNOPSIS
441     init_simple_key_cache()
442     keycache                pointer to the control block of a simple key cache
443     key_cache_block_size    size of blocks to keep cached data
444     use_mem                 memory to use for the key cache buffers/structures
445     division_limit          division limit (may be zero)
446     age_threshold           age threshold (may be zero)
447
448   DESCRIPTION
449     This function is the implementation of the init_key_cache interface
450     function that is employed by simple (non-partitioned) key caches.
451     The function builds a simple key cache and initializes the control block
452     structure of the type SIMPLE_KEY_CACHE_CB that is used for this key cache.
453     The parameter keycache is supposed to point to this structure.
454     The parameter key_cache_block_size specifies the size of the blocks in
455     the key cache to be built. The parameters division_limit and age_threshold
456     determine the initial values of those characteristics of the key cache
457     that are used for midpoint insertion strategy. The parameter use_mem
458     specifies the total amount of memory to be allocated for key cache blocks
459     and auxiliary structures.
460 
461   RETURN VALUE
462     number of blocks in the key cache, if successful,
463     <= 0 - otherwise.
464 
465   NOTES.
466     if keycache->key_cache_inited != 0 we assume that the key cache
467     is already initialized.  This is for now used by myisamchk, but shouldn't
468     be something that a program should rely on!
469 
470     It's assumed that no two threads call this function simultaneously
471     referring to the same key cache handle.
472 */
473 
474 static
475 int init_simple_key_cache(SIMPLE_KEY_CACHE_CB *keycache,
476                           uint key_cache_block_size,
477 		          size_t use_mem, uint division_limit,
478 		          uint age_threshold, uint changed_blocks_hash_size)
479 {
480   ulong blocks, hash_links;
481   size_t length;
482   int error;
483   DBUG_ENTER("init_simple_key_cache");
484   DBUG_ASSERT(key_cache_block_size >= 512);
485 
486   KEYCACHE_DEBUG_OPEN;
487   if (keycache->key_cache_inited && keycache->disk_blocks > 0)
488   {
489     DBUG_PRINT("warning",("key cache already in use"));
490     DBUG_RETURN(0);
491   }
492 
493   keycache->blocks_used= keycache->blocks_unused= 0;
494   keycache->global_blocks_changed= 0;
495   keycache->global_cache_w_requests= keycache->global_cache_r_requests= 0;
496   keycache->global_cache_read= keycache->global_cache_write= 0;
497   keycache->disk_blocks= -1;
498   if (! keycache->key_cache_inited)
499   {
500     keycache->key_cache_inited= 1;
501     keycache->hash_factor= 1;
502     /*
503       Initialize these variables once only.
504       Their value must survive re-initialization during resizing.
505     */
506     keycache->in_resize= 0;
507     keycache->resize_in_flush= 0;
508     keycache->cnt_for_resize_op= 0;
509     keycache->waiting_for_resize_cnt.last_thread= NULL;
510     keycache->in_init= 0;
511     mysql_mutex_init(key_KEY_CACHE_cache_lock,
512                      &keycache->cache_lock, MY_MUTEX_INIT_FAST);
513     keycache->resize_queue.last_thread= NULL;
514   }
515 
516   keycache->key_cache_mem_size= use_mem;
517   keycache->key_cache_block_size= key_cache_block_size;
518   DBUG_PRINT("info", ("key_cache_block_size: %u",
519 		      key_cache_block_size));
520 
521   blocks= (ulong) (use_mem / (sizeof(BLOCK_LINK) + 2 * sizeof(HASH_LINK) +
522                               sizeof(HASH_LINK*) * 5/4 + key_cache_block_size));
523 
524   /* Changed blocks hash needs to be a power of 2 */
525   changed_blocks_hash_size= my_round_up_to_next_power(MY_MAX(changed_blocks_hash_size,
526                                                              MIN_CHANGED_BLOCKS_HASH_SIZE));
527 
528   /* It doesn't make sense to have too few blocks (less than 8) */
529   if (blocks >= 8)
530   {
531     for ( ; ; )
532     {
533       /* Set hash_entries to the next bigger power of 2 */
534       if ((keycache->hash_entries= next_power(blocks)) < blocks * 5/4)
535         keycache->hash_entries<<= 1;
536       hash_links= 2 * blocks;
537 #if defined(MAX_THREADS)
538       if (hash_links < MAX_THREADS + blocks - 1)
539         hash_links= MAX_THREADS + blocks - 1;
540 #endif
541       while ((length= (ALIGN_SIZE(blocks * sizeof(BLOCK_LINK)) +
542 		       ALIGN_SIZE(hash_links * sizeof(HASH_LINK)) +
543 		       ALIGN_SIZE(sizeof(HASH_LINK*) *
544                                   keycache->hash_entries) +
545                        sizeof(BLOCK_LINK*)* (changed_blocks_hash_size*2))) +
546              ((size_t) blocks * keycache->key_cache_block_size) > use_mem && blocks > 8)
547         blocks--;
548       /* Allocate memory for cache page buffers */
549       if ((keycache->block_mem=
550 	   my_large_malloc((size_t) blocks * keycache->key_cache_block_size,
551 			  MYF(0))))
552       {
553         /*
554 	  Allocate memory for blocks, hash_links and hash entries;
555 	  For each block 2 hash links are allocated
556         */
557         if (my_multi_malloc_large(MYF(MY_ZEROFILL),
558                                   &keycache->block_root,
559                                   (ulonglong) (blocks * sizeof(BLOCK_LINK)),
560                                   &keycache->hash_root,
561                                   (ulonglong) (sizeof(HASH_LINK*) *
562                                                keycache->hash_entries),
563                                   &keycache->hash_link_root,
564                                   (ulonglong) (hash_links * sizeof(HASH_LINK)),
565                                   &keycache->changed_blocks,
566                                   (ulonglong) (sizeof(BLOCK_LINK*) *
567                                                changed_blocks_hash_size),
568                                   &keycache->file_blocks,
569                                   (ulonglong) (sizeof(BLOCK_LINK*) *
570                                                changed_blocks_hash_size),
571                                   NullS))
572           break;
573         my_large_free(keycache->block_mem);
574         keycache->block_mem= 0;
575       }
576       if (blocks < 8)
577       {
578         my_errno= ENOMEM;
579         my_error(EE_OUTOFMEMORY, MYF(ME_FATAL),
580                  blocks * keycache->key_cache_block_size);
581         goto err;
582       }
583       blocks= blocks / 4*3;
584     }
585     keycache->blocks_unused= blocks;
586     keycache->disk_blocks= (int) blocks;
587     keycache->hash_links= hash_links;
588     keycache->hash_links_used= 0;
589     keycache->free_hash_list= NULL;
590     keycache->blocks_used= keycache->blocks_changed= 0;
591 
592     keycache->global_blocks_changed= 0;
593     keycache->blocks_available=0;		/* For debugging */
594 
595     /* The LRU chain is empty after initialization */
596     keycache->used_last= NULL;
597     keycache->used_ins= NULL;
598     keycache->free_block_list= NULL;
599     keycache->keycache_time= 0;
600     keycache->warm_blocks= 0;
601     keycache->min_warm_blocks= (division_limit ?
602 				blocks * division_limit / 100 + 1 :
603 				blocks);
604     keycache->age_threshold= (age_threshold ?
605 			      blocks * age_threshold / 100 :
606 			      blocks);
607     keycache->changed_blocks_hash_size= changed_blocks_hash_size;
608     keycache->can_be_used= 1;
609 
610     keycache->waiting_for_hash_link.last_thread= NULL;
611     keycache->waiting_for_block.last_thread= NULL;
612     DBUG_PRINT("exit",
613 	       ("disk_blocks: %d  block_root: %p  hash_entries: %d\
614  hash_root: %p  hash_links: %d  hash_link_root: %p",
615 		keycache->disk_blocks,   keycache->block_root,
616 		keycache->hash_entries,  keycache->hash_root,
617 		keycache->hash_links,    keycache->hash_link_root));
618   }
619   else
620   {
621     /* key_buffer_size is specified too small. Disable the cache. */
622     keycache->can_be_used= 0;
623   }
624 
625   keycache->blocks= keycache->disk_blocks > 0 ? keycache->disk_blocks : 0;
626   DBUG_RETURN((int) keycache->disk_blocks);
627 
628 err:
629   error= my_errno;
630   keycache->disk_blocks= 0;
631   keycache->blocks=  0;
632   if (keycache->block_mem)
633   {
634     my_large_free((uchar*) keycache->block_mem);
635     keycache->block_mem= NULL;
636   }
637   if (keycache->block_root)
638   {
639     my_free(keycache->block_root);
640     keycache->block_root= NULL;
641   }
642   my_errno= error;
643   keycache->can_be_used= 0;
644   DBUG_RETURN(0);
645 }
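/*
  Usage sketch (illustrative only, with made-up sizes): setting up a
  standalone simple key cache with 1 MB of memory and 1 KB blocks. The
  values 100 and 300 for division_limit and age_threshold correspond to
  the usual server defaults of the matching system variables;
  changed_blocks_hash_size is rounded up internally to at least
  MIN_CHANGED_BLOCKS_HASH_SIZE.

    SIMPLE_KEY_CACHE_CB cb;
    int nblocks;
    bzero((char*) &cb, sizeof(cb));
    nblocks= init_simple_key_cache(&cb, 1024, 1024*1024, 100, 300, 128);
    if (nblocks <= 0)
      ... the cache could not be enabled; reads/writes will bypass it ...
*/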
646 
647 
648 /*
649   Prepare for resizing a simple key cache
650 
651   SYNOPSIS
652     prepare_resize_simple_key_cache()
653     keycache                pointer to the control block of a simple key cache
654     release_lock            <=> release the key cache lock before return
655
656   DESCRIPTION
657     This function flushes all dirty pages from a simple key cache and after
658     this it destroys the key cache calling end_simple_key_cache. The function
659     takes the parameter keycache as a pointer to the control block
660     structure of the type SIMPLE_KEY_CACHE_CB for this key cache.
661     The parameter release_lock says whether the key cache lock must be
662     released before return from the function.
663 
664   RETURN VALUE
665     0 - on success,
666     1 - otherwise.
667 
668   NOTES
669     This function is called by resize_simple_key_cache and
670     resize_partitioned_key_cache that resize simple and partitioned key caches
671     respectively.
672 */
673 
674 static
675 int prepare_resize_simple_key_cache(SIMPLE_KEY_CACHE_CB *keycache,
676                                     my_bool release_lock)
677 {
678   int res= 0;
679   DBUG_ENTER("prepare_resize_simple_key_cache");
680 
681   keycache_pthread_mutex_lock(&keycache->cache_lock);
682 
683   /*
684     We may need to wait for another thread which is doing a resize
685     already. This cannot happen in the MySQL server though. It allows
686     one resizer only. In set_var.cc keycache->in_init is used to block
687     multiple attempts.
688   */
689   while (keycache->in_resize)
690   {
691     /* purecov: begin inspected */
692     wait_on_queue(&keycache->resize_queue, &keycache->cache_lock);
693     /* purecov: end */
694   }
695 
696   /*
697     Mark the operation in progress. This blocks other threads from doing
698     a resize in parallel. It prohibits new blocks to enter the cache.
699     Read/write requests can bypass the cache during the flush phase.
700   */
701   keycache->in_resize= 1;
702 
703   /* Need to flush only if keycache is enabled. */
704   if (keycache->can_be_used)
705   {
706     /* Start the flush phase. */
707     keycache->resize_in_flush= 1;
708 
709     if (flush_all_key_blocks(keycache))
710     {
711       /* TODO: if this happens, we should write a warning in the log file ! */
712       keycache->resize_in_flush= 0;
713       keycache->can_be_used= 0;
714       res= 1;
715       goto finish;
716     }
717     DBUG_SLOW_ASSERT(cache_empty(keycache));
718 
719     /* End the flush phase. */
720     keycache->resize_in_flush= 0;
721   }
722 
723   /*
724     Some direct read/write operations (bypassing the cache) may still be
725     unfinished. Wait until they are done. If the key cache can be used,
726     direct I/O is done in increments of key_cache_block_size. That is,
727     every block is checked if it is in the cache. We need to wait for
728     pending I/O before re-initializing the cache, because we may change
729     the block size. Otherwise they could check for blocks at file
730     positions where the new block division has none. We do also want to
731     wait for I/O done when (if) the cache was disabled. It must not
732     run in parallel with normal cache operation.
733   */
734   while (keycache->cnt_for_resize_op)
735     wait_on_queue(&keycache->waiting_for_resize_cnt, &keycache->cache_lock);
736 
737   end_simple_key_cache(keycache, 0);
738 
739 finish:
740   if (release_lock)
741     keycache_pthread_mutex_unlock(&keycache->cache_lock);
742   DBUG_RETURN(res);
743 }
744 
745 
746 /*
747   Finalize resizing a simple key cache
748 
749   SYNOPSIS
750     finish_resize_simple_key_cache()
751     keycache                pointer to the control block of a simple key cache
752 
753   DESCRIPTION
754     This function performs finalizing actions for the operation of
755     resizing a simple key cache. The function takes the parameter
756     keycache as a pointer to the control block structure of the type
757     SIMPLE_KEY_CACHE_CB for this key cache. The function sets the flag
758     in_resize in this structure to FALSE.
759 
760   RETURN VALUE
761     none
762 
763   NOTES
764     This function is called by resize_simple_key_cache and
765     resize_partitioned_key_cache that resize simple and partitioned key caches
766     respectively.
767 */
768 
769 static
770 void finish_resize_simple_key_cache(SIMPLE_KEY_CACHE_CB *keycache)
771 {
772   DBUG_ENTER("finish_resize_simple_key_cache");
773 
774   mysql_mutex_assert_owner(&keycache->cache_lock);
775 
776   /*
777     Mark the resize finished. This allows other threads to start a
778     resize or to request new cache blocks.
779   */
780   keycache->in_resize= 0;
781 
782 
783   /* Signal waiting threads. */
784   release_whole_queue(&keycache->resize_queue);
785 
786 
787   keycache_pthread_mutex_unlock(&keycache->cache_lock);
788 
789   DBUG_VOID_RETURN;
790 }
791 
792 
793 /*
794   Resize a simple key cache
795 
796   SYNOPSIS
797     resize_simple_key_cache()
798     keycache                pointer to the control block of a simple key cache
799     key_cache_block_size    size of blocks to keep cached data
800     use_mem                 memory to use for the key cache buffers/structures
801     division_limit          new division limit (if not zero)
802     age_threshold           new age threshold (if not zero)
803 
804   DESCRIPTION
805     This function is the implementation of the resize_key_cache interface
806     function that is employed by simple (non-partitioned) key caches.
807     The function takes the parameter keycache as a pointer to the
808     control block structure of the type SIMPLE_KEY_CACHE_CB for the simple key
809     cache to be resized.
810     The parameter key_cache_block_size specifies the new size of the blocks in
811     the key cache. The parameters division_limit and age_threshold
812     determine the new initial values of those characteristics of the key cache
813     that are used for midpoint insertion strategy. The parameter use_mem
814     specifies the total amount of memory to be allocated for key cache blocks
815     and auxiliary structures in the new key cache.
816 
817   RETURN VALUE
818     number of blocks in the key cache, if successful,
819     0 - otherwise.
820 
821   NOTES.
822     The function first calls the function prepare_resize_simple_key_cache
823     to flush all dirty blocks from key cache, to free memory used
824     for key cache blocks and auxiliary structures. After this the
825     function builds a new key cache with new parameters.
826 
827     This implementation doesn't block the calls and executions of other
828     functions from the key cache interface. However it assumes that the
829     calls of resize_simple_key_cache itself are serialized.
830 
831     The function starts the operation only when all other threads
832     performing operations with the key cache let it proceed
833     (when cnt_for_resize_op == 0).
834 */
835 
836 static
837 int resize_simple_key_cache(SIMPLE_KEY_CACHE_CB *keycache,
838                             uint key_cache_block_size,
839 		            size_t use_mem, uint division_limit,
840 		            uint age_threshold, uint changed_blocks_hash_size)
841 {
842   int blocks= 0;
843   DBUG_ENTER("resize_simple_key_cache");
844 
845   DBUG_ASSERT(keycache->key_cache_inited);
846 
847   /*
848     Note that the cache_lock mutex and the resize_queue are left untouched.
849     We do not lose the cache_lock and will release it only at the end of
850     this function.
851   */
852   if (prepare_resize_simple_key_cache(keycache, 0))
853     goto finish;
854 
855   /* The following will work even if use_mem is 0 */
856   blocks= init_simple_key_cache(keycache, key_cache_block_size, use_mem,
857 			        division_limit, age_threshold,
858                                 changed_blocks_hash_size);
859 
860 finish:
861   finish_resize_simple_key_cache(keycache);
862 
863   DBUG_RETURN(blocks);
864 }
865 
866 
867 /*
868   Increment counter blocking resize key cache operation
869 */
870 static inline void inc_counter_for_resize_op(SIMPLE_KEY_CACHE_CB *keycache)
871 {
872   keycache->cnt_for_resize_op++;
873 }
874 
875 
876 /*
877   Decrement counter blocking resize key cache operation;
878   Signal the operation to proceed when counter becomes equal zero
879 */
880 static inline void dec_counter_for_resize_op(SIMPLE_KEY_CACHE_CB *keycache)
881 {
882   if (!--keycache->cnt_for_resize_op)
883     release_whole_queue(&keycache->waiting_for_resize_cnt);
884 }
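/*
  Illustrative pairing (not a function in this file): a thread doing direct
  I/O that bypasses the cache brackets the operation with these counters so
  that a resizer can wait until cnt_for_resize_op drops to zero:

    inc_counter_for_resize_op(keycache);
    keycache_pthread_mutex_unlock(&keycache->cache_lock);
    ... direct read or write on the file ...
    keycache_pthread_mutex_lock(&keycache->cache_lock);
    dec_counter_for_resize_op(keycache);
*/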
885 
886 
887 /*
888   Change key cache parameters of a simple key cache
889 
890   SYNOPSIS
891     change_simple_key_cache_param()
892     keycache                pointer to the control block of a simple key cache
893     division_limit          new division limit (if not zero)
894     age_threshold           new age threshold (if not zero)
895 
896   DESCRIPTION
897     This function is the implementation of the change_key_cache_param interface
898     function that is employed by simple (non-partitioned) key caches.
899     The function takes the parameter keycache as a pointer to the
900     control block structure of the type SIMPLE_KEY_CACHE_CB for the simple key
901     cache where new values of the division limit and the age threshold used
902     for midpoint insertion strategy are to be set.  The parameters
903     division_limit and age_threshold provide these new values.
904 
905   RETURN VALUE
906     none
907 
908   NOTES.
909     Presently the function resets the key cache parameters concerning
910     midpoint insertion strategy - division_limit and age_threshold.
911     This function changes some parameters of a given key cache without
912     reformatting it. The function does not touch the contents of the key
913     cache blocks.
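
    For example (illustrative numbers), with disk_blocks == 1000, calling
    the function with division_limit == 20 and age_threshold == 300 sets
    min_warm_blocks to 1000 * 20 / 100 + 1 == 201 and age_threshold to
    1000 * 300 / 100 == 3000.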
914 */
915 
916 static
917 void change_simple_key_cache_param(SIMPLE_KEY_CACHE_CB *keycache, uint division_limit,
918 			           uint age_threshold)
919 {
920   DBUG_ENTER("change_simple_key_cache_param");
921   keycache_pthread_mutex_lock(&keycache->cache_lock);
922   if (division_limit)
923     keycache->min_warm_blocks= (keycache->disk_blocks *
924 				division_limit / 100 + 1);
925   if (age_threshold)
926     keycache->age_threshold=   (keycache->disk_blocks *
927 				age_threshold / 100);
928   keycache_pthread_mutex_unlock(&keycache->cache_lock);
929   DBUG_VOID_RETURN;
930 }
931 
932 
933 /*
934   Destroy a simple key cache
935 
936   SYNOPSIS
937     end_simple_key_cache()
938     keycache                pointer to the control block of a simple key cache
939     cleanup                 <=> complete free (free also mutex for key cache)
940 
941   DESCRIPTION
942     This function is the implementation of the end_key_cache interface
943     function that is employed by simple (non-partitioned) key caches.
944     The function takes the parameter keycache as a pointer to the
945     control block structure of the type SIMPLE_KEY_CACHE_CB for the simple key
946     cache to be destroyed.
947     The function frees the memory allocated for the key cache blocks and
948     auxiliary structures. If the value of the parameter cleanup is TRUE
949     then even the key cache mutex is freed.
950 
951   RETURN VALUE
952     none
953 */
954 
955 static
956 void end_simple_key_cache(SIMPLE_KEY_CACHE_CB *keycache, my_bool cleanup)
957 {
958   DBUG_ENTER("end_simple_key_cache");
959   DBUG_PRINT("enter", ("key_cache: %p",  keycache));
960 
961   if (!keycache->key_cache_inited)
962     DBUG_VOID_RETURN;
963 
964   if (keycache->disk_blocks > 0)
965   {
966     if (keycache->block_mem)
967     {
968       my_large_free((uchar*) keycache->block_mem);
969       keycache->block_mem= NULL;
970       my_free(keycache->block_root);
971       keycache->block_root= NULL;
972     }
973     keycache->disk_blocks= -1;
974     /* Reset blocks_changed to be safe if flush_all_key_blocks is called */
975     keycache->blocks_changed= 0;
976   }
977 
978   DBUG_PRINT("status", ("used: %lu  changed: %lu  w_requests: %lu  "
979                         "writes: %lu  r_requests: %lu  reads: %lu",
980                         keycache->blocks_used, keycache->global_blocks_changed,
981                         (ulong) keycache->global_cache_w_requests,
982                         (ulong) keycache->global_cache_write,
983                         (ulong) keycache->global_cache_r_requests,
984                         (ulong) keycache->global_cache_read));
985 
986   /*
987     Reset these values to be able to detect a disabled key cache.
988     See Bug#44068 (RESTORE can disable the MyISAM Key Cache).
989   */
990   keycache->blocks_used= 0;
991   keycache->blocks_unused= 0;
992 
993   if (cleanup)
994   {
995     mysql_mutex_destroy(&keycache->cache_lock);
996     keycache->key_cache_inited= keycache->can_be_used= 0;
997     KEYCACHE_DEBUG_CLOSE;
998   }
999   DBUG_VOID_RETURN;
1000 } /* end_key_cache */
1001 
1002 
1003 /*
1004   Link a thread into double-linked queue of waiting threads.
1005 
1006   SYNOPSIS
1007     link_into_queue()
1008       wqueue              pointer to the queue structure
1009       thread              pointer to the thread to be added to the queue
1010 
1011   RETURN VALUE
1012     none
1013 
1014   NOTES.
1015     Queue is represented by a circular list of the thread structures
1016     The list is double-linked of the type (**prev,*next), accessed by
1017     a pointer to the last element.
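
    For illustration, after linking three threads A, B and C in that order,
    the ring of 'next' pointers is A -> B -> C -> A, wqueue->last_thread
    points to C (the newest element) and last_thread->next is A (the oldest).
    Each thread's '**prev' points at the 'next' field of its predecessor in
    the ring.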
1018 */
1019 
1020 static void link_into_queue(KEYCACHE_WQUEUE *wqueue,
1021                             struct st_my_thread_var *thread)
1022 {
1023   struct st_my_thread_var *last;
1024   DBUG_ASSERT(!thread->next && !thread->prev);
1025 
1026   if (! (last= wqueue->last_thread))
1027   {
1028     /* Queue is empty */
1029     thread->next= thread;
1030     thread->prev= &thread->next;
1031   }
1032   else
1033   {
1034     DBUG_ASSERT(last->next->prev == &last->next);
1035     /* Add backlink to previous element */
1036     thread->prev=      last->next->prev;
1037     /* Fix first in list to point backwards to current */
1038     last->next->prev=  &thread->next;
1039     /* Next should point to the first element in list */
1040     thread->next=      last->next;
1041     /* Fix old element to point to new one */
1042     last->next=        thread;
1043   }
1044   wqueue->last_thread= thread;
1045 }
1046 
1047 /*
1048   Unlink a thread from double-linked queue of waiting threads
1049 
1050   SYNOPSIS
1051     unlink_from_queue()
1052       wqueue              pointer to the queue structure
1053       thread              pointer to the thread to be removed from the queue
1054 
1055   RETURN VALUE
1056     none
1057 
1058   NOTES.
1059     See NOTES for link_into_queue
1060 */
1061 
1062 static void unlink_from_queue(KEYCACHE_WQUEUE *wqueue,
1063                               struct st_my_thread_var *thread)
1064 {
1065   KEYCACHE_DBUG_PRINT("unlink_from_queue", ("thread %ld", (ulong) thread->id));
1066   DBUG_ASSERT(thread->next && thread->prev);
1067 
1068   if (thread->next == thread)
1069   {
1070     /* The queue contains only one member */
1071     wqueue->last_thread= NULL;
1072   }
1073   else
1074   {
1075     /* Remove current element from list */
1076     thread->next->prev= thread->prev;
1077     *thread->prev=      thread->next;
1078     /* If first element, change list pointer to point to previous element */
1079     if (wqueue->last_thread == thread)
1080       wqueue->last_thread= STRUCT_PTR(struct st_my_thread_var, next,
1081                                       thread->prev);
1082   }
1083   thread->next= NULL;
1084 #ifdef DBUG_ASSERT_EXISTS
1085   /*
1086     This makes it easier to see it's not in a chain during debugging.
1087     And some DBUG_ASSERT() rely on it.
1088   */
1089   thread->prev= NULL;
1090 #endif
1091 }
1092 
1093 
1094 /*
1095   Add a thread to single-linked queue of waiting threads
1096 
1097   SYNOPSIS
1098     wait_on_queue()
1099       wqueue            Pointer to the queue structure.
1100       mutex             Cache_lock to acquire after awake.
1101 
1102   RETURN VALUE
1103     none
1104 
1105   NOTES.
1106     Queue is represented by a circular list of the thread structures
1107     The list is single-linked of the type (*next), accessed by a pointer
1108     to the last element.
1109 
1110     The function protects against stray signals by verifying that the
1111     current thread is unlinked from the queue when awaking. However,
1112     since several threads can wait for the same event, it might be
1113     necessary for the caller of the function to check again if the
1114     condition for awake is indeed matched.
1115 */
1116 
1117 static void wait_on_queue(KEYCACHE_WQUEUE *wqueue,
1118                           mysql_mutex_t *mutex)
1119 {
1120   struct st_my_thread_var *last;
1121   struct st_my_thread_var *thread= my_thread_var;
1122   DBUG_ASSERT(!thread->next);
1123   DBUG_ASSERT(!thread->prev); /* Not required, but must be true anyway. */
1124   mysql_mutex_assert_owner(mutex);
1125 
1126   /* Add to queue. */
1127   if (! (last= wqueue->last_thread))
1128     thread->next= thread;
1129   else
1130   {
1131     thread->next= last->next;
1132     last->next= thread;
1133   }
1134   wqueue->last_thread= thread;
1135 
1136   /*
1137     Wait until thread is removed from queue by the signaling thread.
1138     The loop protects against stray signals.
1139   */
1140   do
1141   {
1142     KEYCACHE_DBUG_PRINT("wait", ("suspend thread %ld", (ulong) thread->id));
1143     keycache_pthread_cond_wait(&thread->suspend, mutex);
1144   }
1145   while (thread->next);
1146 }
1147 
1148 
1149 /*
1150   Remove all threads from queue signaling them to proceed
1151 
1152   SYNOPSIS
1153     release_whole_queue()
1154       wqueue            pointer to the queue structure
1155 
1156   RETURN VALUE
1157     none
1158 
1159   NOTES.
1160     See notes for wait_on_queue().
1161     When removed from the queue each thread is signaled via condition
1162     variable thread->suspend.
1163 */
1164 
1165 static void release_whole_queue(KEYCACHE_WQUEUE *wqueue)
1166 {
1167   struct st_my_thread_var *last;
1168   struct st_my_thread_var *next;
1169   struct st_my_thread_var *thread;
1170 
1171   /* Queue may be empty. */
1172   if (!(last= wqueue->last_thread))
1173     return;
1174 
1175   next= last->next;                             /* First (oldest) element */
1176   do
1177   {
1178     thread=next;
1179     DBUG_ASSERT(thread && thread->init == 1);
1180     KEYCACHE_DBUG_PRINT("release_whole_queue: signal",
1181                         ("thread %ld", (ulong) thread->id));
1182     /* Take thread from queue. */
1183     next= thread->next;
1184     thread->next= NULL;
1185 
1186     /* Signal the thread. */
1187     keycache_pthread_cond_signal(&thread->suspend);
1188   }
1189   while (thread != last);
1190 
1191   /* Now queue is definitely empty. */
1192   wqueue->last_thread= NULL;
1193 }
1194 
1195 
1196 /*
1197   Unlink a block from the chain of dirty/clean blocks
1198 */
1199 
1200 static inline void unlink_changed(BLOCK_LINK *block)
1201 {
1202   DBUG_ASSERT(block->prev_changed && *block->prev_changed == block);
1203   if (block->next_changed)
1204     block->next_changed->prev_changed= block->prev_changed;
1205   *block->prev_changed= block->next_changed;
1206 
1207 #ifdef DBUG_ASSERT_EXISTS
1208   /*
1209     This makes it easier to see it's not in a chain during debugging.
1210     And some DBUG_ASSERT() rely on it.
1211   */
1212   block->next_changed= NULL;
1213   block->prev_changed= NULL;
1214 #endif
1215 }
1216 
1217 
1218 /*
1219   Link a block into the chain of dirty/clean blocks
1220 */
1221 
1222 static inline void link_changed(BLOCK_LINK *block, BLOCK_LINK **phead)
1223 {
1224   DBUG_ASSERT(!block->next_changed);
1225   DBUG_ASSERT(!block->prev_changed);
1226   block->prev_changed= phead;
1227   if ((block->next_changed= *phead))
1228     (*phead)->prev_changed= &block->next_changed;
1229   *phead= block;
1230 }
1231 
1232 
1233 /*
1234   Link a block in a chain of clean blocks of a file.
1235 
1236   SYNOPSIS
1237     link_to_file_list()
1238       keycache		Key cache handle
1239       block             Block to relink
1240       file              File to be linked to
1241       unlink            If to unlink first
1242 
1243   DESCRIPTION
1244     Unlink a block from whichever chain it is linked in, if it's
1245     asked for, and link it to the chain of clean blocks of the
1246     specified file.
1247 
1248   NOTE
1249     Please do never set/clear BLOCK_CHANGED outside of
1250     link_to_file_list() or link_to_changed_list().
1251     You would risk damaging the correct counting of changed blocks
1252     and finding blocks in the wrong hash.
1253 
1254   RETURN
1255     void
1256 */
1257 
1258 static void link_to_file_list(SIMPLE_KEY_CACHE_CB *keycache,
1259                               BLOCK_LINK *block, int file,
1260                               my_bool unlink_block)
1261 {
1262   DBUG_ASSERT(block->status & BLOCK_IN_USE);
1263   DBUG_ASSERT(block->hash_link && block->hash_link->block == block);
1264   DBUG_ASSERT(block->hash_link->file == file);
1265   if (unlink_block)
1266     unlink_changed(block);
1267   link_changed(block, &keycache->file_blocks[FILE_HASH(file, keycache)]);
1268   if (block->status & BLOCK_CHANGED)
1269   {
1270     block->status&= ~BLOCK_CHANGED;
1271     keycache->blocks_changed--;
1272     keycache->global_blocks_changed--;
1273   }
1274 }
1275 
1276 
1277 /*
1278   Re-link a block from the clean chain to the dirty chain of a file.
1279 
1280   SYNOPSIS
1281     link_to_changed_list()
1282       keycache		key cache handle
1283       block             block to relink
1284 
1285   DESCRIPTION
1286     Unlink a block from the chain of clean blocks of a file
1287     and link it to the chain of dirty blocks of the same file.
1288 
1289   NOTE
1290     Please do never set/clear BLOCK_CHANGED outside of
1291     link_to_file_list() or link_to_changed_list().
1292     You would risk damaging the correct counting of changed blocks
1293     and finding blocks in the wrong hash.
1294 
1295   RETURN
1296     void
1297 */
1298 
1299 static void link_to_changed_list(SIMPLE_KEY_CACHE_CB *keycache,
1300                                  BLOCK_LINK *block)
1301 {
1302   DBUG_ASSERT(block->status & BLOCK_IN_USE);
1303   DBUG_ASSERT(!(block->status & BLOCK_CHANGED));
1304   DBUG_ASSERT(block->hash_link && block->hash_link->block == block);
1305 
1306   unlink_changed(block);
1307   link_changed(block,
1308                &keycache->changed_blocks[FILE_HASH(block->hash_link->file, keycache)]);
1309   block->status|=BLOCK_CHANGED;
1310   keycache->blocks_changed++;
1311   keycache->global_blocks_changed++;
1312 }
1313 
1314 
1315 /*
1316   Link a block to the LRU chain at the beginning or at the end of
1317   one of two parts.
1318 
1319   SYNOPSIS
1320     link_block()
1321       keycache            pointer to a key cache data structure
1322       block               pointer to the block to link to the LRU chain
1323       hot                 <-> to link the block into the hot subchain
1324       at_end              <-> to link the block at the end of the subchain
1325 
1326   RETURN VALUE
1327     none
1328 
1329   NOTES.
1330     The LRU ring is represented by a circular list of block structures.
1331     The list is double-linked of the type (**prev,*next) type.
1332     The LRU ring is divided into two parts - hot and warm.
1333     There are two pointers to access the last blocks of these two
1334     parts. The beginning of the warm part follows right after the
1335     end of the hot part.
1336     Only blocks of the warm part can be used for eviction.
1337     The first block from the beginning of this subchain is always
1338     taken for eviction (keycache->last_used->next)
1339 
1340     LRU chain:       +------+   H O T    +------+
1341                 +----| end  |----...<----| beg  |----+
1342                 |    +------+last        +------+    |
1343                 v<-link in latest hot (new end)      |
1344                 |     link in latest warm (new end)->^
1345                 |    +------+  W A R M   +------+    |
1346                 +----| beg  |---->...----| end  |----+
1347                      +------+            +------+ins
1348                   first for eviction
1349 
1350     It is also possible that the block is selected for eviction and thus
1351     not linked in the LRU ring.
1352 */
1353 
1354 static void link_block(SIMPLE_KEY_CACHE_CB *keycache, BLOCK_LINK *block,
1355                        my_bool hot, my_bool at_end)
1356 {
1357   BLOCK_LINK *ins;
1358   BLOCK_LINK **pins;
1359 
1360   DBUG_ASSERT((block->status & ~BLOCK_CHANGED) == (BLOCK_READ | BLOCK_IN_USE));
1361   DBUG_ASSERT(block->hash_link); /*backptr to block NULL from free_block()*/
1362   DBUG_ASSERT(!block->requests);
1363   DBUG_ASSERT(block->prev_changed && *block->prev_changed == block);
1364   DBUG_ASSERT(!block->next_used);
1365   DBUG_ASSERT(!block->prev_used);
1366   if (!hot && keycache->waiting_for_block.last_thread)
1367   {
1368     /* Signal that in the LRU warm sub-chain an available block has appeared */
1369     struct st_my_thread_var *last_thread=
1370                                keycache->waiting_for_block.last_thread;
1371     struct st_my_thread_var *first_thread= last_thread->next;
1372     struct st_my_thread_var *next_thread= first_thread;
1373     HASH_LINK *hash_link= (HASH_LINK *) first_thread->keycache_link;
1374     struct st_my_thread_var *thread;
1375     do
1376     {
1377       thread= next_thread;
1378       next_thread= thread->next;
1379       /*
1380          We notify about the event all threads that ask
1381          for the same page as the first thread in the queue
1382       */
1383       if ((HASH_LINK *) thread->keycache_link == hash_link)
1384       {
1385         KEYCACHE_DBUG_PRINT("link_block: signal",
1386                             ("thread %ld", (ulong) thread->id));
1387         keycache_pthread_cond_signal(&thread->suspend);
1388         unlink_from_queue(&keycache->waiting_for_block, thread);
1389         block->requests++;
1390       }
1391     }
1392     while (thread != last_thread);
1393     hash_link->block= block;
1394     /*
1395       NOTE: We assigned the block to the hash_link and signalled the
1396       requesting thread(s). But it is possible that other threads run
1397       first. These threads see the hash_link assigned to a block which
1398       is assigned to another hash_link and not marked BLOCK_IN_SWITCH.
1399       This can be a problem for functions that do not select the block
1400       via its hash_link: flush and free. They do only see a block which
1401       is in a "normal" state and don't know that it will be evicted soon.
1402 
1403       We cannot set BLOCK_IN_SWITCH here because only one of the
1404       requesting threads must handle the eviction. All others must wait
1405       for it to complete. If we set the flag here, the threads would not
1406       know who is in charge of the eviction. Without the flag, the first
1407       thread takes the stick and sets the flag.
1408 
1409       But we need to note in the block that it has been selected for
1410       eviction. It must not be freed. The evicting thread will not
1411       expect the block in the free list. Before freeing we could also
1412       check if block->requests > 1. But I think including another flag
1413       in the check of block->status is slightly more efficient and
1414       probably easier to read.
1415     */
1416     block->status|= BLOCK_IN_EVICTION;
1417     KEYCACHE_THREAD_TRACE("link_block: after signaling");
1418 #if defined(KEYCACHE_DEBUG)
1419     KEYCACHE_DBUG_PRINT("link_block",
1420         ("linked,unlinked block %u  status=%x  #requests=%u  #available=%u",
1421          BLOCK_NUMBER(block), block->status,
1422          block->requests, keycache->blocks_available));
1423 #endif
1424     return;
1425   }
1426   pins= hot ? &keycache->used_ins : &keycache->used_last;
1427   ins= *pins;
1428   if (ins)
1429   {
1430     ins->next_used->prev_used= &block->next_used;
1431     block->next_used= ins->next_used;
1432     block->prev_used= &ins->next_used;
1433     ins->next_used= block;
1434     if (at_end)
1435       *pins= block;
1436   }
1437   else
1438   {
1439     /* The LRU ring is empty. Let the block point to itself. */
1440     keycache->used_last= keycache->used_ins= block->next_used= block;
1441     block->prev_used= &block->next_used;
1442   }
1443   KEYCACHE_THREAD_TRACE("link_block");
1444 #if defined(KEYCACHE_DEBUG)
1445   keycache->blocks_available++;
1446   KEYCACHE_DBUG_PRINT("link_block",
1447       ("linked block %u:%1u  status=%x  #requests=%u  #available=%u",
1448        BLOCK_NUMBER(block), at_end, block->status,
1449        block->requests, keycache->blocks_available));
1450   KEYCACHE_DBUG_ASSERT((ulong) keycache->blocks_available <=
1451                        keycache->blocks_used);
1452 #endif
1453 }
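
/*
  The LRU ring above uses the (**prev_used, *next_used) pointer idiom:
  prev_used does not point at the previous block but at the previous
  block's next_used pointer. A minimal stand-alone sketch of that idiom
  with toy types (an assumption for illustration, not the key cache
  structures themselves):
*/
#if 0 /* illustrative only, never compiled */
struct toy_node
{
  struct toy_node *next;    /* next node in the circular ring              */
  struct toy_node **prev;   /* address of the previous node's next pointer */
};

/* Insert 'node' right after 'ins', cf. the non-empty case in link_block(). */
static void toy_link_after(struct toy_node *ins, struct toy_node *node)
{
  ins->next->prev= &node->next;
  node->next= ins->next;
  node->prev= &ins->next;
  ins->next= node;
}

/* An empty ring gets a single self-linked member, cf. link_block(). */
static void toy_link_single(struct toy_node *node)
{
  node->next= node;
  node->prev= &node->next;
}
#endif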
1454 
1455 
1456 /*
1457   Unlink a block from the LRU chain
1458 
1459   SYNOPSIS
1460     unlink_block()
1461       keycache            pointer to a key cache data structure
1462       block               pointer to the block to unlink from the LRU chain
1463 
1464   RETURN VALUE
1465     none
1466 
1467   NOTES.
1468     See NOTES for link_block
1469 */
1470 
1471 static void unlink_block(SIMPLE_KEY_CACHE_CB *keycache, BLOCK_LINK *block)
1472 {
1473   DBUG_ASSERT((block->status & ~BLOCK_CHANGED) == (BLOCK_READ | BLOCK_IN_USE));
1474   DBUG_ASSERT(block->hash_link); /*backptr to block NULL from free_block()*/
1475   DBUG_ASSERT(!block->requests);
1476   DBUG_ASSERT(block->prev_changed && *block->prev_changed == block);
1477   DBUG_ASSERT(block->next_used && block->prev_used &&
1478               (block->next_used->prev_used == &block->next_used) &&
1479               (*block->prev_used == block));
1480   if (block->next_used == block)
1481     /* The list contains only one member */
1482     keycache->used_last= keycache->used_ins= NULL;
1483   else
1484   {
1485     block->next_used->prev_used= block->prev_used;
1486     *block->prev_used= block->next_used;
1487     if (keycache->used_last == block)
1488       keycache->used_last= STRUCT_PTR(BLOCK_LINK, next_used, block->prev_used);
1489     if (keycache->used_ins == block)
1490       keycache->used_ins=STRUCT_PTR(BLOCK_LINK, next_used, block->prev_used);
1491   }
1492   block->next_used= NULL;
1493 #ifdef DBUG_ASSERT_EXISTS
1494   /*
1495     This makes it easier to see it's not in a chain during debugging.
1496     And some DBUG_ASSERT() rely on it.
1497   */
1498   block->prev_used= NULL;
1499 #endif
1500 
1501   KEYCACHE_THREAD_TRACE("unlink_block");
1502 #if defined(KEYCACHE_DEBUG)
1503   KEYCACHE_DBUG_ASSERT(keycache->blocks_available != 0);
1504   keycache->blocks_available--;
1505   KEYCACHE_DBUG_PRINT("unlink_block",
1506     ("unlinked block %u  status=%x   #requests=%u  #available=%u",
1507      BLOCK_NUMBER(block), block->status,
1508      block->requests, keycache->blocks_available));
1509 #endif
1510 }
1511 
1512 
1513 /*
1514   Register requests for a block.
1515 
1516   SYNOPSIS
1517     reg_requests()
1518       keycache          Pointer to a key cache data structure.
1519       block             Pointer to the block to register a request on.
1520       count             Number of requests. Always 1.
1521 
1522   NOTE
1523     The first request unlinks the block from the LRU ring. This means
1524     that it is protected against eviction.
1525 
1526   RETURN
1527     void
1528 */
1529 static void reg_requests(SIMPLE_KEY_CACHE_CB *keycache,
1530                          BLOCK_LINK *block, int count)
1531 {
1532   DBUG_ASSERT(block->status & BLOCK_IN_USE);
1533   DBUG_ASSERT(block->hash_link);
1534 
1535   if (!block->requests)
1536     unlink_block(keycache, block);
1537   block->requests+=count;
1538 }
1539 
1540 
1541 /*
1542   Unregister request for a block
1543   linking it to the LRU chain if it's the last request
1544 
1545   SYNOPSIS
1546     unreg_request()
1547     keycache            pointer to a key cache data structure
1548     block               pointer to the block to link to the LRU chain
1549     at_end              <-> to link the block at the end of the LRU chain
1550 
1551   RETURN VALUE
1552     none
1553 
1554   NOTES.
1555     Every linking to the LRU ring decrements the block's hits_left counter
1556     by one (if it is positive). If the at_end parameter is TRUE, the block
1557     is added either at the end of the warm sub-chain or at the end of the
1558     hot sub-chain. It is added to the hot sub-chain if its counter has
1559     reached zero and the number of blocks in the warm sub-chain exceeds the
1560     low limit determined by the division_limit parameter. Otherwise the
1561     block is added to the warm sub-chain. If the at_end parameter is FALSE,
1562     the block is always added at the beginning of the warm sub-chain.
1563     Thus a warm block can be promoted to the hot sub-chain when its counter
1564     becomes zero for the first time.
1565     At the same time the block at the very beginning of the hot sub-chain
1566     might be moved to the beginning of the warm sub-chain if it has stayed
1567     untouched for too long (this time is determined by the age_threshold parameter).
1568 
1569     It is also possible that the block is selected for eviction and thus
1570     not linked in the LRU ring.
1571 */
1572 
1573 static void unreg_request(SIMPLE_KEY_CACHE_CB *keycache,
1574                           BLOCK_LINK *block, int at_end)
1575 {
1576   DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
1577   DBUG_ASSERT(block->hash_link); /*backptr to block NULL from free_block()*/
1578   DBUG_ASSERT(block->requests);
1579   DBUG_ASSERT(block->prev_changed && *block->prev_changed == block);
1580   DBUG_ASSERT(!block->next_used);
1581   DBUG_ASSERT(!block->prev_used);
1582   /*
1583     Unregister the request, but do not link erroneous blocks into the
1584     LRU ring.
1585   */
1586   if (!--block->requests && !(block->status & BLOCK_ERROR))
1587   {
1588     my_bool hot;
1589     if (block->hits_left)
1590       block->hits_left--;
1591     hot= !block->hits_left && at_end &&
1592       keycache->warm_blocks > keycache->min_warm_blocks;
1593     if (hot)
1594     {
1595       if (block->temperature == BLOCK_WARM)
1596         keycache->warm_blocks--;
1597       block->temperature= BLOCK_HOT;
1598       KEYCACHE_DBUG_PRINT("unreg_request", ("#warm_blocks: %lu",
1599                            keycache->warm_blocks));
1600     }
1601     link_block(keycache, block, hot, (my_bool)at_end);
1602     block->last_hit_time= keycache->keycache_time;
1603     keycache->keycache_time++;
1604     /*
1605       At this place, the block might be in the LRU ring or not. If an
1606       evicter was waiting for a block, it was selected for eviction and
1607       not linked in the LRU ring.
1608     */
1609 
1610     /*
1611       Check if we should link a hot block to the warm block sub-chain.
1612       It is possible that we select the same block as above. But it can
1613       also be another block. In any case a block from the LRU ring is
1614       selected. In other words it works even if the above block was
1615       selected for eviction and not linked in the LRU ring. Since this
1616       happens only if the LRU ring is empty, the block selected below
1617       would be NULL and the rest of the function skipped.
1618     */
1619     block= keycache->used_ins;
1620     if (block && keycache->keycache_time - block->last_hit_time >
1621 	keycache->age_threshold)
1622     {
1623       unlink_block(keycache, block);
1624       link_block(keycache, block, 0, 0);
1625       if (block->temperature != BLOCK_WARM)
1626       {
1627         keycache->warm_blocks++;
1628         block->temperature= BLOCK_WARM;
1629       }
1630       KEYCACHE_DBUG_PRINT("unreg_request", ("#warm_blocks: %lu",
1631                            keycache->warm_blocks));
1632     }
1633   }
1634 }
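
/*
  The promotion decision made in unreg_request() above, isolated as
  hypothetical helpers for illustration (the real code inlines these
  checks; hits_left has already been decremented when the check runs):
*/
#if 0 /* illustrative only, never compiled */
static my_bool example_should_become_hot(SIMPLE_KEY_CACHE_CB *keycache,
                                         BLOCK_LINK *block, my_bool at_end)
{
  /* Promote only on at_end links, once the per-block hit budget is used
     up and the warm sub-chain is big enough to give a block away. */
  return (my_bool) (!block->hits_left && at_end &&
                    keycache->warm_blocks > keycache->min_warm_blocks);
}

static my_bool example_is_aged_out(SIMPLE_KEY_CACHE_CB *keycache,
                                   BLOCK_LINK *block)
{
  /* cf. the age_threshold check on keycache->used_ins above: an untouched
     hot block is demoted back to the warm sub-chain. */
  return (my_bool) (keycache->keycache_time - block->last_hit_time >
                    keycache->age_threshold);
}
#endif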
1635 
1636 /*
1637   Remove a reader of the page in block
1638 */
1639 
1640 static void remove_reader(BLOCK_LINK *block)
1641 {
1642   DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
1643   DBUG_ASSERT(block->hash_link && block->hash_link->block == block);
1644   DBUG_ASSERT(block->prev_changed && *block->prev_changed == block);
1645   DBUG_ASSERT(!block->next_used);
1646   DBUG_ASSERT(!block->prev_used);
1647   DBUG_ASSERT(block->hash_link->requests);
1648   if (! --block->hash_link->requests && block->condvar)
1649     keycache_pthread_cond_signal(block->condvar);
1650 }
1651 
1652 
1653 /*
1654   Wait until the last reader of the page in block
1655   signals on its termination
1656 */
1657 
1658 static void wait_for_readers(SIMPLE_KEY_CACHE_CB *keycache,
1659                              BLOCK_LINK *block)
1660 {
1661   struct st_my_thread_var *thread= my_thread_var;
1662   DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
1663   DBUG_ASSERT(!(block->status & (BLOCK_IN_FLUSH | BLOCK_CHANGED)));
1664   DBUG_ASSERT(block->hash_link);
1665   DBUG_ASSERT(block->hash_link->block == block);
1666   /* Linked in file_blocks or changed_blocks hash. */
1667   DBUG_ASSERT(block->prev_changed && *block->prev_changed == block);
1668   /* Not linked in LRU ring. */
1669   DBUG_ASSERT(!block->next_used);
1670   DBUG_ASSERT(!block->prev_used);
1671   while (block->hash_link->requests)
1672   {
1673     KEYCACHE_DBUG_PRINT("wait_for_readers: wait",
1674                         ("suspend thread %ld  block %u",
1675                          (ulong) thread->id, BLOCK_NUMBER(block)));
1676     /* There must be no other waiter. We have no queue here. */
1677     DBUG_ASSERT(!block->condvar);
1678     block->condvar= &thread->suspend;
1679     keycache_pthread_cond_wait(&thread->suspend, &keycache->cache_lock);
1680     block->condvar= NULL;
1681   }
1682 }
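
/*
  How the pieces of the reader protocol fit together (an illustrative
  summary, not a separate code path): get_hash_link() below registers a
  reader by incrementing hash_link->requests, remove_reader() releases it,
  and an evicting thread parks itself on block->condvar until the count
  drops to zero.

    reader thread:                      evicting thread:
      get_hash_link()                     wait_for_readers()
        hash_link->requests++               while (hash_link->requests)
      ... use the block ...                   wait on block->condvar
      remove_reader()
        --hash_link->requests
        signal block->condvar if set
*/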
1683 
1684 
1685 /*
1686   Add a hash link to a bucket in the hash_table
1687 */
1688 
1689 static inline void link_hash(HASH_LINK **start, HASH_LINK *hash_link)
1690 {
1691   if (*start)
1692     (*start)->prev= &hash_link->next;
1693   hash_link->next= *start;
1694   hash_link->prev= start;
1695   *start= hash_link;
1696 }
1697 
1698 
1699 /*
1700   Remove a hash link from the hash table
1701 */
1702 
1703 static void unlink_hash(SIMPLE_KEY_CACHE_CB *keycache, HASH_LINK *hash_link)
1704 {
1705   KEYCACHE_DBUG_PRINT("unlink_hash", ("fd: %u  pos: %lu  #requests=%u",
1706       (uint) hash_link->file,(ulong) hash_link->diskpos, hash_link->requests));
1707   KEYCACHE_DBUG_ASSERT(hash_link->requests == 0);
1708   if ((*hash_link->prev= hash_link->next))
1709     hash_link->next->prev= hash_link->prev;
1710   hash_link->block= NULL;
1711   if (keycache->waiting_for_hash_link.last_thread)
1712   {
1713     /* Signal that a free hash link has appeared */
1714     struct st_my_thread_var *last_thread=
1715                                keycache->waiting_for_hash_link.last_thread;
1716     struct st_my_thread_var *first_thread= last_thread->next;
1717     struct st_my_thread_var *next_thread= first_thread;
1718     KEYCACHE_PAGE *first_page= (KEYCACHE_PAGE *) (first_thread->keycache_link);
1719     struct st_my_thread_var *thread;
1720 
1721     hash_link->file= first_page->file;
1722     hash_link->diskpos= first_page->filepos;
1723     do
1724     {
1725       KEYCACHE_PAGE *page;
1726       thread= next_thread;
1727       page= (KEYCACHE_PAGE *) thread->keycache_link;
1728       next_thread= thread->next;
1729       /*
1730          We notify about the event all threads that ask
1731          for the same page as the first thread in the queue
1732       */
1733       if (page->file == hash_link->file && page->filepos == hash_link->diskpos)
1734       {
1735         KEYCACHE_DBUG_PRINT("unlink_hash: signal",
1736                             ("thread %ld", (ulong) thread->id));
1737         keycache_pthread_cond_signal(&thread->suspend);
1738         unlink_from_queue(&keycache->waiting_for_hash_link, thread);
1739       }
1740     }
1741     while (thread != last_thread);
1742     link_hash(&keycache->hash_root[KEYCACHE_HASH(hash_link->file,
1743 					         hash_link->diskpos)],
1744               hash_link);
1745     return;
1746   }
1747   hash_link->next= keycache->free_hash_list;
1748   keycache->free_hash_list= hash_link;
1749 }
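
/*
  Note on the hand-off above (illustrative summary): when a hash_link is
  released while threads wait in waiting_for_hash_link, it is not put on
  the free list. Instead it is immediately re-keyed to the (file, filepos)
  of the first waiter, re-linked into the proper hash bucket, and all
  waiters asking for exactly that page are woken. Only when nobody waits
  does the hash_link go back to keycache->free_hash_list.
*/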
1750 
1751 
1752 /*
1753   Get the hash link for a page
1754 */
1755 
1756 static HASH_LINK *get_hash_link(SIMPLE_KEY_CACHE_CB *keycache,
1757                                 int file, my_off_t filepos)
1758 {
1759   reg1 HASH_LINK *hash_link, **start;
1760 #if defined(KEYCACHE_DEBUG)
1761   int cnt;
1762 #endif
1763 
1764   KEYCACHE_DBUG_PRINT("get_hash_link", ("fd: %u  pos: %lu",
1765                       (uint) file,(ulong) filepos));
1766 
1767 restart:
1768   /*
1769      Find the bucket in the hash table for the pair (file, filepos);
1770      start contains the head of the bucket list,
1771      hash_link points to the first member of the list
1772   */
1773   hash_link= *(start= &keycache->hash_root[KEYCACHE_HASH(file, filepos)]);
1774 #if defined(KEYCACHE_DEBUG)
1775   cnt= 0;
1776 #endif
1777   /* Look for an element for the pair (file, filepos) in the bucket chain */
1778   while (hash_link &&
1779          (hash_link->diskpos != filepos || hash_link->file != file))
1780   {
1781     hash_link= hash_link->next;
1782 #if defined(KEYCACHE_DEBUG)
1783     cnt++;
1784     if (! (cnt <= keycache->hash_links_used))
1785     {
1786       int i;
1787       for (i=0, hash_link= *start ;
1788            i < cnt ; i++, hash_link= hash_link->next)
1789       {
1790         KEYCACHE_DBUG_PRINT("get_hash_link", ("fd: %u  pos: %lu",
1791             (uint) hash_link->file,(ulong) hash_link->diskpos));
1792       }
1793     }
1794     KEYCACHE_DBUG_ASSERT(cnt <= keycache->hash_links_used);
1795 #endif
1796   }
1797   if (! hash_link)
1798   {
1799     /* There is no hash link in the hash table for the pair (file, filepos) */
1800     if (keycache->free_hash_list)
1801     {
1802       hash_link= keycache->free_hash_list;
1803       keycache->free_hash_list= hash_link->next;
1804     }
1805     else if (keycache->hash_links_used < keycache->hash_links)
1806     {
1807       hash_link= &keycache->hash_link_root[keycache->hash_links_used++];
1808     }
1809     else
1810     {
1811       /* Wait for a free hash link */
1812       struct st_my_thread_var *thread= my_thread_var;
1813       KEYCACHE_PAGE page;
1814       KEYCACHE_DBUG_PRINT("get_hash_link", ("waiting"));
1815       page.file= file;
1816       page.filepos= filepos;
1817       thread->keycache_link= (void *) &page;
1818       link_into_queue(&keycache->waiting_for_hash_link, thread);
1819       KEYCACHE_DBUG_PRINT("get_hash_link: wait",
1820                           ("suspend thread %ld", (ulong) thread->id));
1821       keycache_pthread_cond_wait(&thread->suspend,
1822                                  &keycache->cache_lock);
1823       thread->keycache_link= NULL;
1824       goto restart;
1825     }
1826     hash_link->file= file;
1827     hash_link->diskpos= filepos;
1828     link_hash(start, hash_link);
1829   }
1830   /* Register the request for the page */
1831   hash_link->requests++;
1832 
1833   return hash_link;
1834 }
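
/*
  Typical hash_link life cycle as driven by find_key_block() below (an
  illustrative sketch with a hypothetical helper name; real callers also
  handle blocks, waiting and the resize path):
*/
#if 0 /* illustrative only, never compiled */
static void example_hash_link_lifecycle(SIMPLE_KEY_CACHE_CB *keycache,
                                        int file, my_off_t filepos)
{
  /* Pins (file, filepos): no other thread can reuse this hash_link. */
  HASH_LINK *hash_link= get_hash_link(keycache, file, filepos);

  /* ... attach a block to hash_link, read or write the page ... */

  /* Release the pin; normally done via remove_reader() on the block.
     A hash_link with no requests and no block attached can be given
     back, cf. the resize path in find_key_block(). */
  hash_link->requests--;
  if (!hash_link->requests && !hash_link->block)
    unlink_hash(keycache, hash_link);
}
#endif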
1835 
1836 
1837 /*
1838   Get a block for the file page requested by a keycache read/write operation;
1839   If the page is not in the cache return a free block, if there is none
1840   return the lru block after saving its buffer if the page is dirty.
1841 
1842   SYNOPSIS
1843 
1844     find_key_block()
1845       keycache            pointer to a key cache data structure
1846       file                handler for the file to read page from
1847       filepos             position of the page in the file
1848       init_hits_left      how to initialize the block counter for the page
1849       wrmode              <-> get for writing
1850       page_st        out  {PAGE_READ,PAGE_TO_BE_READ,PAGE_WAIT_TO_BE_READ}
1851 
1852   RETURN VALUE
1853     Pointer to the found block if successful, 0 - otherwise
1854 
1855   NOTES.
1856     For the page from file positioned at filepos the function checks whether
1857     the page is in the key cache specified by the first parameter.
1858     If this is the case it immediately returns the block.
1859     If not, the function first chooses a block for this page. If there are
1860     no unused blocks in the key cache yet, the function takes the block
1861     at the very beginning of the warm sub-chain. If that block is dirty, its
1862     page is saved to disk before the pointer to the block is returned.
1863     The function returns in the page_st parameter the following values:
1864       PAGE_READ             - if the page is already in the block,
1865       PAGE_TO_BE_READ       - if it is yet to be read by the current thread
1866       PAGE_WAIT_TO_BE_READ  - if it is to be read by another thread
1867     If an error occurs the BLOCK_ERROR bit is set in the block status.
1868     It might happen that there are no blocks in the LRU chain (in the warm
1869     part) - all blocks are unlinked for some read/write operations. Then the
1870     function waits until the first of these operations links a block back.
1871 */
1872 
1873 static BLOCK_LINK *find_key_block(SIMPLE_KEY_CACHE_CB *keycache,
1874                                   File file, my_off_t filepos,
1875                                   int init_hits_left,
1876                                   int wrmode, int *page_st)
1877 {
1878   HASH_LINK *hash_link;
1879   BLOCK_LINK *block;
1880   int error= 0;
1881   int page_status;
1882 
1883   DBUG_ENTER("find_key_block");
1884   KEYCACHE_THREAD_TRACE("find_key_block:begin");
1885   DBUG_PRINT("enter", ("fd: %d  pos: %lu  wrmode: %d",
1886                        file, (ulong) filepos, wrmode));
1887   KEYCACHE_DBUG_PRINT("find_key_block", ("fd: %d  pos: %lu  wrmode: %d",
1888                                          file, (ulong) filepos,
1889                                          wrmode));
1890 #if !defined(DBUG_OFF) && defined(EXTRA_DEBUG)
1891   DBUG_EXECUTE("check_keycache2",
1892                test_key_cache(keycache, "start of find_key_block", 0););
1893 #endif
1894 
1895 restart:
1896   /*
1897     If the flush phase of a resize operation fails, the cache is left
1898     unusable. This will be detected only after "goto restart".
1899   */
1900   if (!keycache->can_be_used)
1901     DBUG_RETURN(0);
1902 
1903   /*
1904     Find the hash_link for the requested file block (file, filepos). We
1905     do always get a hash_link here. It has registered our request so
1906     that no other thread can use it for another file block until we
1907     release the request (which is done by remove_reader() usually). The
1908     hash_link can have a block assigned to it or not. If there is a
1909     block, it may be assigned to this hash_link or not. In cases where a
1910     block is evicted from the cache, it is taken from the LRU ring and
1911     referenced by the new hash_link. But the block can still be assigned
1912     to its old hash_link for some time if it needs to be flushed first,
1913     or if there are other threads still reading it.
1914 
1915     Summary:
1916       hash_link is always returned.
1917       hash_link->block can be:
1918       - NULL or
1919       - not assigned to this hash_link or
1920       - assigned to this hash_link. If assigned, the block can have
1921         - invalid data (when freshly assigned) or
1922         - valid data. Valid data can be
1923           - changed over the file contents (dirty) or
1924           - not changed (clean).
1925   */
1926   hash_link= get_hash_link(keycache, file, filepos);
1927   DBUG_ASSERT((hash_link->file == file) && (hash_link->diskpos == filepos));
1928 
1929   page_status= -1;
1930   if ((block= hash_link->block) &&
1931       block->hash_link == hash_link && (block->status & BLOCK_READ))
1932   {
1933     /* Assigned block with valid (changed or unchanged) contents. */
1934     page_status= PAGE_READ;
1935   }
1936   /*
1937     else (page_status == -1)
1938       - block == NULL or
1939       - block not assigned to this hash_link or
1940       - block assigned but not yet read from file (invalid data).
1941   */
1942 
1943   if (keycache->in_resize)
1944   {
1945     /* This is a request during a resize operation */
1946 
1947     if (!block)
1948     {
1949       struct st_my_thread_var *thread;
1950 
1951       /*
1952         The file block is not in the cache. We don't need it in the
1953         cache: we are going to read or write directly to file. Cancel
1954         the request. We can simply decrement hash_link->requests because
1955         we did not release cache_lock since increasing it. So no other
1956         thread can wait for our request to become released.
1957       */
1958       if (hash_link->requests == 1)
1959       {
1960         /*
1961           We are the only one to request this hash_link (this file/pos).
1962           Free the hash_link.
1963         */
1964         hash_link->requests--;
1965         unlink_hash(keycache, hash_link);
1966         DBUG_RETURN(0);
1967       }
1968 
1969       /*
1970         More requests on the hash_link. Someone tries to evict a block
1971         for this hash_link (could have started before resizing started).
1972         This means that the LRU ring is empty. Otherwise a block could
1973         be assigned immediately. Behave like a thread that wants to
1974         evict a block for this file/pos. Add to the queue of threads
1975         waiting for a block. Wait until there is one assigned.
1976 
1977         Refresh the request on the hash-link so that it cannot be reused
1978         for another file/pos.
1979       */
1980       thread= my_thread_var;
1981       thread->keycache_link= (void *) hash_link;
1982       link_into_queue(&keycache->waiting_for_block, thread);
1983       do
1984       {
1985         KEYCACHE_DBUG_PRINT("find_key_block: wait",
1986                             ("suspend thread %ld", (ulong) thread->id));
1987         keycache_pthread_cond_wait(&thread->suspend,
1988                                    &keycache->cache_lock);
1989       } while (thread->next);
1990       thread->keycache_link= NULL;
1991       /*
1992         A block should now be assigned to the hash_link. But it may
1993         still need to be evicted. Anyway, we should re-check the
1994         situation. page_status must be set correctly.
1995       */
1996       hash_link->requests--;
1997       goto restart;
1998     } /* end of if (!block) */
1999 
2000     /*
2001       There is a block for this file/pos in the cache. Register a
2002       request on it. This unlinks it from the LRU ring (if it is there)
2003       and hence protects it against eviction (if not already in
2004       eviction). We need this for returning the block to the caller, for
2005       calling remove_reader() (for debugging purposes), and for calling
2006       free_block(). The only case where we don't need the request is if
2007       the block is in eviction. In that case we have to unregister the
2008       request later.
2009     */
2010     reg_requests(keycache, block, 1);
2011 
2012     if (page_status != PAGE_READ)
2013     {
2014       /*
2015         - block not assigned to this hash_link or
2016         - block assigned but not yet read from file (invalid data).
2017 
2018         This must be a block in eviction. It will be read soon. We need
2019         to wait here until this has happened. Otherwise the caller could
2020         access a wrong block or a block which is being read. While waiting
2021         we cannot lose hash_link nor block. We have registered a request
2022         on the hash_link. Everything can happen to the block but changes
2023         in the hash_link -> block relationship. In other words:
2024         everything can happen to the block but free or another completed
2025         eviction.
2026 
2027         Note that we behave like a secondary requestor here. We just
2028         cannot return with PAGE_WAIT_TO_BE_READ. This would work for
2029         read requests and writes on dirty blocks that are not in flush
2030         only. Waiting here on COND_FOR_REQUESTED works in all
2031         situations.
2032       */
2033       DBUG_ASSERT(((block->hash_link != hash_link) &&
2034                    (block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH))) ||
2035                   ((block->hash_link == hash_link) &&
2036                    !(block->status & BLOCK_READ)));
2037       wait_on_queue(&block->wqueue[COND_FOR_REQUESTED], &keycache->cache_lock);
2038       /*
2039         Here we can trust that the block has been assigned to this
2040         hash_link (block->hash_link == hash_link) and read into the
2041         buffer (BLOCK_READ). The worst thing possible here is that the
2042         block is being freed (BLOCK_REASSIGNED). But the block is still
2043         assigned to the hash_link. The freeing thread waits until we
2044         release our request on the hash_link. The block must not be
2045         again in eviction because we registered a request on it before
2046         starting to wait.
2047       */
2048       DBUG_ASSERT(block->hash_link == hash_link);
2049       DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
2050       DBUG_ASSERT(!(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH)));
2051     }
2052     /*
2053       The block is in the cache. Assigned to the hash_link. Valid data.
2054       Note that in case of page_st == PAGE_READ, the block can be marked
2055       for eviction. In any case it can be marked for freeing.
2056     */
2057 
2058     if (!wrmode)
2059     {
2060       /* A reader can just read the block. */
2061       *page_st= PAGE_READ;
2062       DBUG_ASSERT((hash_link->file == file) &&
2063                   (hash_link->diskpos == filepos) &&
2064                   (block->hash_link == hash_link));
2065       DBUG_RETURN(block);
2066     }
2067 
2068     /*
2069       This is a writer. No two writers for the same block can exist.
2070       This must be assured by locks outside of the key cache.
2071     */
2072     DBUG_ASSERT(!(block->status & BLOCK_FOR_UPDATE) || fail_block(block));
2073 
2074     while (block->status & BLOCK_IN_FLUSH)
2075     {
2076       /*
2077         Wait until the block is flushed to file. Do not release the
2078         request on the hash_link yet to prevent that the block is freed
2079         or reassigned while we wait. While we wait, several things can
2080         happen to the block, including another flush. But the block
2081         cannot be reassigned to another hash_link until we release our
2082         request on it. But it can be marked BLOCK_REASSIGNED from free
2083         or eviction, while they wait for us to release the hash_link.
2084       */
2085       wait_on_queue(&block->wqueue[COND_FOR_SAVED], &keycache->cache_lock);
2086       /*
2087         If the flush phase failed, the resize could have finished while
2088         we waited here.
2089       */
2090       if (!keycache->in_resize)
2091       {
2092         remove_reader(block);
2093         unreg_request(keycache, block, 1);
2094         goto restart;
2095       }
2096       DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
2097       DBUG_ASSERT(!(block->status & BLOCK_FOR_UPDATE) || fail_block(block));
2098       DBUG_ASSERT(block->hash_link == hash_link);
2099     }
2100 
2101     if (block->status & BLOCK_CHANGED)
2102     {
2103       /*
2104         We want to write a block with changed contents. If the cache
2105         block size is bigger than the caller's block size (e.g. MyISAM),
2106         the caller may replace part of the block only. Changes of the
2107         other part of the block must be preserved. Since the block has
2108         not yet been selected for flush, we can still add our changes.
2109       */
2110       *page_st= PAGE_READ;
2111       DBUG_ASSERT((hash_link->file == file) &&
2112                   (hash_link->diskpos == filepos) &&
2113                   (block->hash_link == hash_link));
2114       DBUG_RETURN(block);
2115     }
2116 
2117     /*
2118       This is a write request for a clean block. We do not want to have
2119       new dirty blocks in the cache while resizing. We will free the
2120       block and write directly to file. If the block is in eviction or
2121       in free, we just let it go.
2122 
2123       Unregister from the hash_link. This must be done before freeing
2124       the block. And it must be done if not freeing the block. Because
2125       we could have waited above, we need to call remove_reader(). Other
2126       threads could wait for us to release our request on the hash_link.
2127     */
2128     remove_reader(block);
2129 
2130     /* If the block is not in eviction and not in free, we can free it. */
2131     if (!(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH |
2132                            BLOCK_REASSIGNED)))
2133     {
2134       /*
2135         Free block as we are going to write directly to file.
2136         Although we have an exclusive lock for the updated key part,
2137         the control can be yielded by the current thread as we might
2138         have unfinished readers of other key parts in the block
2139         buffer. Still we are guaranteed not to have any readers
2140         of the key part we are writing into until the block is
2141         removed from the cache as we set the BLOCK_REASSIGNED
2142         flag (see the code below that handles reading requests).
2143       */
2144       free_block(keycache, block);
2145     }
2146     else
2147     {
2148       /*
2149         The block will be evicted/freed soon. Don't touch it in any way.
2150         Unregister the request that we registered above.
2151       */
2152       unreg_request(keycache, block, 1);
2153 
2154       /*
2155         The block is still assigned to the hash_link (the file/pos that
2156         we are going to write to). Wait until the eviction/free is
2157         complete. Otherwise the direct write could complete before all
2158         readers are done with the block. So they could read outdated
2159         data.
2160 
2161         Since we released our request on the hash_link, it can be reused
2162         for another file/pos. Hence we cannot just check for
2163         block->hash_link == hash_link. As long as the resize is
2164         proceeding the block cannot be reassigned to the same file/pos
2165         again. So we can terminate the loop when the block is no longer
2166         assigned to this file/pos.
2167       */
2168       do
2169       {
2170         wait_on_queue(&block->wqueue[COND_FOR_SAVED],
2171                       &keycache->cache_lock);
2172         /*
2173           If the flush phase failed, the resize could have finished
2174           while we waited here.
2175         */
2176         if (!keycache->in_resize)
2177           goto restart;
2178       } while (block->hash_link &&
2179                (block->hash_link->file == file) &&
2180                (block->hash_link->diskpos == filepos));
2181     }
2182     DBUG_RETURN(0);
2183   }
2184 
2185   if (page_status == PAGE_READ &&
2186       (block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH |
2187                         BLOCK_REASSIGNED)))
2188   {
2189     /*
2190       This is a request for a block to be removed from cache. The block
2191       is assigned to this hash_link and contains valid data, but is
2192       marked for eviction or to be freed. Possible reasons why it has
2193       not yet been evicted/freed can be a flush before reassignment
2194       (BLOCK_IN_SWITCH), readers of the block have not finished yet
2195       (BLOCK_REASSIGNED), or the evicting thread did not yet awake after
2196       the block has been selected for it (BLOCK_IN_EVICTION).
2197     */
2198 
2199     KEYCACHE_DBUG_PRINT("find_key_block",
2200                         ("request for old page in block %u "
2201                          "wrmode: %d  block->status: %d",
2202                          BLOCK_NUMBER(block), wrmode, block->status));
2203     /*
2204        Only reading requests can proceed until the old dirty page is flushed,
2205        all others are to be suspended, then resubmitted
2206     */
2207     if (!wrmode && !(block->status & BLOCK_REASSIGNED))
2208     {
2209       /*
2210         This is a read request and the block not yet reassigned. We can
2211         register our request and proceed. This unlinks the block from
2212         the LRU ring and protects it against eviction.
2213       */
2214       reg_requests(keycache, block, 1);
2215     }
2216     else
2217     {
2218       /*
2219         Either this is a write request for a block that is in eviction
2220         or in free. We must not use it any more. Instead we must evict
2221         another block. But we cannot do this before the eviction/free is
2222         done. Otherwise we would find the same hash_link + block again
2223         and again.
2224 
2225         Or this is a read request for a block in eviction/free that does
2226         not require a flush, but waits for readers to finish with the
2227         block. We do not read this block to let the eviction/free happen
2228         as soon as possible. Again we must wait so that we don't find
2229         the same hash_link + block again and again.
2230       */
2231       DBUG_ASSERT(hash_link->requests);
2232       hash_link->requests--;
2233       KEYCACHE_DBUG_PRINT("find_key_block",
2234                           ("request waiting for old page to be saved"));
2235       wait_on_queue(&block->wqueue[COND_FOR_SAVED], &keycache->cache_lock);
2236       KEYCACHE_DBUG_PRINT("find_key_block",
2237                           ("request for old page resubmitted"));
2238       /*
2239         The block is no longer assigned to this hash_link.
2240         Get another one.
2241       */
2242       goto restart;
2243     }
2244   }
2245   else
2246   {
2247     /*
2248       This is a request for a new block or for a block not to be removed.
2249       Either
2250       - block == NULL or
2251       - block not assigned to this hash_link or
2252       - block assigned but not yet read from file,
2253       or
2254       - block assigned with valid (changed or unchanged) data and
2255       - it will not be reassigned/freed.
2256     */
2257     if (! block)
2258     {
2259       /* No block is assigned to the hash_link yet. */
2260       if (keycache->blocks_unused)
2261       {
2262         if (keycache->free_block_list)
2263         {
2264           /* There is a block in the free list. */
2265           block= keycache->free_block_list;
2266           keycache->free_block_list= block->next_used;
2267           block->next_used= NULL;
2268         }
2269         else
2270         {
2271           size_t block_mem_offset;
2272           /* There are some never used blocks, take first of them */
2273           DBUG_ASSERT(keycache->blocks_used <
2274                       (ulong) keycache->disk_blocks);
2275           block= &keycache->block_root[keycache->blocks_used];
2276           block_mem_offset=
2277            ((size_t) keycache->blocks_used) * keycache->key_cache_block_size;
2278           block->buffer= ADD_TO_PTR(keycache->block_mem,
2279                                     block_mem_offset,
2280                                     uchar*);
2281           keycache->blocks_used++;
2282           DBUG_ASSERT(!block->next_used);
2283         }
2284         DBUG_ASSERT(!block->prev_used);
2285         DBUG_ASSERT(!block->next_changed);
2286         DBUG_ASSERT(!block->prev_changed);
2287         DBUG_ASSERT(!block->hash_link);
2288         DBUG_ASSERT(!block->status);
2289         DBUG_ASSERT(!block->requests);
2290         keycache->blocks_unused--;
2291         block->status= BLOCK_IN_USE;
2292         block->length= 0;
2293         block->offset= keycache->key_cache_block_size;
2294         block->requests= 1;
2295         block->temperature= BLOCK_COLD;
2296         block->hits_left= init_hits_left;
2297         block->last_hit_time= 0;
2298         block->hash_link= hash_link;
2299         hash_link->block= block;
2300         link_to_file_list(keycache, block, file, 0);
2301         page_status= PAGE_TO_BE_READ;
2302         KEYCACHE_DBUG_PRINT("find_key_block",
2303                             ("got free or never used block %u",
2304                              BLOCK_NUMBER(block)));
2305       }
2306       else
2307       {
2308 	/*
2309           There are no free blocks and no never used blocks, use a block
2310           from the LRU ring.
2311         */
2312 
2313         if (! keycache->used_last)
2314         {
2315           /*
2316             The LRU ring is empty. Wait until a new block is added to
2317             it. Several threads might wait here for the same hash_link,
2318             all of them must get the same block. While waiting for a
2319             block, after a block is selected for this hash_link, other
2320             threads can run first before this one awakes. During this
2321             time interval other threads find this hash_link pointing to
2322             the block, which is still assigned to another hash_link. In
2323             this case the block is not marked BLOCK_IN_SWITCH yet, but
2324             it is marked BLOCK_IN_EVICTION.
2325           */
2326 
2327           struct st_my_thread_var *thread= my_thread_var;
2328           thread->keycache_link= (void *) hash_link;
2329           link_into_queue(&keycache->waiting_for_block, thread);
2330           do
2331           {
2332             KEYCACHE_DBUG_PRINT("find_key_block: wait",
2333                                 ("suspend thread %ld", (ulong) thread->id));
2334             keycache_pthread_cond_wait(&thread->suspend,
2335                                        &keycache->cache_lock);
2336           }
2337           while (thread->next);
2338           thread->keycache_link= NULL;
2339           /* Assert that block has a request registered. */
2340           DBUG_ASSERT(hash_link->block->requests);
2341           /* Assert that block is not in LRU ring. */
2342           DBUG_ASSERT(!hash_link->block->next_used);
2343           DBUG_ASSERT(!hash_link->block->prev_used);
2344         }
2345         /*
2346           If we waited above, hash_link->block has been assigned by
2347           link_block(). Otherwise it is still NULL. In the latter case
2348           we need to grab a block from the LRU ring ourselves.
2349         */
2350         block= hash_link->block;
2351         if (! block)
2352         {
2353           /* Select the last block from the LRU ring. */
2354           block= keycache->used_last->next_used;
2355           block->hits_left= init_hits_left;
2356           block->last_hit_time= 0;
2357           hash_link->block= block;
2358           /*
2359             Register a request on the block. This unlinks it from the
2360             LRU ring and protects it against eviction.
2361           */
2362           DBUG_ASSERT(!block->requests);
2363           reg_requests(keycache, block,1);
2364           /*
2365             We do not need to set block->status|= BLOCK_IN_EVICTION here
2366             because we will set block->status|= BLOCK_IN_SWITCH
2367             immediately without releasing the lock in between. This does
2368             also support debugging. When looking at the block, one can
2369             see if the block has been selected by link_block() after the
2370             LRU ring was empty, or if it was grabbed directly from the
2371             LRU ring in this branch.
2372           */
2373         }
2374 
2375         /*
2376           If we had to wait above, there is a small chance that another
2377           thread grabbed this block for the same file block already. But
2378           in most cases the first condition is true.
2379         */
2380         if (block->hash_link != hash_link &&
2381 	    ! (block->status & BLOCK_IN_SWITCH) )
2382         {
2383 	  /* this is a primary request for a new page */
2384           block->status|= BLOCK_IN_SWITCH;
2385 
2386           KEYCACHE_DBUG_PRINT("find_key_block",
2387                         ("got block %u for new page", BLOCK_NUMBER(block)));
2388 
2389           if (block->status & BLOCK_CHANGED)
2390           {
2391 	    /* The block contains a dirty page - push it out of the cache */
2392 
2393             KEYCACHE_DBUG_PRINT("find_key_block", ("block is dirty"));
2394             if (block->status & BLOCK_IN_FLUSH)
2395             {
2396               /*
2397                 The block is marked for flush. If we do not wait here,
2398                 it could happen that we write the block, reassign it to
2399                 another file block, then, before the new owner can read
2400                 the new file block, the flusher writes the cache block
2401                 (which still has the old contents) to the new file block!
2402               */
2403               wait_on_queue(&block->wqueue[COND_FOR_SAVED],
2404                             &keycache->cache_lock);
2405               /*
2406                 The block is marked BLOCK_IN_SWITCH. It should be left
2407                 alone except for reading. No free, no write.
2408               */
2409               DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
2410               DBUG_ASSERT(!(block->status & (BLOCK_REASSIGNED |
2411                                              BLOCK_CHANGED |
2412                                              BLOCK_FOR_UPDATE)));
2413             }
2414             else
2415             {
2416               block->status|= BLOCK_IN_FLUSH | BLOCK_IN_FLUSHWRITE;
2417               /*
2418                 BLOCK_IN_EVICTION may be true or not. Other flags must
2419                 have a fixed value.
2420               */
2421               DBUG_ASSERT((block->status & ~BLOCK_IN_EVICTION) ==
2422                           (BLOCK_READ | BLOCK_IN_SWITCH |
2423                            BLOCK_IN_FLUSH | BLOCK_IN_FLUSHWRITE |
2424                            BLOCK_CHANGED | BLOCK_IN_USE));
2425               DBUG_ASSERT(block->hash_link);
2426 
2427               keycache_pthread_mutex_unlock(&keycache->cache_lock);
2428               /*
2429                 The call is thread safe because only the current
2430                 thread might change the block->hash_link value
2431               */
2432               error= (int)my_pwrite(block->hash_link->file,
2433                                block->buffer + block->offset,
2434                                block->length - block->offset,
2435                                block->hash_link->diskpos + block->offset,
2436                                MYF(MY_NABP | MY_WAIT_IF_FULL));
2437               keycache_pthread_mutex_lock(&keycache->cache_lock);
2438 
2439               /* Block status must not have changed. */
2440               DBUG_ASSERT((block->status & ~BLOCK_IN_EVICTION) ==
2441                           (BLOCK_READ | BLOCK_IN_SWITCH |
2442                            BLOCK_IN_FLUSH | BLOCK_IN_FLUSHWRITE |
2443                            BLOCK_CHANGED | BLOCK_IN_USE) || fail_block(block));
2444               keycache->global_cache_write++;
2445             }
2446           }
2447 
2448           block->status|= BLOCK_REASSIGNED;
2449           /*
2450             The block comes from the LRU ring. It must have a hash_link
2451             assigned.
2452           */
2453           DBUG_ASSERT(block->hash_link);
2454           if (block->hash_link)
2455           {
2456             /*
2457               All pending requests for this page must be resubmitted.
2458               This must be done before waiting for readers. They could
2459               wait for the flush to complete. And we must also do it
2460               after the wait. Flushers might try to free the block while
2461               we wait. They would wait until the reassignment is
2462               complete. Also the block status must reflect the correct
2463               situation: The block is not changed nor in flush any more.
2464               Note that we must not change the BLOCK_CHANGED flag
2465               outside of link_to_file_list() so that it is always in the
2466               correct queue and the *blocks_changed counters are
2467               correct.
2468             */
2469             block->status&= ~(BLOCK_IN_FLUSH | BLOCK_IN_FLUSHWRITE);
2470             link_to_file_list(keycache, block, block->hash_link->file, 1);
2471             release_whole_queue(&block->wqueue[COND_FOR_SAVED]);
2472             /*
2473               The block is still assigned to its old hash_link.
2474 	      Wait until all pending read requests
2475 	      for this page are executed
2476 	      (we could have avoided this waiting, if we had read
2477 	      a page in the cache in a sweep, without yielding control)
2478             */
2479             wait_for_readers(keycache, block);
2480             DBUG_ASSERT(block->hash_link && block->hash_link->block == block &&
2481                         block->prev_changed);
2482             /* The reader must not have been a writer. */
2483             DBUG_ASSERT(!(block->status & BLOCK_CHANGED));
2484 
2485             /* Wake flushers that might have found the block in between. */
2486             release_whole_queue(&block->wqueue[COND_FOR_SAVED]);
2487 
2488             /* Remove the hash link for the old file block from the hash. */
2489             unlink_hash(keycache, block->hash_link);
2490 
2491             /*
2492               For sanity checks link_to_file_list() asserts that block
2493               and hash_link refer to each other. Hence we need to assign
2494               the hash_link first, but then we would not know if it was
2495               linked before. Hence we would not know if to unlink it. So
2496               unlink it here and call link_to_file_list(..., FALSE).
2497             */
2498             unlink_changed(block);
2499           }
2500           block->status= error ? BLOCK_ERROR : BLOCK_IN_USE ;
2501           block->length= 0;
2502           block->offset= keycache->key_cache_block_size;
2503           block->hash_link= hash_link;
2504           link_to_file_list(keycache, block, file, 0);
2505           page_status= PAGE_TO_BE_READ;
2506 
2507           KEYCACHE_DBUG_ASSERT(block->hash_link->block == block);
2508           KEYCACHE_DBUG_ASSERT(hash_link->block->hash_link == hash_link);
2509         }
2510         else
2511         {
2512           /*
2513             Either (block->hash_link == hash_link),
2514 	    or     (block->status & BLOCK_IN_SWITCH).
2515 
2516             This is for secondary requests for a new file block only.
2517             Either it is already assigned to the new hash_link meanwhile
2518             (if we had to wait due to empty LRU), or it is already in
2519             eviction by another thread. Since this block has been
2520             grabbed from the LRU ring and attached to this hash_link,
2521             another thread cannot grab the same block from the LRU ring
2522             anymore. If the block is in eviction already, it must become
2523             attached to the same hash_link and as such destined for the
2524             same file block.
2525           */
2526           KEYCACHE_DBUG_PRINT("find_key_block",
2527                               ("block->hash_link: %p  hash_link: %p  "
2528                                "block->status: %u", block->hash_link,
2529                                hash_link, block->status ));
2530           page_status= (((block->hash_link == hash_link) &&
2531                          (block->status & BLOCK_READ)) ?
2532                         PAGE_READ : PAGE_WAIT_TO_BE_READ);
2533         }
2534       }
2535     }
2536     else
2537     {
2538       /*
2539         Block is not NULL. This hash_link points to a block.
2540         Either
2541         - block not assigned to this hash_link (yet) or
2542         - block assigned but not yet read from file,
2543         or
2544         - block assigned with valid (changed or unchanged) data and
2545         - it will not be reassigned/freed.
2546 
2547         The first condition means hash_link points to a block in
2548         eviction. This is not necessarily marked by BLOCK_IN_SWITCH yet.
2549         But then it is marked BLOCK_IN_EVICTION. See the NOTE in
2550         link_block(). In both cases it is destined for this hash_link
2551         and its file block address. When this hash_link got its block
2552         address, the block was removed from the LRU ring and cannot be
2553         selected for eviction (for another hash_link) again.
2554 
2555         Register a request on the block. This is another protection
2556         against eviction.
2557       */
2558       DBUG_ASSERT(((block->hash_link != hash_link) &&
2559                    (block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH))) ||
2560                   ((block->hash_link == hash_link) &&
2561                    !(block->status & BLOCK_READ)) ||
2562                   ((block->status & BLOCK_READ) &&
2563                    !(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH))));
2564       reg_requests(keycache, block, 1);
2565       KEYCACHE_DBUG_PRINT("find_key_block",
2566                           ("block->hash_link: %p  hash_link: %p  "
2567                            "block->status: %u", block->hash_link,
2568                            hash_link, block->status ));
2569       page_status= (((block->hash_link == hash_link) &&
2570                      (block->status & BLOCK_READ)) ?
2571                     PAGE_READ : PAGE_WAIT_TO_BE_READ);
2572     }
2573   }
2574 
2575   KEYCACHE_DBUG_ASSERT(page_status != -1);
2576   /* Same assert basically, but be very sure. */
2577   KEYCACHE_DBUG_ASSERT(block);
2578   /* Assert that block has a request and is not in LRU ring. */
2579   DBUG_ASSERT(block->requests);
2580   DBUG_ASSERT(!block->next_used);
2581   DBUG_ASSERT(!block->prev_used);
2582   /* Assert that we return the correct block. */
2583   DBUG_ASSERT((page_status == PAGE_WAIT_TO_BE_READ) ||
2584               ((block->hash_link->file == file) &&
2585                (block->hash_link->diskpos == filepos)));
2586   *page_st=page_status;
2587   KEYCACHE_DBUG_PRINT("find_key_block",
2588                       ("fd: %d  pos: %lu  block->status: %u  page_status: %d",
2589                        file, (ulong) filepos, block->status,
2590                        page_status));
2591 
2592 #if !defined(DBUG_OFF) && defined(EXTRA_DEBUG)
2593   DBUG_EXECUTE("check_keycache2",
2594                test_key_cache(keycache, "end of find_key_block",0););
2595 #endif
2596   KEYCACHE_THREAD_TRACE("find_key_block:end");
2597   DBUG_RETURN(block);
2598 }
2599 
2600 
2601 /*
2602   Read into a key cache block buffer from disk.
2603 
2604   SYNOPSIS
2605 
2606     read_block_{primary|secondary}()
2607       keycache            pointer to a key cache data structure
2608       block               block to which buffer the data is to be read
2609       read_length         size of data to be read
2610       min_length          at least this much data must be read
2611 
2612   RETURN VALUE
2613     None
2614 
2615   NOTES.
2616     The function either reads page data from the file into the block buffer,
2617     or waits until another thread reads it. Which page to read is determined
2618     by the block parameter - via the hash link referencing this page.
2619     If an error occurs the BLOCK_ERROR bit is set in the block status.
2620     We do not report an error when the size of the successfully read
2621     portion is less than read_length, but not less than min_length.
2622 */
2623 
2624 static void read_block_primary(SIMPLE_KEY_CACHE_CB *keycache,
2625                                BLOCK_LINK *block, uint read_length,
2626                                uint min_length)
2627 {
2628   size_t got_length;
2629 
2630   /* On entry cache_lock is locked */
2631 
2632   KEYCACHE_THREAD_TRACE("read_block_primary");
2633 
2634   /*
2635     This code is executed only by threads that submitted primary
2636     requests. Until block->status contains BLOCK_READ, all other
2637     requests for the block become secondary requests. For a primary
2638     request the block must be properly initialized.
2639   */
2640   DBUG_ASSERT(((block->status & ~BLOCK_FOR_UPDATE) == BLOCK_IN_USE) ||
2641               fail_block(block));
2642   DBUG_ASSERT((block->length == 0) || fail_block(block));
2643   DBUG_ASSERT((block->offset == keycache->key_cache_block_size) ||
2644               fail_block(block));
2645   DBUG_ASSERT((block->requests > 0) || fail_block(block));
2646 
2647   KEYCACHE_DBUG_PRINT("read_block_primary",
2648                       ("page to be read by primary request"));
2649 
2650   keycache->global_cache_read++;
2651   /* Page is not in buffer yet, is to be read from disk */
2652   keycache_pthread_mutex_unlock(&keycache->cache_lock);
2653   /*
2654     Here other threads may step in and register as secondary readers.
2655     They will register in block->wqueue[COND_FOR_REQUESTED].
2656   */
2657   got_length= my_pread(block->hash_link->file, block->buffer,
2658                        read_length, block->hash_link->diskpos, MYF(0));
2659   keycache_pthread_mutex_lock(&keycache->cache_lock);
2660   /*
    The block may by now have been marked for freeing (in case of
    FLUSH_RELEASE). Otherwise the state must be unchanged.
2663   */
2664   DBUG_ASSERT(((block->status & ~(BLOCK_REASSIGNED |
2665                                   BLOCK_FOR_UPDATE)) == BLOCK_IN_USE) ||
2666               fail_block(block));
2667   DBUG_ASSERT((block->length == 0) || fail_block(block));
2668   DBUG_ASSERT((block->offset == keycache->key_cache_block_size) ||
2669               fail_block(block));
2670   DBUG_ASSERT((block->requests > 0) || fail_block(block));
2671 
2672   if (got_length < min_length)
2673     block->status|= BLOCK_ERROR;
2674   else
2675   {
2676     block->status|= BLOCK_READ;
2677     block->length= (uint)got_length;
2678     /*
2679       Do not set block->offset here. If this block is marked
2680       BLOCK_CHANGED later, we want to flush only the modified part. So
2681       only a writer may set block->offset down from
2682       keycache->key_cache_block_size.
2683     */
2684   }
2685   KEYCACHE_DBUG_PRINT("read_block_primary",
2686                       ("primary request: new page in cache"));
2687   /* Signal that all pending requests for this page now can be processed */
2688   release_whole_queue(&block->wqueue[COND_FOR_REQUESTED]);
2689 
2690   DBUG_ASSERT(keycache->can_be_used);
2691 }
2692 
2693 
2694 static void read_block_secondary(SIMPLE_KEY_CACHE_CB *keycache,
2695                                  BLOCK_LINK *block)
2696 {
2697   KEYCACHE_THREAD_TRACE("read_block_secondary");
2698 
2699   /*
2700     This code is executed only by threads that submitted secondary
2701     requests. At this point it could happen that the cache block is
2702     not yet assigned to the hash_link for the requested file block.
    But when we awake from the wait this should be the case. Unfortunately
2704     we cannot assert this here because we do not know the hash_link
2705     for the requested file block nor the file and position. So we have
2706     to assert this in the caller.
2707   */
2708   KEYCACHE_DBUG_PRINT("read_block_secondary",
2709                       ("secondary request waiting for new page to be read"));
2710 
2711   wait_on_queue(&block->wqueue[COND_FOR_REQUESTED], &keycache->cache_lock);
2712 
2713   KEYCACHE_DBUG_PRINT("read_block_secondary",
2714                       ("secondary request: new page in cache"));
2715 
2716   DBUG_ASSERT(keycache->can_be_used);
2717   DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
2718 }
2719 
2720 
2721 /*
2722   Read a block of data from a simple key cache into a buffer
2723 
2724   SYNOPSIS
2725 
2726     simple_key_cache_read()
2727     keycache            pointer to the control block of a simple key cache
2728     file                handler for the file for the block of data to be read
2729     filepos             position of the block of data in the file
2730     level               determines the weight of the data
2731     buff                buffer to where the data must be placed
2732     length              length of the buffer
2733     block_length        length of the read data from a key cache block
2734     return_buffer       return pointer to the key cache buffer with the data
2735 
2736   DESCRIPTION
2737     This function is the implementation of the key_cache_read interface
2738     function that is employed by simple (non-partitioned) key caches.
2739     The function takes the parameter keycache as a pointer to the
2740     control block structure of the type SIMPLE_KEY_CACHE_CB for a simple key
2741     cache.
2742     In a general case the function reads a block of data from the key cache
2743     into the buffer buff of the size specified by the parameter length. The
2744     beginning of the  block of data to be read is specified by the parameters
2745     file and filepos. The length of the read data is the same as the length
2746     of the buffer. The data is read into the buffer in key_cache_block_size
2747     increments. If the next portion of the data is not found in any key cache
2748     block, first it is read from file into the key cache.
    If the parameter return_buffer is not ignored and its value is TRUE, and
    the data to be read, of the size specified by block_length, can be read
    from one key cache buffer, then the function returns a pointer to the
    data in the key cache buffer.
    The function takes the parameters block_length and return_buffer into
    account only in a single-threaded environment.
2755     The parameter 'level' is used only by the midpoint insertion strategy
2756     when the data or its portion cannot be found in the key cache.
2757 
2758   RETURN VALUE
    Returns the address where the data was placed if successful, 0 otherwise.
2760 
2761   NOTES
2762     Filepos must be a multiple of 'block_length', but it doesn't
    have to be a multiple of key_cache_block_size.
2764 */
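
/*
  Usage sketch (illustrative only, not taken from the server sources; the
  buffer name and the literal sizes are assumptions): reading one page of
  an index file through the cache into a caller-supplied buffer, with 3 as
  an arbitrary level for the midpoint insertion strategy:

    uchar buff[1024];            1024 assumed equal to key_cache_block_size
    if (!simple_key_cache_read(keycache, file, filepos, 3,
                               buff, sizeof(buff), sizeof(buff), 0))
      ... handle the read error ...
*/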
2765 
2766 uchar *simple_key_cache_read(SIMPLE_KEY_CACHE_CB *keycache,
2767                              File file, my_off_t filepos, int level,
2768                              uchar *buff, uint length,
2769                              uint block_length __attribute__((unused)),
2770                              int return_buffer __attribute__((unused)))
2771 {
2772   my_bool locked_and_incremented= FALSE;
2773   int error=0;
2774   uchar *start= buff;
2775   DBUG_ENTER("simple_key_cache_read");
2776   DBUG_PRINT("enter", ("fd: %u  pos: %lu  length: %u",
2777                (uint) file, (ulong) filepos, length));
2778 
2779   if (keycache->key_cache_inited)
2780   {
2781     /* Key cache is used */
2782     reg1 BLOCK_LINK *block;
2783     uint read_length;
2784     uint offset;
2785     int page_st;
2786 
2787     if (MYSQL_KEYCACHE_READ_START_ENABLED())
2788     {
2789       MYSQL_KEYCACHE_READ_START(my_filename(file), length,
2790                                 (ulong) (keycache->blocks_used *
2791                                          keycache->key_cache_block_size),
2792                                 (ulong) (keycache->blocks_unused *
2793                                          keycache->key_cache_block_size));
2794     }
2795 
2796     /*
      Once the key cache is initialized, we use the cache_lock to
2798       reliably distinguish the cases of normal operation, resizing, and
2799       disabled cache. We always increment and decrement
2800       'cnt_for_resize_op' so that a resizer can wait for pending I/O.
2801     */
2802     keycache_pthread_mutex_lock(&keycache->cache_lock);
2803     /*
2804       Cache resizing has two phases: Flushing and re-initializing. In
2805       the flush phase read requests are allowed to bypass the cache for
2806       blocks not in the cache. find_key_block() returns NULL in this
2807       case.
2808 
2809       After the flush phase new I/O requests must wait until the
2810       re-initialization is done. The re-initialization can be done only
2811       if no I/O request is in progress. The reason is that
2812       key_cache_block_size can change. With enabled cache, I/O is done
2813       in chunks of key_cache_block_size. Every chunk tries to use a
2814       cache block first. If the block size changes in the middle, a
2815       block could be missed and old data could be read.
2816     */
2817     while (keycache->in_resize && !keycache->resize_in_flush)
2818       wait_on_queue(&keycache->resize_queue, &keycache->cache_lock);
2819     /* Register the I/O for the next resize. */
2820     inc_counter_for_resize_op(keycache);
2821     locked_and_incremented= TRUE;
2822     /* Requested data may not always be aligned to cache blocks. */
2823     offset= (uint) (filepos % keycache->key_cache_block_size);
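    /*
      Worked example with illustrative numbers: if key_cache_block_size is
      1024 and filepos is 2500, then offset= 2500 % 1024 = 452. The first
      loop iteration below starts at the block boundary 2048 and copies at
      most 1024 - 452 = 572 bytes into buff; all following iterations are
      block-aligned (offset= 0).
    */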
2824     /* Read data in key_cache_block_size increments */
2825     do
2826     {
2827       /* Cache could be disabled in a later iteration. */
2828       if (!keycache->can_be_used)
2829       {
2830         KEYCACHE_DBUG_PRINT("key_cache_read", ("keycache cannot be used"));
2831         goto no_key_cache;
2832       }
2833       /* Start reading at the beginning of the cache block. */
2834       filepos-= offset;
2835       /* Do not read beyond the end of the cache block. */
2836       read_length= length;
2837       set_if_smaller(read_length, keycache->key_cache_block_size-offset);
2838       KEYCACHE_DBUG_ASSERT(read_length > 0);
2839 
2840       /* Request the cache block that matches file/pos. */
2841       keycache->global_cache_r_requests++;
2842 
2843       MYSQL_KEYCACHE_READ_BLOCK(keycache->key_cache_block_size);
2844 
2845       block=find_key_block(keycache, file, filepos, level, 0, &page_st);
2846       if (!block)
2847       {
2848         /*
2849           This happens only for requests submitted during key cache
2850           resize. The block is not in the cache and shall not go in.
2851           Read directly from file.
2852         */
2853         keycache->global_cache_read++;
2854         keycache_pthread_mutex_unlock(&keycache->cache_lock);
2855         error= (my_pread(file, (uchar*) buff, read_length,
2856                          filepos + offset, MYF(MY_NABP)) != 0);
2857         keycache_pthread_mutex_lock(&keycache->cache_lock);
2858         goto next_block;
2859       }
2860       if (!(block->status & BLOCK_ERROR))
2861       {
2862         if (page_st == PAGE_TO_BE_READ)
2863         {
2864           MYSQL_KEYCACHE_READ_MISS();
2865           read_block_primary(keycache, block,
2866                      keycache->key_cache_block_size, read_length+offset);
2867         }
2868         else if (page_st == PAGE_WAIT_TO_BE_READ)
2869         {
2870           MYSQL_KEYCACHE_READ_MISS();
2871           /* The requested page is to be read into the block buffer */
2872           read_block_secondary(keycache, block);
2873 
2874           /*
2875             A secondary request must now have the block assigned to the
2876             requested file block.
2877           */
2878           DBUG_ASSERT(block->hash_link->file == file);
2879           DBUG_ASSERT(block->hash_link->diskpos == filepos);
2880         }
2881         else if (block->length < read_length + offset)
2882         {
2883           /*
2884             Impossible if nothing goes wrong:
2885             this could only happen if we are using a file with
2886             small key blocks and are trying to read outside the file
2887           */
2888           my_errno= -1;
2889           block->status|= BLOCK_ERROR;
2890         }
2891         else
2892         {
2893           MYSQL_KEYCACHE_READ_HIT();
2894         }
2895       }
2896 
      /* BLOCK_ERROR may have been set in block->status in the above 'if'. */
2898       if (!(block->status & BLOCK_ERROR))
2899       {
2900         {
2901           DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
2902 #if !defined(SERIALIZED_READ_FROM_CACHE)
2903           keycache_pthread_mutex_unlock(&keycache->cache_lock);
2904 #endif
2905 
2906           /* Copy data from the cache buffer */
2907           memcpy(buff, block->buffer+offset, (size_t) read_length);
2908 
2909 #if !defined(SERIALIZED_READ_FROM_CACHE)
2910           keycache_pthread_mutex_lock(&keycache->cache_lock);
2911           DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
2912 #endif
2913         }
2914       }
2915 
2916       remove_reader(block);
2917 
2918       /* Error injection for coverage testing. */
2919       DBUG_EXECUTE_IF("key_cache_read_block_error",
2920                       block->status|= BLOCK_ERROR;);
2921 
2922       /* Do not link erroneous blocks into the LRU ring, but free them. */
2923       if (!(block->status & BLOCK_ERROR))
2924       {
2925         /*
2926           Link the block into the LRU ring if it's the last submitted
2927           request for the block. This enables eviction for the block.
2928         */
2929         unreg_request(keycache, block, 1);
2930       }
2931       else
2932       {
2933         free_block(keycache, block);
2934         error= 1;
2935         break;
2936       }
2937 
2938     next_block:
2939       buff+= read_length;
2940       filepos+= read_length+offset;
2941       offset= 0;
2942 
2943     } while ((length-= read_length));
2944     if (MYSQL_KEYCACHE_READ_DONE_ENABLED())
2945     {
2946       MYSQL_KEYCACHE_READ_DONE((ulong) (keycache->blocks_used *
2947                                         keycache->key_cache_block_size),
2948                                (ulong) (keycache->blocks_unused *
2949                                         keycache->key_cache_block_size));
2950     }
2951     goto end;
2952   }
2953   KEYCACHE_DBUG_PRINT("key_cache_read", ("keycache not initialized"));
2954 
2955 no_key_cache:
2956   /* Key cache is not used */
2957 
2958   keycache->global_cache_r_requests++;
2959   keycache->global_cache_read++;
2960 
2961   if (locked_and_incremented)
2962     keycache_pthread_mutex_unlock(&keycache->cache_lock);
2963   if (my_pread(file, (uchar*) buff, length, filepos, MYF(MY_NABP)))
2964     error= 1;
2965   if (locked_and_incremented)
2966     keycache_pthread_mutex_lock(&keycache->cache_lock);
2967 
2968 end:
2969   if (locked_and_incremented)
2970   {
2971     dec_counter_for_resize_op(keycache);
2972     keycache_pthread_mutex_unlock(&keycache->cache_lock);
2973   }
2974   DBUG_PRINT("exit", ("error: %d", error ));
2975   DBUG_RETURN(error ? (uchar*) 0 : start);
2976 }
2977 
2978 
2979 /*
2980   Insert a block of file data from a buffer into a simple key cache
2981 
2982   SYNOPSIS
2983     simple_key_cache_insert()
2984     keycache            pointer to the control block of a simple key cache
2985     file                handler for the file to insert data from
2986     filepos             position of the block of data in the file to insert
2987     level               determines the weight of the data
2988     buff                buffer to read data from
2989     length              length of the data in the buffer
2990 
2991   DESCRIPTION
2992     This function is the implementation of the key_cache_insert interface
2993     function that is employed by simple (non-partitioned) key caches.
2994     The function takes the parameter keycache as a pointer to the
2995     control block structure of the type SIMPLE_KEY_CACHE_CB for a simple key
2996     cache.
2997     The function writes a block of file data from a buffer into the key cache.
2998     The buffer is specified with the parameters buff and length - the pointer
2999     to the beginning of the buffer and its size respectively. It's assumed
    the buffer contains the data read from 'file' starting at the position
3001     filepos. The data is copied from the buffer in key_cache_block_size
3002     increments.
3003     The parameter level is used to set one characteristic for the key buffers
3004     loaded with the data from buff. The characteristic is used only by the
3005     midpoint insertion strategy.
3006 
3007   RETURN VALUE
    0 on success, 1 otherwise.
3009 
3010   NOTES
    The function is used by MyISAM to move all blocks from an index file to
    the key cache. This can be done in parallel with other threads reading
    the file data from the key buffers.
3014 
3015 */
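
/*
  Usage sketch (illustrative only; the buffer name and the literal sizes
  are assumptions): a preloader such as the code behind LOAD INDEX INTO
  CACHE reads an index page from the file itself and then hands it to the
  cache:

    uchar buff[1024];            1024 assumed equal to key_cache_block_size
    if (!my_pread(file, buff, sizeof(buff), filepos, MYF(MY_NABP)))
      simple_key_cache_insert(keycache, file, filepos, 3, buff,
                              sizeof(buff));
*/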
3016 
3017 static
3018 int simple_key_cache_insert(SIMPLE_KEY_CACHE_CB *keycache,
3019                             File file, my_off_t filepos, int level,
3020                             uchar *buff, uint length)
3021 {
3022   int error= 0;
3023   DBUG_ENTER("key_cache_insert");
3024   DBUG_PRINT("enter", ("fd: %u  pos: %lu  length: %u",
3025                (uint) file,(ulong) filepos, length));
3026 
3027   if (keycache->key_cache_inited)
3028   {
3029     /* Key cache is used */
3030     reg1 BLOCK_LINK *block;
3031     uint read_length;
3032     uint offset;
3033     int page_st;
3034     my_bool locked_and_incremented= FALSE;
3035 
3036     /*
      Once the keycache is initialized, we use the cache_lock to
3038       reliably distinguish the cases of normal operation, resizing, and
3039       disabled cache. We always increment and decrement
3040       'cnt_for_resize_op' so that a resizer can wait for pending I/O.
3041     */
3042     keycache_pthread_mutex_lock(&keycache->cache_lock);
3043     /*
3044       We do not load index data into a disabled cache nor into an
3045       ongoing resize.
3046     */
3047     if (!keycache->can_be_used || keycache->in_resize)
3048 	goto no_key_cache;
3049     /* Register the pseudo I/O for the next resize. */
3050     inc_counter_for_resize_op(keycache);
3051     locked_and_incremented= TRUE;
3052     /* Loaded data may not always be aligned to cache blocks. */
3053     offset= (uint) (filepos % keycache->key_cache_block_size);
3054     /* Load data in key_cache_block_size increments. */
3055     do
3056     {
3057       /* Cache could be disabled or resizing in a later iteration. */
3058       if (!keycache->can_be_used || keycache->in_resize)
3059 	goto no_key_cache;
3060       /* Start loading at the beginning of the cache block. */
3061       filepos-= offset;
3062       /* Do not load beyond the end of the cache block. */
3063       read_length= length;
3064       set_if_smaller(read_length, keycache->key_cache_block_size-offset);
3065       KEYCACHE_DBUG_ASSERT(read_length > 0);
3066 
3067       /* The block has been read by the caller already. */
3068       keycache->global_cache_read++;
3069       /* Request the cache block that matches file/pos. */
3070       keycache->global_cache_r_requests++;
3071       block= find_key_block(keycache, file, filepos, level, 0, &page_st);
3072       if (!block)
3073       {
3074         /*
3075           This happens only for requests submitted during key cache
3076           resize. The block is not in the cache and shall not go in.
3077           Stop loading index data.
3078         */
3079         goto no_key_cache;
3080       }
3081       if (!(block->status & BLOCK_ERROR))
3082       {
3083         if (page_st == PAGE_WAIT_TO_BE_READ)
3084         {
3085           /*
            This is a secondary request for a block to be read into the
            cache. The block is in eviction. It is not yet assigned to
            the requested file block (it does not point to the right
            hash_link). So we cannot call remove_reader() on the block.
3090             And we cannot access the hash_link directly here. We need to
3091             wait until the assignment is complete. read_block_secondary()
3092             executes the correct wait.
3093           */
3094           read_block_secondary(keycache, block);
3095 
3096           /*
3097             A secondary request must now have the block assigned to the
3098             requested file block.
3099           */
3100           DBUG_ASSERT(block->hash_link->file == file);
3101           DBUG_ASSERT(block->hash_link->diskpos == filepos);
3102         }
3103         else if (page_st == PAGE_TO_BE_READ &&
3104                  (offset || (read_length < keycache->key_cache_block_size)))
3105         {
3106           /*
            This is a primary request for a block to be read into the
3108             cache and the supplied data does not fill the whole block.
3109 
3110             This function is called on behalf of a LOAD INDEX INTO CACHE
3111             statement, which is a read-only task and allows other
3112             readers. It is possible that a parallel running reader tries
3113             to access this block. If it needs more data than has been
3114             supplied here, it would report an error. To be sure that we
3115             have all data in the block that is available in the file, we
3116             read the block ourselves.
3117 
            Though reading again what the caller has already read is an
3119             expensive operation, we need to do this for correctness.
3120           */
3121           read_block_primary(keycache, block, keycache->key_cache_block_size,
3122                              read_length + offset);
3123         }
3124         else if (page_st == PAGE_TO_BE_READ)
3125         {
3126           /*
3127             This is a new block in the cache. If we come here, we have
3128             data for the whole block.
3129           */
3130           DBUG_ASSERT(block->hash_link->requests);
3131           DBUG_ASSERT(block->status & BLOCK_IN_USE);
3132           DBUG_ASSERT((page_st == PAGE_TO_BE_READ) ||
3133                       (block->status & BLOCK_READ));
3134 
3135 #if !defined(SERIALIZED_READ_FROM_CACHE)
3136           keycache_pthread_mutex_unlock(&keycache->cache_lock);
3137           /*
3138             Here other threads may step in and register as secondary readers.
3139             They will register in block->wqueue[COND_FOR_REQUESTED].
3140           */
3141 #endif
3142 
3143           /* Copy data from buff */
3144           memcpy(block->buffer+offset, buff, (size_t) read_length);
3145 
3146 #if !defined(SERIALIZED_READ_FROM_CACHE)
3147           keycache_pthread_mutex_lock(&keycache->cache_lock);
3148           DBUG_ASSERT(block->status & BLOCK_IN_USE);
3149           DBUG_ASSERT((page_st == PAGE_TO_BE_READ) ||
3150                       (block->status & BLOCK_READ));
3151 #endif
3152           /*
3153             After the data is in the buffer, we can declare the block
3154             valid. Now other threads do not need to register as
3155             secondary readers any more. They can immediately access the
3156             block.
3157           */
3158           block->status|= BLOCK_READ;
3159           block->length= read_length+offset;
3160           /*
3161             Do not set block->offset here. If this block is marked
3162             BLOCK_CHANGED later, we want to flush only the modified part. So
3163             only a writer may set block->offset down from
3164             keycache->key_cache_block_size.
3165           */
3166           KEYCACHE_DBUG_PRINT("key_cache_insert",
3167                               ("primary request: new page in cache"));
3168           /* Signal all pending requests. */
3169           release_whole_queue(&block->wqueue[COND_FOR_REQUESTED]);
3170         }
3171         else
3172         {
3173           /*
3174             page_st == PAGE_READ. The block is in the buffer. All data
3175             must already be present. Blocks are always read with all
3176             data available on file. Assert that the block does not have
            less content than the preloader supplies. If the caller has
            data beyond block->length, it means that a file write has
            been done while this block was in the cache and the block was
            not extended with the new data. In that case we can simply
            ignore the block.
3182           */
3183           DBUG_ASSERT((page_st == PAGE_READ) &&
3184                       (read_length + offset <= block->length));
3185         }
3186 
3187         /*
3188           A secondary request must now have the block assigned to the
3189           requested file block. It does not hurt to check it for primary
3190           requests too.
3191         */
3192         DBUG_ASSERT(block->hash_link->file == file);
3193         DBUG_ASSERT(block->hash_link->diskpos == filepos);
3194         DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
3195       } /* end of if (!(block->status & BLOCK_ERROR)) */
3196 
3197       remove_reader(block);
3198 
3199       /* Error injection for coverage testing. */
3200       DBUG_EXECUTE_IF("key_cache_insert_block_error",
3201                       block->status|= BLOCK_ERROR; errno=EIO;);
3202 
3203       /* Do not link erroneous blocks into the LRU ring, but free them. */
3204       if (!(block->status & BLOCK_ERROR))
3205       {
3206         /*
3207           Link the block into the LRU ring if it's the last submitted
3208           request for the block. This enables eviction for the block.
3209         */
3210         unreg_request(keycache, block, 1);
3211       }
3212       else
3213       {
3214         free_block(keycache, block);
3215         error= 1;
3216         break;
3217       }
3218 
3219       buff+= read_length;
3220       filepos+= read_length+offset;
3221       offset= 0;
3222 
3223     } while ((length-= read_length));
3224 
3225   no_key_cache:
3226     if (locked_and_incremented)
3227       dec_counter_for_resize_op(keycache);
3228     keycache_pthread_mutex_unlock(&keycache->cache_lock);
3229   }
3230   DBUG_RETURN(error);
3231 }
3232 
3233 
3234 /*
3235   Write a buffer into a simple key cache
3236 
3237   SYNOPSIS
3238 
3239     simple_key_cache_write()
3240     keycache            pointer to the control block of a simple key cache
3241     file                handler for the file to write data to
    file_extra          bitmap of the key cache partitions that may
                        contain dirty pages from the file
3244     filepos             position in the file to write data to
3245     level               determines the weight of the data
3246     buff                buffer with the data
3247     length              length of the buffer
    dont_write          if it is 0, then all dirty pages involved in the write
                        should have been flushed from the key cache
3250 
3251   DESCRIPTION
3252     This function is the implementation of the key_cache_write interface
3253     function that is employed by simple (non-partitioned) key caches.
3254     The function takes the parameter keycache as a pointer to the
3255     control block structure of the type SIMPLE_KEY_CACHE_CB for a simple key
3256     cache.
3257     In a general case the function copies data from a buffer into the key
3258     cache. The buffer is specified with the parameters buff and length -
3259     the pointer to the beginning of the buffer and its size respectively.
3260     It's assumed the buffer contains the data to be written into 'file'
3261     starting from the position filepos. The data is copied from the buffer
3262     in key_cache_block_size increments.
3263     If the value of the parameter dont_write is FALSE then the function
3264     also writes the data into file.
3265     The parameter level is used to set one characteristic for the key buffers
3266     filled with the data from buff. The characteristic is employed only by
3267     the midpoint insertion strategy.
3268     The parameter file_extra currently makes sense only for simple key caches
3269     that are elements of a partitioned key cache. It provides a pointer to the
    shared bitmap of the partitions that may contain dirty pages for the file.
3271     This bitmap is used to optimize the function
3272     flush_partitioned_key_cache_blocks.
3273 
3274   RETURN VALUE
    0 on success, 1 otherwise.
3276 
3277   NOTES
3278     This implementation exploits the fact that the function is called only
3279     when a thread has got an exclusive lock for the key file.
3280 */
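
/*
  Usage sketch (illustrative only; the caller-side names buff and length
  are assumptions): as used by the server, dont_write is non-zero, so the
  data is only copied into the cache and the block is marked dirty; it is
  written to the file later by a flush:

    if (simple_key_cache_write(keycache, file, NULL, filepos, 3,
                               buff, length, length, 1))
      ... the block was dropped from the cache, handle the error ...
*/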
3281 
3282 static
3283 int simple_key_cache_write(SIMPLE_KEY_CACHE_CB *keycache,
3284                            File file, void *file_extra __attribute__((unused)),
3285                            my_off_t filepos, int level,
3286                            uchar *buff, uint length,
3287                            uint block_length  __attribute__((unused)),
3288                            int dont_write)
3289 {
3290   my_bool locked_and_incremented= FALSE;
3291   int error=0;
3292   DBUG_ENTER("simple_key_cache_write");
3293   DBUG_PRINT("enter",
3294              ("fd: %u  pos: %lu  length: %u  block_length: %u"
3295               "  key_block_length: %u",
3296               (uint) file, (ulong) filepos, length, block_length,
3297               keycache ? keycache->key_cache_block_size : 0));
3298 
3299   if (!dont_write)
3300   {
3301     /* purecov: begin inspected */
3302     /* Not used in the server. */
3303     /* Force writing from buff into disk. */
3304     keycache->global_cache_w_requests++;
3305     keycache->global_cache_write++;
3306     if (my_pwrite(file, buff, length, filepos, MYF(MY_NABP | MY_WAIT_IF_FULL)))
3307       DBUG_RETURN(1);
3308     /* purecov: end */
3309   }
3310 
3311 #if !defined(DBUG_OFF) && defined(EXTRA_DEBUG)
3312   DBUG_EXECUTE("check_keycache",
3313                test_key_cache(keycache, "start of key_cache_write", 1););
3314 #endif
3315 
3316   if (keycache->key_cache_inited)
3317   {
3318     /* Key cache is used */
3319     reg1 BLOCK_LINK *block;
3320     uint read_length;
3321     uint offset;
3322     int page_st;
3323 
3324     if (MYSQL_KEYCACHE_WRITE_START_ENABLED())
3325     {
3326       MYSQL_KEYCACHE_WRITE_START(my_filename(file), length,
3327                                  (ulong) (keycache->blocks_used *
3328                                           keycache->key_cache_block_size),
3329                                  (ulong) (keycache->blocks_unused *
3330                                           keycache->key_cache_block_size));
3331     }
3332 
3333     /*
      Once the key cache is initialized, we use the cache_lock to
3335       reliably distinguish the cases of normal operation, resizing, and
3336       disabled cache. We always increment and decrement
3337       'cnt_for_resize_op' so that a resizer can wait for pending I/O.
3338     */
3339     keycache_pthread_mutex_lock(&keycache->cache_lock);
3340     /*
3341       Cache resizing has two phases: Flushing and re-initializing. In
3342       the flush phase write requests can modify dirty blocks that are
3343       not yet in flush. Otherwise they are allowed to bypass the cache.
3344       find_key_block() returns NULL in both cases (clean blocks and
3345       non-cached blocks).
3346 
3347       After the flush phase new I/O requests must wait until the
3348       re-initialization is done. The re-initialization can be done only
3349       if no I/O request is in progress. The reason is that
3350       key_cache_block_size can change. With enabled cache I/O is done in
3351       chunks of key_cache_block_size. Every chunk tries to use a cache
3352       block first. If the block size changes in the middle, a block
3353       could be missed and data could be written below a cached block.
3354     */
3355     while (keycache->in_resize && !keycache->resize_in_flush)
3356       wait_on_queue(&keycache->resize_queue, &keycache->cache_lock);
3357     /* Register the I/O for the next resize. */
3358     inc_counter_for_resize_op(keycache);
3359     locked_and_incremented= TRUE;
3360     /* Requested data may not always be aligned to cache blocks. */
3361     offset= (uint) (filepos % keycache->key_cache_block_size);
3362     /* Write data in key_cache_block_size increments. */
3363     do
3364     {
3365       /* Cache could be disabled in a later iteration. */
3366       if (!keycache->can_be_used)
3367 	goto no_key_cache;
3368 
3369       MYSQL_KEYCACHE_WRITE_BLOCK(keycache->key_cache_block_size);
3370       /* Start writing at the beginning of the cache block. */
3371       filepos-= offset;
3372       /* Do not write beyond the end of the cache block. */
3373       read_length= length;
3374       set_if_smaller(read_length, keycache->key_cache_block_size-offset);
3375       KEYCACHE_DBUG_ASSERT(read_length > 0);
3376 
3377       /* Request the cache block that matches file/pos. */
3378       keycache->global_cache_w_requests++;
3379       block= find_key_block(keycache, file, filepos, level, 1, &page_st);
3380       if (!block)
3381       {
3382         /*
3383           This happens only for requests submitted during key cache
3384           resize. The block is not in the cache and shall not go in.
3385           Write directly to file.
3386         */
3387         if (dont_write)
3388         {
3389           /* Used in the server. */
3390           keycache->global_cache_write++;
3391           keycache_pthread_mutex_unlock(&keycache->cache_lock);
3392           if (my_pwrite(file, (uchar*) buff, read_length, filepos + offset,
3393                         MYF(MY_NABP | MY_WAIT_IF_FULL)))
3394             error=1;
3395           keycache_pthread_mutex_lock(&keycache->cache_lock);
3396         }
3397         goto next_block;
3398       }
3399       /*
        Prevent the block from being flushed and from being selected to be
        freed. This must be set when we release the cache_lock.
3402         However, we must not set the status of the block before it is
3403         assigned to this file/pos.
3404       */
3405       if (page_st != PAGE_WAIT_TO_BE_READ)
3406         block->status|= BLOCK_FOR_UPDATE;
3407       /*
3408         We must read the file block first if it is not yet in the cache
3409         and we do not replace all of its contents.
3410 
3411         In cases where the cache block is big enough to contain (parts
3412         of) index blocks of different indexes, our request can be
3413         secondary (PAGE_WAIT_TO_BE_READ). In this case another thread is
3414         reading the file block. If the read completes after us, it
3415         overwrites our new contents with the old contents. So we have to
3416         wait for the other thread to complete the read of this block.
3417         read_block_primary|secondary() takes care for the wait.
3418       */
3419       if (!(block->status & BLOCK_ERROR))
3420       {
3421         if (page_st == PAGE_TO_BE_READ &&
3422             (offset || read_length < keycache->key_cache_block_size))
3423         {
3424           read_block_primary(keycache, block,
3425                              offset + read_length >= keycache->key_cache_block_size?
3426                              offset : keycache->key_cache_block_size,
3427                              offset);
3428           /*
            Prevent the block from being flushed and from being selected to
            be freed. This must be set when we release the cache_lock.
3431             Here we set it in case we could not set it above.
3432           */
3433           block->status|= BLOCK_FOR_UPDATE;
3434         }
3435         else if (page_st == PAGE_WAIT_TO_BE_READ)
3436         {
3437           read_block_secondary(keycache, block);
3438           block->status|= BLOCK_FOR_UPDATE;
3439         }
3440       }
3441       /*
3442         The block should always be assigned to the requested file block
3443         here. It need not be BLOCK_READ when overwriting the whole block.
3444       */
3445       DBUG_ASSERT(block->hash_link->file == file);
3446       DBUG_ASSERT(block->hash_link->diskpos == filepos);
3447       DBUG_ASSERT(block->status & BLOCK_IN_USE);
3448       DBUG_ASSERT((page_st == PAGE_TO_BE_READ) || (block->status & BLOCK_READ));
3449       /*
3450         The block to be written must not be marked BLOCK_REASSIGNED.
3451         Otherwise it could be freed in dirty state or reused without
3452         another flush during eviction. It must also not be in flush.
        Otherwise the old contents may have been flushed already and
3454         the flusher could clear BLOCK_CHANGED without flushing the
3455         new changes again.
3456       */
3457       DBUG_ASSERT(!(block->status & BLOCK_REASSIGNED));
3458 
3459       while (block->status & BLOCK_IN_FLUSHWRITE)
3460       {
3461         /*
3462           Another thread is flushing the block. It was dirty already.
3463           Wait until the block is flushed to file. Otherwise we could
3464           modify the buffer contents just while it is written to file.
3465           An unpredictable file block contents would be the result.
3466           While we wait, several things can happen to the block,
3467           including another flush. But the block cannot be reassigned to
3468           another hash_link until we release our request on it.
3469         */
3470         wait_on_queue(&block->wqueue[COND_FOR_SAVED], &keycache->cache_lock);
3471         DBUG_ASSERT(keycache->can_be_used);
3472         DBUG_ASSERT(block->status & (BLOCK_READ | BLOCK_IN_USE));
3473         /* Still must not be marked for free. */
3474         DBUG_ASSERT(!(block->status & BLOCK_REASSIGNED));
3475         DBUG_ASSERT(block->hash_link && (block->hash_link->block == block));
3476       }
3477 
3478       /*
3479         We could perhaps release the cache_lock during access of the
3480         data like in the other functions. Locks outside of the key cache
3481         assure that readers and a writer do not access the same range of
3482         data. Parallel accesses should happen only if the cache block
        contains multiple index blocks (or fragments of them). So different parts of
3484         the buffer would be read/written. An attempt to flush during
3485         memcpy() is prevented with BLOCK_FOR_UPDATE.
3486       */
3487       if (!(block->status & BLOCK_ERROR))
3488       {
3489 #if !defined(SERIALIZED_READ_FROM_CACHE)
3490         keycache_pthread_mutex_unlock(&keycache->cache_lock);
3491 #endif
3492         memcpy(block->buffer+offset, buff, (size_t) read_length);
3493 
3494 #if !defined(SERIALIZED_READ_FROM_CACHE)
3495         keycache_pthread_mutex_lock(&keycache->cache_lock);
3496 #endif
3497       }
3498 
3499       if (!dont_write)
3500       {
3501         /* Not used in the server. buff has been written to disk at start. */
3502         if ((block->status & BLOCK_CHANGED) &&
3503             (!offset && read_length >= keycache->key_cache_block_size))
3504              link_to_file_list(keycache, block, block->hash_link->file, 1);
3505       }
3506       else if (! (block->status & BLOCK_CHANGED))
3507         link_to_changed_list(keycache, block);
3508       block->status|=BLOCK_READ;
3509       /*
        Allow the block to be selected to be freed. Since it is marked
        BLOCK_CHANGED too, it won't be selected to be freed without
3512         a flush.
3513       */
3514       block->status&= ~BLOCK_FOR_UPDATE;
3515       set_if_smaller(block->offset, offset);
3516       set_if_bigger(block->length, read_length+offset);
3517 
3518       /* Threads may be waiting for the changes to be complete. */
3519       release_whole_queue(&block->wqueue[COND_FOR_REQUESTED]);
3520 
3521       /*
3522         If only a part of the cache block is to be replaced, and the
3523         rest has been read from file, then the cache lock has been
3524         released for I/O and it could be possible that another thread
3525         wants to evict or free the block and waits for it to be
3526         released. So we must not just decrement hash_link->requests, but
3527         also wake a waiting thread.
3528       */
3529       remove_reader(block);
3530 
3531       /* Error injection for coverage testing. */
3532       DBUG_EXECUTE_IF("key_cache_write_block_error",
3533                       block->status|= BLOCK_ERROR;);
3534 
3535       /* Do not link erroneous blocks into the LRU ring, but free them. */
3536       if (!(block->status & BLOCK_ERROR))
3537       {
3538         /*
3539           Link the block into the LRU ring if it's the last submitted
3540           request for the block. This enables eviction for the block.
3541         */
3542         unreg_request(keycache, block, 1);
3543       }
3544       else
3545       {
        /* Pretend the block is "clean" to avoid complications. */
3547         block->status&= ~(BLOCK_CHANGED);
3548         free_block(keycache, block);
3549         error= 1;
3550         break;
3551       }
3552 
3553     next_block:
3554       buff+= read_length;
3555       filepos+= read_length+offset;
3556       offset= 0;
3557 
3558     } while ((length-= read_length));
3559     goto end;
3560   }
3561 
3562 no_key_cache:
3563   /* Key cache is not used */
3564   if (dont_write)
3565   {
3566     /* Used in the server. */
3567     keycache->global_cache_w_requests++;
3568     keycache->global_cache_write++;
3569     if (locked_and_incremented)
3570       keycache_pthread_mutex_unlock(&keycache->cache_lock);
3571     if (my_pwrite(file, (uchar*) buff, length, filepos,
3572 		  MYF(MY_NABP | MY_WAIT_IF_FULL)))
3573       error=1;
3574     if (locked_and_incremented)
3575       keycache_pthread_mutex_lock(&keycache->cache_lock);
3576   }
3577 
3578 end:
3579   if (locked_and_incremented)
3580   {
3581     dec_counter_for_resize_op(keycache);
3582     keycache_pthread_mutex_unlock(&keycache->cache_lock);
3583   }
3584 
3585   if (MYSQL_KEYCACHE_WRITE_DONE_ENABLED())
3586   {
3587     MYSQL_KEYCACHE_WRITE_DONE((ulong) (keycache->blocks_used *
3588                                        keycache->key_cache_block_size),
3589                               (ulong) (keycache->blocks_unused *
3590                                        keycache->key_cache_block_size));
3591   }
3592 
3593 #if !defined(DBUG_OFF) && defined(EXTRA_DEBUG)
3594   DBUG_EXECUTE("exec",
3595                test_key_cache(keycache, "end of key_cache_write", 1););
3596 #endif
3597   DBUG_RETURN(error);
3598 }
3599 
3600 
3601 /*
3602   Free block.
3603 
3604   SYNOPSIS
3605     free_block()
3606       keycache          Pointer to a key cache data structure
3607       block             Pointer to the block to free
3608 
3609   DESCRIPTION
3610     Remove reference to block from hash table.
3611     Remove block from the chain of clean blocks.
3612     Add block to the free list.
3613 
3614   NOTE
3615     Block must not be free (status == 0).
3616     Block must not be in free_block_list.
3617     Block must not be in the LRU ring.
3618     Block must not be in eviction (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH).
    Block must not be marked for freeing (BLOCK_REASSIGNED).
3620     Block must not be in flush (BLOCK_IN_FLUSH).
3621     Block must not be dirty (BLOCK_CHANGED).
3622     Block must not be in changed_blocks (dirty) hash.
3623     Block must be in file_blocks (clean) hash.
3624     Block must refer to a hash_link.
3625     Block must have a request registered on it.
3626 */
3627 
3628 static void free_block(SIMPLE_KEY_CACHE_CB *keycache, BLOCK_LINK *block)
3629 {
3630   KEYCACHE_THREAD_TRACE("free block");
3631   KEYCACHE_DBUG_PRINT("free_block",
3632                       ("block %u to be freed, hash_link %p  status: %u",
3633                        BLOCK_NUMBER(block), block->hash_link,
3634                        block->status));
3635   /*
3636     Assert that the block is not free already. And that it is in a clean
3637     state. Note that the block might just be assigned to a hash_link and
3638     not yet read (BLOCK_READ may not be set here). In this case a reader
3639     is registered in the hash_link and free_block() will wait for it
3640     below.
3641   */
3642   DBUG_ASSERT((block->status & BLOCK_IN_USE) &&
3643               !(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH |
3644                                  BLOCK_REASSIGNED | BLOCK_IN_FLUSH |
3645                                  BLOCK_CHANGED | BLOCK_FOR_UPDATE)));
3646   /* Assert that the block is in a file_blocks chain. */
3647   DBUG_ASSERT(block->prev_changed && *block->prev_changed == block);
3648   /* Assert that the block is not in the LRU ring. */
3649   DBUG_ASSERT(!block->next_used && !block->prev_used);
3650   /*
3651     IMHO the below condition (if()) makes no sense. I can't see how it
3652     could be possible that free_block() is entered with a NULL hash_link
3653     pointer. The only place where it can become NULL is in free_block()
3654     (or before its first use ever, but for those blocks free_block() is
3655     not called). I don't remove the conditional as it cannot harm, but
    place a DBUG_ASSERT to confirm my hypothesis. Eventually the
3657     condition (if()) can be removed.
3658   */
3659   DBUG_ASSERT(block->hash_link && block->hash_link->block == block);
3660   if (block->hash_link)
3661   {
3662     /*
3663       While waiting for readers to finish, new readers might request the
3664       block. But since we set block->status|= BLOCK_REASSIGNED, they
3665       will wait on block->wqueue[COND_FOR_SAVED]. They must be signalled
3666       later.
3667     */
3668     block->status|= BLOCK_REASSIGNED;
3669     wait_for_readers(keycache, block);
3670     /*
3671       The block must not have been freed by another thread. Repeat some
3672       checks. An additional requirement is that it must be read now
3673       (BLOCK_READ).
3674     */
3675     DBUG_ASSERT(block->hash_link && block->hash_link->block == block);
3676     DBUG_ASSERT((block->status & (BLOCK_READ | BLOCK_IN_USE |
3677                                   BLOCK_REASSIGNED)) &&
3678                 !(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH |
3679                                    BLOCK_IN_FLUSH | BLOCK_CHANGED |
3680                                    BLOCK_FOR_UPDATE)));
3681     DBUG_ASSERT(block->prev_changed && *block->prev_changed == block);
3682     DBUG_ASSERT(!block->prev_used);
3683     /*
3684       Unset BLOCK_REASSIGNED again. If we hand the block to an evicting
3685       thread (through unreg_request() below), other threads must not see
3686       this flag. They could become confused.
3687     */
3688     block->status&= ~BLOCK_REASSIGNED;
3689     /*
3690       Do not release the hash_link until the block is off all lists.
3691       At least not if we hand it over for eviction in unreg_request().
3692     */
3693   }
3694 
3695   /*
3696     Unregister the block request and link the block into the LRU ring.
3697     This enables eviction for the block. If the LRU ring was empty and
    threads are waiting for a block, then the block will be handed over
3699     for eviction immediately. Otherwise we will unlink it from the LRU
3700     ring again, without releasing the lock in between. So decrementing
3701     the request counter and updating statistics are the only relevant
3702     operation in this case. Assert that there are no other requests
3703     registered.
3704   */
3705   DBUG_ASSERT(block->requests == 1);
3706   unreg_request(keycache, block, 0);
3707   /*
3708     Note that even without releasing the cache lock it is possible that
3709     the block is immediately selected for eviction by link_block() and
3710     thus not added to the LRU ring. In this case we must not touch the
3711     block any more.
3712   */
3713   if (block->status & BLOCK_IN_EVICTION)
3714     return;
3715 
3716   /* Error blocks are not put into the LRU ring. */
3717   if (!(block->status & BLOCK_ERROR))
3718   {
3719     /* Here the block must be in the LRU ring. Unlink it again. */
3720     DBUG_ASSERT(block->next_used && block->prev_used &&
3721                 *block->prev_used == block);
3722     unlink_block(keycache, block);
3723   }
3724   if (block->temperature == BLOCK_WARM)
3725     keycache->warm_blocks--;
3726   block->temperature= BLOCK_COLD;
3727 
3728   /* Remove from file_blocks hash. */
3729   unlink_changed(block);
3730 
3731   /* Remove reference to block from hash table. */
3732   unlink_hash(keycache, block->hash_link);
3733   block->hash_link= NULL;
3734 
3735   block->status= 0;
3736   block->length= 0;
3737   block->offset= keycache->key_cache_block_size;
3738   KEYCACHE_THREAD_TRACE("free block");
3739   KEYCACHE_DBUG_PRINT("free_block", ("block is freed"));
3740 
3741   /* Enforced by unlink_changed(), but just to be sure. */
3742   DBUG_ASSERT(!block->next_changed && !block->prev_changed);
3743   /* Enforced by unlink_block(): not in LRU ring nor in free_block_list. */
3744   DBUG_ASSERT(!block->next_used && !block->prev_used);
3745   /* Insert the free block in the free list. */
3746   block->next_used= keycache->free_block_list;
3747   keycache->free_block_list= block;
3748   /* Keep track of the number of currently unused blocks. */
3749   keycache->blocks_unused++;
3750 
3751   /* All pending requests for this page must be resubmitted. */
3752   release_whole_queue(&block->wqueue[COND_FOR_SAVED]);
3753 }
3754 
3755 
3756 static int cmp_sec_link(BLOCK_LINK **a, BLOCK_LINK **b)
3757 {
3758   return (((*a)->hash_link->diskpos < (*b)->hash_link->diskpos) ? -1 :
3759       ((*a)->hash_link->diskpos > (*b)->hash_link->diskpos) ? 1 : 0);
3760 }
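
/*
  Illustrative example (not part of the original code): flush_cached_blocks()
  below sorts its 'cache' array with cmp_sec_link, so blocks collected at
  disk positions 3072, 1024 and 2048 are written in the order 1024, 2048,
  3072, which keeps the write burst sequential on disk.
*/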
3761 
3762 
3763 /*
3764   Flush a portion of changed blocks to disk,
3765   free used blocks if requested
3766 */
3767 
3768 static int flush_cached_blocks(SIMPLE_KEY_CACHE_CB *keycache,
3769                                File file, BLOCK_LINK **cache,
3770                                BLOCK_LINK **end,
3771                                enum flush_type type)
3772 {
3773   int error;
3774   int last_errno= 0;
3775   uint count= (uint) (end-cache);
3776 
3777   /* Don't lock the cache during the flush */
3778   keycache_pthread_mutex_unlock(&keycache->cache_lock);
3779   /*
     As all blocks referred to in 'cache' are marked with BLOCK_IN_FLUSH,
     we are guaranteed that no thread will change them.
3782   */
3783   my_qsort((uchar*) cache, count, sizeof(*cache), (qsort_cmp) cmp_sec_link);
3784 
3785   keycache_pthread_mutex_lock(&keycache->cache_lock);
3786   /*
3787     Note: Do not break the loop. We have registered a request on every
3788     block in 'cache'. These must be unregistered by free_block() or
3789     unreg_request().
3790   */
3791   for ( ; cache != end ; cache++)
3792   {
3793     BLOCK_LINK *block= *cache;
3794 
3795     KEYCACHE_DBUG_PRINT("flush_cached_blocks",
3796                         ("block %u to be flushed", BLOCK_NUMBER(block)));
3797     /*
3798       If the block contents is going to be changed, we abandon the flush
3799       for this block. flush_key_blocks_int() will restart its search and
3800       handle the block properly.
3801     */
3802     if (!(block->status & BLOCK_FOR_UPDATE))
3803     {
3804       /* Blocks coming here must have a certain status. */
3805       DBUG_ASSERT(block->hash_link);
3806       DBUG_ASSERT(block->hash_link->block == block);
3807       DBUG_ASSERT(block->hash_link->file == file);
3808       DBUG_ASSERT((block->status & ~BLOCK_IN_EVICTION) ==
3809                   (BLOCK_READ | BLOCK_IN_FLUSH | BLOCK_CHANGED | BLOCK_IN_USE));
3810       block->status|= BLOCK_IN_FLUSHWRITE;
3811       keycache_pthread_mutex_unlock(&keycache->cache_lock);
3812       error= (int)my_pwrite(file, block->buffer + block->offset,
3813                        block->length - block->offset,
3814                        block->hash_link->diskpos + block->offset,
3815                        MYF(MY_NABP | MY_WAIT_IF_FULL));
3816       keycache_pthread_mutex_lock(&keycache->cache_lock);
3817       keycache->global_cache_write++;
3818       if (error)
3819       {
3820         block->status|= BLOCK_ERROR;
3821         if (!last_errno)
3822           last_errno= errno ? errno : -1;
3823       }
3824       block->status&= ~BLOCK_IN_FLUSHWRITE;
3825       /* Block must not have changed status except BLOCK_FOR_UPDATE. */
3826       DBUG_ASSERT(block->hash_link);
3827       DBUG_ASSERT(block->hash_link->block == block);
3828       DBUG_ASSERT(block->hash_link->file == file);
3829       DBUG_ASSERT((block->status & ~(BLOCK_FOR_UPDATE | BLOCK_IN_EVICTION)) ==
3830                   (BLOCK_READ | BLOCK_IN_FLUSH | BLOCK_CHANGED | BLOCK_IN_USE));
3831       /*
3832         Set correct status and link in right queue for free or later use.
3833         free_block() must not see BLOCK_CHANGED and it may need to wait
3834         for readers of the block. These should not see the block in the
3835         wrong hash. If not freeing the block, we need to have it in the
3836         right queue anyway.
3837       */
3838       link_to_file_list(keycache, block, file, 1);
3839     }
3840     block->status&= ~BLOCK_IN_FLUSH;
3841     /*
      Let possible waiting requests to write to the block page proceed.
      This can happen only during an operation that resizes the key cache.
3844     */
3845     release_whole_queue(&block->wqueue[COND_FOR_SAVED]);
3846     /* type will never be FLUSH_IGNORE_CHANGED here */
3847     if (!(type == FLUSH_KEEP || type == FLUSH_FORCE_WRITE) &&
3848         !(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH |
3849                            BLOCK_FOR_UPDATE)))
3850     {
3851       /*
3852         Note that a request has been registered against the block in
3853         flush_key_blocks_int().
3854       */
3855       free_block(keycache, block);
3856     }
3857     else
3858     {
3859       /*
3860         Link the block into the LRU ring if it's the last submitted
3861         request for the block. This enables eviction for the block.
3862         Note that a request has been registered against the block in
3863         flush_key_blocks_int().
3864       */
3865       unreg_request(keycache, block, 1);
3866     }
3867 
3868   } /* end of for ( ; cache != end ; cache++) */
3869   return last_errno;
3870 }
3871 
3872 
3873 /*
3874   Flush all key blocks for a file to disk, but don't do any mutex locks
3875 
3876   SYNOPSIS
3877     flush_key_blocks_int()
3878       keycache            pointer to a key cache data structure
3879       file                handler for the file to flush to
3880       flush_type          type of the flush
3881 
3882   NOTES
3883     This function doesn't do any mutex locks because it needs to be called both
    from flush_key_blocks and flush_all_key_blocks (the latter one does the
3885     mutex lock in the resize_key_cache() function).
3886 
    We only care about changed blocks that exist when the function is
3888     entered. We do not guarantee that all changed blocks of the file are
3889     flushed if more blocks change while this function is running.
3890 
3891   RETURN
3892     0   ok
3893     1  error
3894 */
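
/*
  Illustrative sketch (not part of the original code): callers serialize on
  cache_lock themselves before calling this function, e.g.:

    keycache_pthread_mutex_lock(&keycache->cache_lock);
    res= flush_key_blocks_int(keycache, file, FLUSH_KEEP);
    keycache_pthread_mutex_unlock(&keycache->cache_lock);
*/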
3895 
3896 static int flush_key_blocks_int(SIMPLE_KEY_CACHE_CB *keycache,
3897 				File file, enum flush_type type)
3898 {
3899   BLOCK_LINK *cache_buff[FLUSH_CACHE],**cache;
3900   int last_errno= 0;
3901   int last_errcnt= 0;
3902   DBUG_ENTER("flush_key_blocks_int");
3903   DBUG_PRINT("enter",("file: %d  blocks_used: %lu  blocks_changed: %lu",
3904               file, keycache->blocks_used, keycache->blocks_changed));
3905 
3906 #if !defined(DBUG_OFF) && defined(EXTRA_DEBUG)
3907   DBUG_EXECUTE("check_keycache",
3908                test_key_cache(keycache, "start of flush_key_blocks", 0););
3909 #endif
3910 
3911   DBUG_ASSERT(type != FLUSH_KEEP_LAZY);
3912   cache= cache_buff;
3913   if (keycache->disk_blocks > 0 &&
3914       (!my_disable_flush_key_blocks || type != FLUSH_KEEP))
3915   {
3916     /* Key cache exists and flush is not disabled */
3917     int error= 0;
3918     uint count= FLUSH_CACHE;
3919     BLOCK_LINK **pos,**end;
3920     BLOCK_LINK *first_in_switch= NULL;
3921     BLOCK_LINK *last_in_flush;
3922     BLOCK_LINK *last_for_update;
3923     BLOCK_LINK *block, *next;
3924 #if defined(KEYCACHE_DEBUG)
3925     uint cnt=0;
3926 #endif
3927 
3928     if (type != FLUSH_IGNORE_CHANGED)
3929     {
3930       /*
3931          Count how many key blocks we have to cache to be able
3932          to flush all dirty pages with minimum seek moves
3933       */
3934       count= 0;
3935       for (block= keycache->changed_blocks[FILE_HASH(file, keycache)] ;
3936            block ;
3937            block= block->next_changed)
3938       {
3939         if ((block->hash_link->file == file) &&
3940             !(block->status & BLOCK_IN_FLUSH))
3941         {
3942           count++;
3943           KEYCACHE_DBUG_ASSERT(count<= keycache->blocks_used);
3944         }
3945       }
3946       /*
        Allocate a new buffer only if it is bigger than the one we have.
        Ensure that we always have some entries for the case that new
3949         changed blocks appear while we need to wait for something.
3950       */
3951       if ((count > FLUSH_CACHE) &&
3952           !(cache= (BLOCK_LINK**) my_malloc(sizeof(BLOCK_LINK*)*count,
3953                                             MYF(0))))
3954         cache= cache_buff;
3955       /*
3956         After a restart there could be more changed blocks than now.
3957         So we should not let count become smaller than the fixed buffer.
3958       */
3959       if (cache == cache_buff)
3960         count= FLUSH_CACHE;
3961     }
3962 
3963     /* Retrieve the blocks and write them to a buffer to be flushed */
3964 restart:
3965     last_in_flush= NULL;
3966     last_for_update= NULL;
3967     end= (pos= cache)+count;
3968     for (block= keycache->changed_blocks[FILE_HASH(file, keycache)] ;
3969          block ;
3970          block= next)
3971     {
3972 #if defined(KEYCACHE_DEBUG)
3973       cnt++;
3974       KEYCACHE_DBUG_ASSERT(cnt <= keycache->blocks_used);
3975 #endif
3976       next= block->next_changed;
3977       if (block->hash_link->file == file)
3978       {
3979         if (!(block->status & (BLOCK_IN_FLUSH | BLOCK_FOR_UPDATE)))
3980         {
3981           /*
3982             Note: The special handling of BLOCK_IN_SWITCH is obsolete
3983             since we set BLOCK_IN_FLUSH if the eviction includes a
3984             flush. It can be removed in a later version.
3985           */
3986           if (!(block->status & BLOCK_IN_SWITCH))
3987           {
3988             /*
3989               We care only for the blocks for which flushing was not
3990               initiated by another thread and which are not in eviction.
3991               Registering a request on the block unlinks it from the LRU
3992               ring and protects against eviction.
3993             */
3994             reg_requests(keycache, block, 1);
3995             if (type != FLUSH_IGNORE_CHANGED)
3996             {
3997               /* It's not a temporary file */
3998               if (pos == end)
3999               {
4000                 /*
4001                   This should happen relatively seldom. Remove the
4002                   request because we won't do anything with the block
4003                   but restart and pick it again in the next iteration.
4004                 */
4005                 unreg_request(keycache, block, 0);
4006                 /*
4007                   This happens only if there is not enough
4008                   memory for the big block
4009                 */
4010                 if ((error= flush_cached_blocks(keycache, file, cache,
4011                                                 end,type)))
4012                 {
4013                   /* Do not loop infinitely trying to flush in vain. */
4014                   if ((last_errno == error) && (++last_errcnt > 5))
4015                     goto err;
4016                   last_errno= error;
4017                 }
4018                 /*
4019                   Restart the scan as some other thread might have changed
4020                   the changed blocks chain: the blocks that were in switch
4021                   state before the flush started have to be excluded
4022                 */
4023                 goto restart;
4024               }
4025               /*
4026                 Mark the block with BLOCK_IN_FLUSH in order not to let
                other threads use it for new pages and interfere with
4028                 our sequence of flushing dirty file pages. We must not
4029                 set this flag before actually putting the block on the
4030                 write burst array called 'cache'.
4031               */
4032               block->status|= BLOCK_IN_FLUSH;
4033               /* Add block to the array for a write burst. */
4034               *pos++= block;
4035             }
4036             else
4037             {
4038               /* It's a temporary file */
4039               DBUG_ASSERT(!(block->status & BLOCK_REASSIGNED));
4040               /*
4041                 free_block() must not be called with BLOCK_CHANGED. Note
4042                 that we must not change the BLOCK_CHANGED flag outside of
4043                 link_to_file_list() so that it is always in the correct
4044                 queue and the *blocks_changed counters are correct.
4045               */
4046               link_to_file_list(keycache, block, file, 1);
4047               if (!(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH)))
4048               {
4049                 /* A request has been registered against the block above. */
4050                 free_block(keycache, block);
4051               }
4052               else
4053               {
4054                 /*
4055                   Link the block into the LRU ring if it's the last
4056                   submitted request for the block. This enables eviction
4057                   for the block. A request has been registered against
4058                   the block above.
4059                 */
4060                 unreg_request(keycache, block, 1);
4061               }
4062             }
4063           }
4064           else
4065           {
4066             /*
4067               Link the block into a list of blocks 'in switch'.
4068 
4069               WARNING: Here we introduce a place where a changed block
4070               is not in the changed_blocks hash! This is acceptable for
4071               a BLOCK_IN_SWITCH. Never try this for another situation.
4072               Other parts of the key cache code rely on changed blocks
4073               being in the changed_blocks hash.
4074             */
4075             unlink_changed(block);
4076             link_changed(block, &first_in_switch);
4077           }
4078         }
4079         else if (type != FLUSH_KEEP)
4080         {
4081           /*
4082             During the normal flush at end of statement (FLUSH_KEEP) we
4083             do not need to ensure that blocks in flush or update by
4084             other threads are flushed. They will be flushed by them
4085             later. In all other cases we must assure that we do not have
4086             any changed block of this file in the cache when this
4087             function returns.
4088           */
4089           if (block->status & BLOCK_IN_FLUSH)
4090           {
4091             /* Remember the last block found to be in flush. */
4092             last_in_flush= block;
4093           }
4094           else
4095           {
4096             /* Remember the last block found to be selected for update. */
4097             last_for_update= block;
4098           }
4099         }
4100       }
4101     }
4102     if (pos != cache)
4103     {
4104       if ((error= flush_cached_blocks(keycache, file, cache, pos, type)))
4105       {
        /* Do not loop infinitely trying to flush in vain. */
4107         if ((last_errno == error) && (++last_errcnt > 5))
4108           goto err;
4109         last_errno= error;
4110       }
4111       /*
4112         Do not restart here during the normal flush at end of statement
4113         (FLUSH_KEEP). We have now flushed at least all blocks that were
4114         changed when entering this function. In all other cases we must
4115         assure that we do not have any changed block of this file in the
4116         cache when this function returns.
4117       */
4118       if (type != FLUSH_KEEP)
4119         goto restart;
4120     }
4121     if (last_in_flush)
4122     {
4123       /*
4124         There are no blocks to be flushed by this thread, but blocks in
4125         flush by other threads. Wait until one of the blocks is flushed.
4126         Re-check the condition for last_in_flush. We may have unlocked
4127         the cache_lock in flush_cached_blocks(). The state of the block
4128         could have changed.
4129       */
4130       if (last_in_flush->status & BLOCK_IN_FLUSH)
4131         wait_on_queue(&last_in_flush->wqueue[COND_FOR_SAVED],
4132                       &keycache->cache_lock);
4133       /* Be sure not to lose a block. They may be flushed in random order. */
4134       goto restart;
4135     }
4136     if (last_for_update)
4137     {
4138       /*
4139         There are no blocks to be flushed by this thread, but blocks for
4140         update by other threads. Wait until one of the blocks is updated.
4141         Re-check the condition for last_for_update. We may have unlocked
4142         the cache_lock in flush_cached_blocks(). The state of the block
4143         could have changed.
4144       */
4145       if (last_for_update->status & BLOCK_FOR_UPDATE)
4146         wait_on_queue(&last_for_update->wqueue[COND_FOR_REQUESTED],
4147                       &keycache->cache_lock);
4148       /* The block is now changed. Flush it. */
4149       goto restart;
4150     }
4151 
4152     /*
4153       Wait until the list of blocks in switch is empty. The threads that
4154       are switching these blocks will relink them to clean file chains
4155       while we wait and thus empty the 'first_in_switch' chain.
4156     */
4157     while (first_in_switch)
4158     {
4159 #if defined(KEYCACHE_DEBUG)
4160       cnt= 0;
4161 #endif
4162       wait_on_queue(&first_in_switch->wqueue[COND_FOR_SAVED],
4163                     &keycache->cache_lock);
4164 #if defined(KEYCACHE_DEBUG)
4165       cnt++;
4166       KEYCACHE_DBUG_ASSERT(cnt <= keycache->blocks_used);
4167 #endif
4168       /*
4169         Do not restart here. We have flushed all blocks that were
4170         changed when entering this function and were not marked for
4171         eviction. Other threads have now flushed all remaining blocks in
4172         the course of their eviction.
4173       */
4174     }
4175 
4176     if (! (type == FLUSH_KEEP || type == FLUSH_FORCE_WRITE))
4177     {
4178       BLOCK_LINK *last_in_switch= NULL;
4179       uint total_found= 0;
4180       uint found;
4181       last_for_update= NULL;
4182 
4183       /*
4184         Finally free all clean blocks for this file.
4185         During resize this may be run by two threads in parallel.
4186       */
4187       do
4188       {
4189         found= 0;
4190         for (block= keycache->file_blocks[FILE_HASH(file, keycache)] ;
4191              block ;
4192              block= next)
4193         {
4194           /* Remember the next block. After freeing we cannot get at it. */
4195           next= block->next_changed;
4196 
4197           /* Changed blocks cannot appear in the file_blocks hash. */
4198           DBUG_ASSERT(!(block->status & BLOCK_CHANGED));
4199           if (block->hash_link->file == file)
4200           {
4201             /* We must skip blocks that will be changed. */
4202             if (block->status & BLOCK_FOR_UPDATE)
4203             {
4204               last_for_update= block;
4205               continue;
4206             }
4207 
4208             /*
4209               We must not free blocks in eviction (BLOCK_IN_EVICTION |
4210               BLOCK_IN_SWITCH) or blocks intended to be freed
4211               (BLOCK_REASSIGNED).
4212             */
4213             if (!(block->status & (BLOCK_IN_EVICTION | BLOCK_IN_SWITCH |
4214                                    BLOCK_REASSIGNED)))
4215             {
4216               struct st_hash_link *UNINIT_VAR(next_hash_link);
4217               my_off_t UNINIT_VAR(next_diskpos);
4218               File UNINIT_VAR(next_file);
4219               uint UNINIT_VAR(next_status);
4220               uint UNINIT_VAR(hash_requests);
4221 
4222               total_found++;
4223               found++;
4224               KEYCACHE_DBUG_ASSERT(found <= keycache->blocks_used);
4225 
4226               /*
4227                 Register a request. This unlinks the block from the LRU
4228                 ring and protects it against eviction. This is required
4229                 by free_block().
4230               */
4231               reg_requests(keycache, block, 1);
4232 
4233               /*
4234                 free_block() may need to wait for readers of the block.
4235                 This is the moment where the other thread can move the
4236                 'next' block from the chain. free_block() needs to wait
4237                 if there are requests for the block pending.
4238               */
4239               if (next && (hash_requests= block->hash_link->requests))
4240               {
4241                 /* Copy values from the 'next' block and its hash_link. */
4242                 next_status=    next->status;
4243                 next_hash_link= next->hash_link;
4244                 next_diskpos=   next_hash_link->diskpos;
4245                 next_file=      next_hash_link->file;
4246                 DBUG_ASSERT(next == next_hash_link->block);
4247               }
4248 
4249               free_block(keycache, block);
4250               /*
4251                 If we had to wait and the state of the 'next' block
4252                 changed, break the inner loop. 'next' may no longer be
4253                 part of the current chain.
4254 
4255                 We do not want to break the loop after every free_block(),
4256                 not even only after waits. The chain might be quite long
4257                 and contain blocks for many files. Traversing it again and
4258                 again to find more blocks for this file could become quite
4259                 inefficient.
4260               */
4261               if (next && hash_requests &&
4262                   ((next_status    != next->status) ||
4263                    (next_hash_link != next->hash_link) ||
4264                    (next_file      != next_hash_link->file) ||
4265                    (next_diskpos   != next_hash_link->diskpos) ||
4266                    (next           != next_hash_link->block)))
4267                 break;
4268             }
4269             else
4270             {
4271               last_in_switch= block;
4272             }
4273           }
4274         } /* end for block in file_blocks */
4275       } while (found);
4276 
4277       /*
4278         If any clean block has been found, we may have waited for it to
        become free. In this case it is possible that another clean
        block became dirty. This can happen if the write request existed
        before the flush started (BLOCK_FOR_UPDATE). Re-check the hashes.
4282       */
4283       if (total_found)
4284         goto restart;
4285 
4286       /*
4287         To avoid an infinite loop, wait until one of the blocks marked
4288         for update is updated.
4289       */
4290       if (last_for_update)
4291       {
4292         /* We did not wait. Block must not have changed status. */
4293         DBUG_ASSERT(last_for_update->status & BLOCK_FOR_UPDATE);
4294         wait_on_queue(&last_for_update->wqueue[COND_FOR_REQUESTED],
4295                       &keycache->cache_lock);
4296         goto restart;
4297       }
4298 
4299       /*
4300         To avoid an infinite loop wait until one of the blocks marked
4301         for eviction is switched.
4302       */
4303       if (last_in_switch)
4304       {
4305         /* We did not wait. Block must not have changed status. */
4306         DBUG_ASSERT(last_in_switch->status & (BLOCK_IN_EVICTION |
4307                                               BLOCK_IN_SWITCH |
4308                                               BLOCK_REASSIGNED));
4309         wait_on_queue(&last_in_switch->wqueue[COND_FOR_SAVED],
4310                       &keycache->cache_lock);
4311         goto restart;
4312       }
4313 
4314     } /* if (! (type == FLUSH_KEEP || type == FLUSH_FORCE_WRITE)) */
4315 
4316   } /* if (keycache->disk_blocks > 0 */
4317 
4318   DBUG_EXECUTE("check_keycache",
4319                test_key_cache(keycache, "end of flush_key_blocks", 0););
4320 err:
4321   if (cache != cache_buff)
4322     my_free(cache);
4323   if (last_errno)
4324     errno=last_errno;                /* Return first error */
4325   DBUG_RETURN(last_errno != 0);
4326 }
4327 
4328 
4329 /*
4330   Flush all blocks for a file from key buffers of a simple key cache
4331 
4332   SYNOPSIS
4333 
    flush_simple_key_cache_blocks()
4335     keycache            pointer to the control block of a simple key cache
4336     file                handler for the file to flush to
4337     file_extra          maps of key cache partitions containing
4338                         dirty pages from file (not used)
4339     flush_type          type of the flush operation
4340 
4341   DESCRIPTION
4342     This function is the implementation of the flush_key_blocks interface
4343     function that is employed by simple (non-partitioned) key caches.
4344     The function takes the parameter keycache as a pointer to the
    control block structure of the type SIMPLE_KEY_CACHE_CB for a simple key
4346     cache.
4347     In a general case the function flushes the data from all dirty key
4348     buffers related to the file 'file' into this file. The function does
4349     exactly this if the value of the parameter type is FLUSH_KEEP. If the
4350     value of this parameter is FLUSH_RELEASE, the function additionally
4351     releases the key buffers containing data from 'file' for new usage.
4352     If the value of the parameter type is FLUSH_IGNORE_CHANGED the function
4353     just releases the key buffers containing data from 'file'.
4354     The parameter file_extra currently is not used by this function.
4355 
4356   RETURN
4357     0   ok
4358     1  error
4359 
4360   NOTES
4361     This implementation exploits the fact that the function is called only
    when a thread has acquired an exclusive lock on the key file.
4363 */
4364 
4365 static
4366 int flush_simple_key_cache_blocks(SIMPLE_KEY_CACHE_CB *keycache,
4367                                   File file,
4368                                   void *file_extra __attribute__((unused)),
4369                                   enum flush_type type)
4370 {
4371   int res= 0;
4372   DBUG_ENTER("flush_key_blocks");
4373   DBUG_PRINT("enter", ("keycache: %p",  keycache));
4374 
4375   if (!keycache->key_cache_inited)
4376     DBUG_RETURN(0);
4377 
4378   keycache_pthread_mutex_lock(&keycache->cache_lock);
4379   /* While waiting for lock, keycache could have been ended. */
4380   if (keycache->disk_blocks > 0)
4381   {
4382     inc_counter_for_resize_op(keycache);
4383     res= flush_key_blocks_int(keycache, file, type);
4384     dec_counter_for_resize_op(keycache);
4385   }
4386   keycache_pthread_mutex_unlock(&keycache->cache_lock);
4387   DBUG_RETURN(res);
4388 }
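

/*
  Illustrative sketch, kept out of the build: how a caller inside this file
  could combine the flush types described above. The helper name and the
  exact call sequence are hypothetical; only the signature of
  flush_simple_key_cache_blocks() and the documented meaning of FLUSH_KEEP
  and FLUSH_RELEASE are taken from this module.
*/
#if 0
static int example_flush_then_release(SIMPLE_KEY_CACHE_CB *keycache, File file)
{
  /* Write all dirty blocks of 'file' to disk, keep the buffers cached. */
  if (flush_simple_key_cache_blocks(keycache, file, NULL, FLUSH_KEEP))
    return 1;
  /* Write any remaining dirty blocks and release all buffers of 'file'. */
  return flush_simple_key_cache_blocks(keycache, file, NULL, FLUSH_RELEASE);
}
#endif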
4389 
4390 
4391 /*
4392   Flush all blocks in the key cache to disk.
4393 
4394   SYNOPSIS
4395     flush_all_key_blocks()
4396       keycache                  pointer to key cache root structure
4397 
4398   DESCRIPTION
4399 
4400     Flushing of the whole key cache is done in two phases.
4401 
4402     1. Flush all changed blocks, waiting for them if necessary. Loop
4403     until there is no changed block left in the cache.
4404 
4405     2. Free all clean blocks. Normally this means free all blocks. The
4406     changed blocks were flushed in phase 1 and became clean. However we
4407     may need to wait for blocks that are read by other threads. While we
4408     wait, a clean block could become changed if that operation started
4409     before the resize operation started. To be safe we must restart at
4410     phase 1.
4411 
4412     When we can run through the changed_blocks and file_blocks hashes
4413     without finding a block any more, then we are done.
4414 
4415     Note that we hold keycache->cache_lock all the time unless we need
4416     to wait for something.
4417 
4418   RETURN
4419     0           OK
4420     != 0        Error
4421 */
4422 
4423 static int flush_all_key_blocks(SIMPLE_KEY_CACHE_CB *keycache)
4424 {
4425   BLOCK_LINK    *block;
4426   uint          total_found;
4427   uint          found;
4428   uint          idx;
4429   uint          changed_blocks_hash_size= keycache->changed_blocks_hash_size;
4430   DBUG_ENTER("flush_all_key_blocks");
4431 
4432   do
4433   {
4434     mysql_mutex_assert_owner(&keycache->cache_lock);
4435     total_found= 0;
4436 
4437     /*
4438       Phase1: Flush all changed blocks, waiting for them if necessary.
4439       Loop until there is no changed block left in the cache.
4440     */
4441     do
4442     {
4443       found= 0;
4444       /* Step over the whole changed_blocks hash array. */
4445       for (idx= 0; idx < changed_blocks_hash_size; idx++)
4446       {
4447         /*
4448           If an array element is non-empty, use the first block from its
4449           chain to find a file for flush. All changed blocks for this
4450           file are flushed. So the same block will not appear at this
4451           place again with the next iteration. New writes for blocks are
4452           not accepted during the flush. If multiple files share the
4453           same hash bucket, one of them will be flushed per iteration
4454           of the outer loop of phase 1.
4455         */
4456         while ((block= keycache->changed_blocks[idx]))
4457         {
4458           found++;
4459           /*
4460             Flush dirty blocks but do not free them yet. They can be used
4461             for reading until all other blocks are flushed too.
4462           */
4463           if (flush_key_blocks_int(keycache, block->hash_link->file,
4464                                    FLUSH_FORCE_WRITE))
4465             DBUG_RETURN(1);
4466         }
4467       }
4468     } while (found);
4469 
4470     /*
4471       Phase 2: Free all clean blocks. Normally this means free all
4472       blocks. The changed blocks were flushed in phase 1 and became
4473       clean. However we may need to wait for blocks that are read by
4474       other threads. While we wait, a clean block could become changed
4475       if that operation started before the resize operation started. To
4476       be safe we must restart at phase 1.
4477     */
4478     do
4479     {
4480       found= 0;
4481       /* Step over the whole file_blocks hash array. */
4482       for (idx= 0; idx < changed_blocks_hash_size; idx++)
4483       {
4484         /*
4485           If an array element is non-empty, use the first block from its
4486           chain to find a file for flush. All blocks for this file are
4487           freed. So the same block will not appear at this place again
4488           with the next iteration. If multiple files share the
4489           same hash bucket, one of them will be flushed per iteration
4490           of the outer loop of phase 2.
4491         */
4492         while ((block= keycache->file_blocks[idx]))
4493         {
4494           total_found++;
4495           found++;
4496           if (flush_key_blocks_int(keycache, block->hash_link->file,
4497                                    FLUSH_RELEASE))
4498             DBUG_RETURN(1);
4499         }
4500       }
4501     } while (found);
4502 
4503     /*
4504       If any clean block has been found, we may have waited for it to
      become free. In this case it is possible that another clean
      block became dirty. This can happen if the write request existed
      before the resize started (BLOCK_FOR_UPDATE). Re-check the hashes.
4508     */
4509   } while (total_found);
4510 
4511 #ifndef DBUG_OFF
4512   /* Now there should not exist any block any more. */
4513   for (idx= 0; idx < changed_blocks_hash_size; idx++)
4514   {
4515     DBUG_ASSERT(!keycache->changed_blocks[idx]);
4516     DBUG_ASSERT(!keycache->file_blocks[idx]);
4517   }
4518 #endif
4519 
4520   DBUG_RETURN(0);
4521 }
4522 
4523 
4524 /*
4525   Reset the counters of a simple key cache
4526 
4527   SYNOPSIS
4528     reset_simple_key_cache_counters()
4529     name                the name of a key cache
4530     keycache            pointer to the control block of a simple key cache
4531 
4532   DESCRIPTION
4533     This function is the implementation of the reset_key_cache_counters
4534     interface function that is employed by simple (non-partitioned) key caches.
4535     The function takes the parameter keycache as a pointer to the
    control block structure of the type SIMPLE_KEY_CACHE_CB for a simple key cache.
4537     This function resets the values of all statistical counters for the key
4538     cache to 0.
4539     The parameter name is currently not used.
4540 
4541   RETURN
4542     0 on success (always because it can't fail)
4543 */
4544 
4545 static
4546 int reset_simple_key_cache_counters(const char *name __attribute__((unused)),
4547                                     SIMPLE_KEY_CACHE_CB *keycache)
4548 {
4549   DBUG_ENTER("reset_simple_key_cache_counters");
4550   if (!keycache->key_cache_inited)
4551   {
4552     DBUG_PRINT("info", ("Key cache %s not initialized.", name));
4553     DBUG_RETURN(0);
4554   }
4555   DBUG_PRINT("info", ("Resetting counters for key cache %s.", name));
4556 
4557   keycache->global_blocks_changed= 0;   /* Key_blocks_not_flushed */
4558   keycache->global_cache_r_requests= 0; /* Key_read_requests */
4559   keycache->global_cache_read= 0;       /* Key_reads */
4560   keycache->global_cache_w_requests= 0; /* Key_write_requests */
4561   keycache->global_cache_write= 0;      /* Key_writes */
4562   DBUG_RETURN(0);
4563 }
4564 
4565 
4566 #ifndef DBUG_OFF
4567 /*
4568   Test if disk-cache is ok
4569 */
4570 static
4571 void test_key_cache(SIMPLE_KEY_CACHE_CB *keycache __attribute__((unused)),
4572                     const char *where __attribute__((unused)),
4573                     my_bool lock __attribute__((unused)))
4574 {
4575   /* TODO */
4576 }
4577 #endif
4578 
4579 #if defined(KEYCACHE_TIMEOUT)
4580 
4581 #define KEYCACHE_DUMP_FILE  "keycache_dump.txt"
4582 #define MAX_QUEUE_LEN  100
4583 
4584 
4585 static void keycache_dump(SIMPLE_KEY_CACHE_CB *keycache)
4586 {
4587   FILE *keycache_dump_file=fopen(KEYCACHE_DUMP_FILE, "w");
4588   struct st_my_thread_var *last;
4589   struct st_my_thread_var *thread;
4590   BLOCK_LINK *block;
4591   HASH_LINK *hash_link;
4592   KEYCACHE_PAGE *page;
4593   uint i;
4594 
  thread= my_thread_var;                  /* the current (dumping) thread */
  fprintf(keycache_dump_file, "thread:%lu\n", (ulong) thread->id);
4596 
4597   i=0;
  thread=last=keycache->waiting_for_hash_link.last_thread;
4599   fprintf(keycache_dump_file, "queue of threads waiting for hash link\n");
4600   if (thread)
4601     do
4602     {
4603       thread=thread->next;
4604       page= (KEYCACHE_PAGE *) thread->keycache_link;
4605       fprintf(keycache_dump_file,
4606               "thread:%lu, (file,filepos)=(%u,%lu)\n",
4607               (ulong) thread->id,(uint) page->file,(ulong) page->filepos);
4608       if (++i == MAX_QUEUE_LEN)
4609         break;
4610     }
4611     while (thread != last);
4612 
4613   i=0;
  thread=last=keycache->waiting_for_block.last_thread;
4615   fprintf(keycache_dump_file, "queue of threads waiting for block\n");
4616   if (thread)
4617     do
4618     {
4619       thread=thread->next;
4620       hash_link= (HASH_LINK *) thread->keycache_link;
4621       fprintf(keycache_dump_file,
4622               "thread:%lu hash_link:%u (file,filepos)=(%u,%lu)\n",
4623               (ulong) thread->id, (uint) HASH_LINK_NUMBER(hash_link),
4624         (uint) hash_link->file,(ulong) hash_link->diskpos);
4625       if (++i == MAX_QUEUE_LEN)
4626         break;
4627     }
4628     while (thread != last);
4629 
4630   for (i=0 ; i< keycache->blocks_used ; i++)
4631   {
4632     int j;
4633     block= &keycache->block_root[i];
4634     hash_link= block->hash_link;
4635     fprintf(keycache_dump_file,
4636             "block:%u hash_link:%d status:%x #requests=%u waiting_for_readers:%d\n",
4637             i, (int) (hash_link ? HASH_LINK_NUMBER(hash_link) : -1),
4638             block->status, block->requests, block->condvar ? 1 : 0);
4639     for (j=0 ; j < 2; j++)
4640     {
      KEYCACHE_WQUEUE *wqueue=&block->wqueue[j];
      uint qlen= 0;     /* queue length counter; do not reuse the block index 'i' */
4642       thread= last= wqueue->last_thread;
4643       fprintf(keycache_dump_file, "queue #%d\n", j);
4644       if (thread)
4645       {
4646         do
4647         {
4648           thread=thread->next;
4649           fprintf(keycache_dump_file,
4650                   "thread:%lu\n", (ulong) thread->id);
          if (++qlen == MAX_QUEUE_LEN)
4652             break;
4653         }
4654         while (thread != last);
4655       }
4656     }
4657   }
4658   fprintf(keycache_dump_file, "LRU chain:");
  block= keycache->used_last;
4660   if (block)
4661   {
4662     do
4663     {
4664       block= block->next_used;
4665       fprintf(keycache_dump_file,
4666               "block:%u, ", BLOCK_NUMBER(block));
4667     }
4668     while (block != keycache->used_last);
4669   }
4670   fprintf(keycache_dump_file, "\n");
4671 
4672   fclose(keycache_dump_file);
4673 }
4674 
4675 #endif /* defined(KEYCACHE_TIMEOUT) */
4676 
4677 #if defined(KEYCACHE_TIMEOUT) && !defined(__WIN__)
4678 
4679 
4680 static int keycache_pthread_cond_wait(mysql_cond_t *cond,
4681                                       mysql_mutex_t *mutex)
4682 {
4683   int rc;
4684   struct timeval  now;            /* time when we started waiting        */
4685   struct timespec timeout;        /* timeout value for the wait function */
4686   struct timezone tz;
4687 #if defined(KEYCACHE_DEBUG)
4688   int cnt=0;
4689 #endif
4690 
4691   /* Get current time */
4692   gettimeofday(&now, &tz);
4693   /* Prepare timeout value */
4694   timeout.tv_sec= now.tv_sec + KEYCACHE_TIMEOUT;
4695  /*
4696    timeval uses microseconds.
4697    timespec uses nanoseconds.
   1 microsecond = 1000 nanoseconds
4699  */
4700   timeout.tv_nsec= now.tv_usec * 1000;
4701   KEYCACHE_THREAD_TRACE_END("started waiting");
4702 #if defined(KEYCACHE_DEBUG)
4703   cnt++;
  if (cnt % 100 == 0)
    fprintf(keycache_debug_log, "waiting...\n");
  fflush(keycache_debug_log);
4707 #endif
4708   rc= mysql_cond_timedwait(cond, mutex, &timeout);
4709   KEYCACHE_THREAD_TRACE_BEGIN("finished waiting");
4710   if (rc == ETIMEDOUT || rc == ETIME)
4711   {
4712 #if defined(KEYCACHE_DEBUG)
4713     fprintf(keycache_debug_log,"aborted by keycache timeout\n");
4714     fclose(keycache_debug_log);
4715     abort();
4716 #endif
4717     keycache_dump();
4718   }
4719 
4720 #if defined(KEYCACHE_DEBUG)
4721   KEYCACHE_DBUG_ASSERT(rc != ETIMEDOUT);
4722 #else
4723   assert(rc != ETIMEDOUT);
4724 #endif
4725   return rc;
4726 }
4727 #else
4728 #if defined(KEYCACHE_DEBUG)
4729 static int keycache_pthread_cond_wait(mysql_cond_t *cond,
4730                                       mysql_mutex_t *mutex)
4731 {
4732   int rc;
4733   KEYCACHE_THREAD_TRACE_END("started waiting");
4734   rc= mysql_cond_wait(cond, mutex);
4735   KEYCACHE_THREAD_TRACE_BEGIN("finished waiting");
4736   return rc;
4737 }
4738 #endif
4739 #endif /* defined(KEYCACHE_TIMEOUT) && !defined(__WIN__) */
4740 
4741 #if defined(KEYCACHE_DEBUG)
4742 
4743 
4744 static int keycache_pthread_mutex_lock(mysql_mutex_t *mutex)
4745 {
4746   int rc;
4747   rc= mysql_mutex_lock(mutex);
4748   KEYCACHE_THREAD_TRACE_BEGIN("");
4749   return rc;
4750 }
4751 
4752 
4753 static void keycache_pthread_mutex_unlock(mysql_mutex_t *mutex)
4754 {
4755   KEYCACHE_THREAD_TRACE_END("");
4756   mysql_mutex_unlock(mutex);
4757 }
4758 
4759 
4760 static int keycache_pthread_cond_signal(mysql_cond_t *cond)
4761 {
4762   int rc;
4763   KEYCACHE_THREAD_TRACE("signal");
4764   rc= mysql_cond_signal(cond);
4765   return rc;
4766 }
4767 
4768 
4769 #if defined(KEYCACHE_DEBUG_LOG)
4770 
4771 
4772 static void keycache_debug_print(const char * fmt,...)
4773 {
4774   va_list args;
4775   va_start(args,fmt);
4776   if (keycache_debug_log)
4777   {
4778     (void) vfprintf(keycache_debug_log, fmt, args);
4779     (void) fputc('\n',keycache_debug_log);
4780   }
4781   va_end(args);
4782 }
4783 #endif /* defined(KEYCACHE_DEBUG_LOG) */
4784 
4785 #if defined(KEYCACHE_DEBUG_LOG)
4786 
4787 
4788 void keycache_debug_log_close(void)
4789 {
4790   if (keycache_debug_log)
4791     fclose(keycache_debug_log);
4792 }
4793 #endif /* defined(KEYCACHE_DEBUG_LOG) */
4794 
4795 #endif /* defined(KEYCACHE_DEBUG) */
4796 
4797 #ifdef DBUG_ASSERT_EXISTS
4798 #define F_B_PRT(_f_, _v_) DBUG_PRINT("assert_fail", (_f_, _v_))
4799 
4800 static int fail_block(BLOCK_LINK *block  __attribute__((unused)))
4801 {
4802 #ifndef DBUG_OFF
4803   F_B_PRT("block->next_used:    %p\n", block->next_used);
4804   F_B_PRT("block->prev_used:    %p\n", block->prev_used);
4805   F_B_PRT("block->next_changed: %p\n", block->next_changed);
4806   F_B_PRT("block->prev_changed: %p\n", block->prev_changed);
4807   F_B_PRT("block->hash_link:    %p\n", block->hash_link);
4808   F_B_PRT("block->status:       %u\n", block->status);
4809   F_B_PRT("block->length:       %u\n", block->length);
4810   F_B_PRT("block->offset:       %u\n", block->offset);
4811   F_B_PRT("block->requests:     %u\n", block->requests);
4812   F_B_PRT("block->temperature:  %u\n", block->temperature);
4813 #endif
4814   return 0; /* Let the assert fail. */
4815 }
4816 #endif
4817 
4818 #ifndef DBUG_OFF
4819 static int fail_hlink(HASH_LINK *hlink  __attribute__((unused)))
4820 {
4821   F_B_PRT("hlink->next:    %p\n", hlink->next);
4822   F_B_PRT("hlink->prev:    %p\n", hlink->prev);
4823   F_B_PRT("hlink->block:   %p\n", hlink->block);
4824   F_B_PRT("hlink->diskpos: %lu\n", (ulong) hlink->diskpos);
4825   F_B_PRT("hlink->file:    %d\n", hlink->file);
4826   return 0; /* Let the assert fail. */
4827 }
4828 
4829 static int cache_empty(SIMPLE_KEY_CACHE_CB *keycache)
4830 {
4831   int errcnt= 0;
4832   int idx;
4833   if (keycache->disk_blocks <= 0)
4834     return 1;
4835   for (idx= 0; idx < keycache->disk_blocks; idx++)
4836   {
4837     BLOCK_LINK *block= keycache->block_root + idx;
4838     if (block->status || block->requests || block->hash_link)
4839     {
4840       fprintf(stderr, "block index: %u\n", idx);
4841       fail_block(block);
4842       errcnt++;
4843     }
4844   }
4845   for (idx= 0; idx < keycache->hash_links; idx++)
4846   {
4847     HASH_LINK *hash_link= keycache->hash_link_root + idx;
4848     if (hash_link->requests || hash_link->block)
4849     {
4850       fprintf(stderr, "hash_link index: %u\n", idx);
4851       fail_hlink(hash_link);
4852       errcnt++;
4853     }
4854   }
4855   if (errcnt)
4856   {
4857     fprintf(stderr, "blocks: %d  used: %lu\n",
4858             keycache->disk_blocks, keycache->blocks_used);
4859     fprintf(stderr, "hash_links: %d  used: %d\n",
4860             keycache->hash_links, keycache->hash_links_used);
4861     fprintf(stderr, "\n");
4862   }
4863   return !errcnt;
4864 }
4865 #endif
4866 
4867 
4868 /*
4869   Get statistics for a simple key cache
4870 
4871   SYNOPSIS
4872     get_simple_key_cache_statistics()
4873     keycache            pointer to the control block of a simple key cache
4874     partition_no        partition number (not used)
4875     key_cache_stats OUT pointer to the structure for the returned statistics
4876 
4877   DESCRIPTION
4878     This function is the implementation of the get_key_cache_statistics
4879     interface function that is employed by simple (non-partitioned) key caches.
4880     The function takes the parameter keycache as a pointer to the
4881     control block structure of the type SIMPLE_KEY_CACHE_CB for a simple key
4882     cache. This function returns the statistical data for the key cache.
4883     The parameter partition_no is not used by this function.
4884 
4885   RETURN
4886     none
4887 */
4888 
4889 static
4890 void get_simple_key_cache_statistics(SIMPLE_KEY_CACHE_CB *keycache,
4891                                      uint partition_no __attribute__((unused)),
4892                                      KEY_CACHE_STATISTICS *keycache_stats)
4893 {
4894   DBUG_ENTER("simple_get_key_cache_statistics");
4895 
4896   keycache_stats->mem_size= (longlong) keycache->key_cache_mem_size;
4897   keycache_stats->block_size= (longlong) keycache->key_cache_block_size;
4898   keycache_stats->blocks_used= keycache->blocks_used;
4899   keycache_stats->blocks_unused= keycache->blocks_unused;
4900   keycache_stats->blocks_changed= keycache->global_blocks_changed;
4901   keycache_stats->blocks_warm= keycache->warm_blocks;
4902   keycache_stats->read_requests= keycache->global_cache_r_requests;
4903   keycache_stats->reads= keycache->global_cache_read;
4904   keycache_stats->write_requests= keycache->global_cache_w_requests;
4905   keycache_stats->writes= keycache->global_cache_write;
4906   DBUG_VOID_RETURN;
4907 }
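

/*
  Illustrative sketch, kept out of the build: the counters filled in by
  get_simple_key_cache_statistics() map to the server status variables named
  in reset_simple_key_cache_counters() above, so a cache miss ratio can be
  derived as Key_reads / Key_read_requests. The helper name below is
  hypothetical.
*/
#if 0
static double example_key_cache_miss_ratio(SIMPLE_KEY_CACHE_CB *keycache)
{
  KEY_CACHE_STATISTICS stats;
  get_simple_key_cache_statistics(keycache, 0, &stats);
  if (stats.read_requests == 0)
    return 0.0;                           /* No read requests seen yet. */
  return (double) stats.reads / (double) stats.read_requests;
}
#endif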
4908 
4909 
4910 /*
  The array of pointers to the key cache interface functions used for simple
  key caches. Any simple key cache object, including those incorporated into
  partitioned key caches, uses this array.

  The current implementation of these functions allows them to be called from
  the MySQL server code directly. We don't do it though.
4917 */
4918 
4919 static KEY_CACHE_FUNCS simple_key_cache_funcs =
4920 {
4921   (INIT_KEY_CACHE) init_simple_key_cache,
4922   (RESIZE_KEY_CACHE) resize_simple_key_cache,
4923   (CHANGE_KEY_CACHE_PARAM) change_simple_key_cache_param,
4924   (KEY_CACHE_READ) simple_key_cache_read,
4925   (KEY_CACHE_INSERT) simple_key_cache_insert,
4926   (KEY_CACHE_WRITE) simple_key_cache_write,
4927   (FLUSH_KEY_BLOCKS) flush_simple_key_cache_blocks,
4928   (RESET_KEY_CACHE_COUNTERS) reset_simple_key_cache_counters,
4929   (END_KEY_CACHE) end_simple_key_cache,
4930   (GET_KEY_CACHE_STATISTICS) get_simple_key_cache_statistics,
4931 };
4932 
4933 
4934 /******************************************************************************
4935   Partitioned Key Cache Module
4936 
4937   The module contains implementations of all key cache interface functions
4938   employed by partitioned key caches.
4939 
4940   A partitioned key cache is a collection of structures for simple key caches
4941   called key cache partitions. Any page from a file can be placed into a buffer
4942   of only one partition. The number of the partition is calculated from
4943   the file number and the position of the page in the file, and it's always the
4944   same for the page. The function that maps pages into partitions takes care
4945   of even distribution of pages among partitions.
4946 
  A partitioned key cache mitigates one of the major problems of a simple key
  cache: thread contention for the key cache lock (mutex). Every call of a key
  cache interface function must acquire this lock. So threads compete for this
  lock even when they have acquired shared locks for the file and the pages
  they want to read from are already in the key cache buffers.
  When working with a partitioned key cache any key cache interface function
  that needs only one page has to acquire the key cache lock only for the
  partition the page is ascribed to. This reduces the chance that threads
  compete for the same key cache lock. Unfortunately, if we use a partitioned
  key cache with N partitions for B-tree indexes, we cannot say that the
  contention becomes N times lower. The reason is that any index lookup
  operation requires reading the root page, which, for any given index, is
  always ascribed to the same partition. To resolve this problem we would have
  to employ more sophisticated mechanisms of working with root pages.
4961 
  Currently the number of partitions in a partitioned key cache is limited
  to 64. We could increase this limit, but then we would also have to
  increase the size of the bitmap dirty_part_map in the MYISAM_SHARE
  structure accordingly.
4966 
4967 ******************************************************************************/
4968 
4969 /* Control block for a partitioned key cache */
4970 
4971 typedef struct st_partitioned_key_cache_cb
4972 {
4973   my_bool key_cache_inited;     /*<=> control block is allocated            */
4974   SIMPLE_KEY_CACHE_CB **partition_array; /* the key cache partitions        */
4975   size_t key_cache_mem_size;    /* specified size of the cache memory       */
4976   uint key_cache_block_size;    /* size of the page buffer of a cache block */
4977   uint partitions;              /* number of partitions in the key cache    */
4978 } PARTITIONED_KEY_CACHE_CB;
4979 
4980 static
4981 void end_partitioned_key_cache(PARTITIONED_KEY_CACHE_CB *keycache,
4982                                my_bool cleanup);
4983 
4984 static int
4985 reset_partitioned_key_cache_counters(const char *name,
4986                                      PARTITIONED_KEY_CACHE_CB *keycache);
4987 
4988 /*
4989   Determine the partition to which the index block to read is ascribed
4990 
4991   SYNOPSIS
4992     get_key_cache_partition()
4993     keycache            pointer to the control block of a partitioned key cache
4994     file                handler for the file for the block of data to be read
4995     filepos             position of the block of data in the file
4996 
4997   DESCRIPTION
4998     The function determines the number of the partition in whose buffer the
4999     block from 'file' at the position filepos has to be placed for reading.
5000     The function returns the control block of the simple key cache for this
5001     partition to the caller.
5002 
5003   RETURN VALUE
5004     The pointer to the control block of the partition to which the specified
5005     file block is ascribed.
5006 */
5007 
5008 static
5009 SIMPLE_KEY_CACHE_CB *
5010 get_key_cache_partition(PARTITIONED_KEY_CACHE_CB *keycache,
5011                         File file, my_off_t filepos)
5012 {
5013   uint i= KEYCACHE_BASE_EXPR(file, filepos) % keycache->partitions;
5014   return keycache->partition_array[i];
5015 }
5016 
5017 
5018 /*
5019   Determine the partition to which the index block to write is ascribed
5020 
5021   SYNOPSIS
    get_key_cache_partition_for_write()
5023     keycache            pointer to the control block of a partitioned key cache
5024     file                handler for the file for the block of data to be read
5025     filepos             position of the block of data in the file
5026     dirty_part_map      pointer to the bitmap of dirty partitions for the file
5027 
5028   DESCRIPTION
5029     The function determines the number of the partition in whose buffer the
5030     block from 'file' at the position filepos has to be placed for writing and
5031     marks the partition as dirty in the dirty_part_map bitmap.
5032     The function returns the control block of the simple key cache for this
5033     partition to the caller.
5034 
5035   RETURN VALUE
5036     The pointer to the control block of the partition to which the specified
5037     file block is ascribed.
5038 */
5039 
5040 static SIMPLE_KEY_CACHE_CB
5041 *get_key_cache_partition_for_write(PARTITIONED_KEY_CACHE_CB *keycache,
5042                                    File file, my_off_t filepos,
5043                                    ulonglong* dirty_part_map)
5044 {
5045   uint i= KEYCACHE_BASE_EXPR( file, filepos) % keycache->partitions;
5046   *dirty_part_map|= 1ULL << i;
5047   return keycache->partition_array[i];
5048 }
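

/*
  Illustrative sketch, kept out of the build: how a dirty partition bitmap
  built up by get_key_cache_partition_for_write() can be scanned so that only
  partitions that may hold dirty pages of the file are flushed. The helper
  name and its use of flush_simple_key_cache_blocks() are hypothetical; bit i
  of the map corresponds to partition_array[i], as set by the '1ULL << i'
  above.
*/
#if 0
static int example_flush_dirty_partitions(PARTITIONED_KEY_CACHE_CB *keycache,
                                          File file,
                                          ulonglong dirty_part_map,
                                          enum flush_type type)
{
  uint i;
  int err= 0;
  for (i= 0; i < keycache->partitions && dirty_part_map; i++)
  {
    if (dirty_part_map & (1ULL << i))
    {
      err|= flush_simple_key_cache_blocks(keycache->partition_array[i],
                                          file, NULL, type);
      dirty_part_map&= ~(1ULL << i);      /* Clear the bit once processed. */
    }
  }
  return err;
}
#endif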
5049 
5050 
5051 /*
5052   Initialize a partitioned key cache
5053 
5054   SYNOPSIS
5055     init_partitioned_key_cache()
5056     keycache            pointer to the control block of a partitioned key cache
5057     key_cache_block_size    size of blocks to keep cached data
5058     use_mem             total memory to use for all key cache partitions
5059     division_limit      division limit (may be zero)
5060     age_threshold       age threshold (may be zero)
5061 
5062   DESCRIPTION
5063     This function is the implementation of the init_key_cache
5064     interface function that is employed by partitioned key caches.
5065 
5066     The function builds and initializes an array of simple key caches,
5067     and then initializes the control block structure of the type
5068     PARTITIONED_KEY_CACHE_CB that is used for a partitioned key
5069     cache. The parameter keycache is supposed to point to this
5070     structure. The number of partitions in the partitioned key cache
5071     to be built must be passed through the field 'partitions' of this
5072     structure.
5073     The parameter key_cache_block_size specifies the size of the
    blocks in the simple key caches to be built.
    The parameters division_limit and age_threshold determine the initial
5076     values of those characteristics of the simple key caches that are used for
5077     midpoint insertion strategy. The parameter use_mem specifies the total
5078     amount of memory to be allocated for the key cache blocks in all simple key
5079     caches and for all auxiliary structures.
5080 
5081   RETURN VALUE
5082     total number of blocks in key cache partitions, if successful,
5083     <= 0 - otherwise.
5084 
5085   NOTES
5086     If keycache->key_cache_inited != 0 then we assume that the memory for
    the array of partitions has already been allocated.
5088 
5089     It's assumed that no two threads call this function simultaneously
5090     referring to the same key cache handle.
5091 */
5092 
5093 static
5094 int init_partitioned_key_cache(PARTITIONED_KEY_CACHE_CB *keycache,
5095                                uint key_cache_block_size,
5096                                size_t use_mem, uint division_limit,
5097                                uint age_threshold, uint changed_blocks_hash_size)
5098 {
5099   int i;
5100   size_t mem_per_cache;
5101   size_t mem_decr;
5102   int cnt;
5103   SIMPLE_KEY_CACHE_CB *partition;
5104   SIMPLE_KEY_CACHE_CB **partition_ptr;
5105   uint partitions= keycache->partitions;
5106   int blocks= 0;
5107   DBUG_ENTER("partitioned_init_key_cache");
5108 
5109   keycache->key_cache_block_size = key_cache_block_size;
5110 
5111   if (keycache->key_cache_inited)
5112     partition_ptr= keycache->partition_array;
5113   else
5114   {
5115     if(!(partition_ptr=
5116        (SIMPLE_KEY_CACHE_CB **) my_malloc(sizeof(SIMPLE_KEY_CACHE_CB *) *
5117                                           partitions, MYF(MY_WME))))
5118       DBUG_RETURN(-1);
5119     bzero(partition_ptr, sizeof(SIMPLE_KEY_CACHE_CB *) * partitions);
5120     keycache->partition_array= partition_ptr;
5121   }
5122 
5123   mem_per_cache = use_mem / partitions;
5124   mem_decr= mem_per_cache / 5;
5125 
5126   for (i= 0; i < (int) partitions; i++)
5127   {
5128     my_bool key_cache_inited= keycache->key_cache_inited;
5129     if (key_cache_inited)
5130       partition= *partition_ptr;
5131     else
5132     {
5133       if (!(partition=
5134               (SIMPLE_KEY_CACHE_CB *)  my_malloc(sizeof(SIMPLE_KEY_CACHE_CB),
5135 						 MYF(MY_WME))))
5136         continue;
5137       partition->key_cache_inited= 0;
5138     }
5139 
5140     cnt= init_simple_key_cache(partition, key_cache_block_size, mem_per_cache,
5141 			       division_limit, age_threshold,
5142                                changed_blocks_hash_size);
5143     if (cnt <= 0)
5144     {
5145       end_simple_key_cache(partition, 1);
5146       if (!key_cache_inited)
5147       {
5148         my_free(partition);
5149         partition= 0;
5150       }
5151       if ((i == 0 && cnt < 0) || i > 0)
5152       {
5153         /*
5154           Here we have two cases:
5155             1. i == 0 and cnt < 0
            cnt < 0 => mem_per_cache is not big enough to allocate the
            minimal number of key blocks in the key cache of the partition.
            Decrease the number of partitions by 1 and start again.
5159             2. i > 0
5160             There is not enough memory for one of the succeeding partitions.
5161             Just skip this partition decreasing the number of partitions in
5162             the key cache by one.
5163           Do not change the value of mem_per_cache in both cases.
5164 	*/
5165         if (key_cache_inited)
5166 	{
5167           my_free(partition);
5168           partition= 0;
5169           if(key_cache_inited)
5170             memmove(partition_ptr, partition_ptr+1,
5171                     sizeof(partition_ptr)*(partitions-i-1));
5172 	}
5173         if (!--partitions)
5174           break;
5175       }
5176       else
5177       {
5178         /*
5179           We come here when i == 0 && cnt == 0.
5180           cnt == 0 => the memory allocator fails to allocate a block of
5181           memory of the size mem_per_cache. Decrease the value of
          mem_per_cache without changing the current number of partitions
          and start again. Make sure that such a decrease happens at most
          5 times in total.
5185 	*/
5186         if (use_mem <= mem_decr)
5187           break;
5188         use_mem-= mem_decr;
5189       }
5190       i--;
5191       mem_per_cache= use_mem/partitions;
5192       continue;
5193     }
5194     else
5195     {
5196       blocks+= cnt;
5197       *partition_ptr++= partition;
5198     }
5199   }
5200 
5201   keycache->partitions= partitions= (uint) (partition_ptr-keycache->partition_array);
5202   keycache->key_cache_mem_size= mem_per_cache * partitions;
5203   for (i= 0; i < (int) partitions; i++)
5204     keycache->partition_array[i]->hash_factor= partitions;
5205 
5206   keycache->key_cache_inited= 1;
5207 
5208   if (!partitions)
5209     blocks= -1;
5210 
5211   DBUG_RETURN(blocks);
5212 }
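

/*
  Illustrative sketch, kept out of the build: the memory split performed by
  init_partitioned_key_cache() above, shown with example numbers (the values
  below are hypothetical). With use_mem of 64MB and 8 partitions each simple
  key cache is given mem_per_cache = 8MB; if init_simple_key_cache() cannot
  allocate a block of that size for the first partition, use_mem is reduced
  by mem_decr = mem_per_cache / 5 and the split is recomputed.
*/
#if 0
static void example_partition_memory_split(void)
{
  size_t use_mem= 64 * 1024 * 1024;             /* total cache memory     */
  uint partitions= 8;                           /* requested partitions   */
  size_t mem_per_cache= use_mem / partitions;   /* 8MB for each partition */
  size_t mem_decr= mem_per_cache / 5;           /* retry step, ~1.6MB     */

  /* On an allocation failure for the first partition the loop retries with
     use_mem reduced by mem_decr (64MB, ~62.4MB, ~60.8MB, ...) until the
     allocation succeeds or use_mem drops to mem_decr or below. */
  (void) mem_per_cache;
  (void) mem_decr;
}
#endif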
5213 
5214 
5215 /*
5216   Resize a partitioned key cache
5217 
5218   SYNOPSIS
5219     resize_partitioned_key_cache()
5220     keycache            pointer to the control block of a partitioned key cache
5221     key_cache_block_size    size of blocks to keep cached data
5222     use_mem             total memory to use for the new key cache
5223     division_limit      new division limit (if not zero)
5224     age_threshold       new age threshold (if not zero)
5225 
5226   DESCRIPTION
5227     This function is the implementation of the resize_key_cache interface
5228     function that is employed by partitioned key caches.
5229     The function takes the parameter keycache as a pointer to the
5230     control block structure of the type PARTITIONED_KEY_CACHE_CB for the
5231     partitioned key cache to be resized.
5232     The parameter key_cache_block_size specifies the new size of the blocks in
5233     the simple key caches that comprise the partitioned key cache.
5234     The parameters division_limit and age_threshold determine the new initial
5235     values of those characteristics of the simple key cache that are used for
    midpoint insertion strategy. The parameter use_mem specifies the total
    amount of memory to be allocated for the key cache blocks in all new
5238     simple key caches and for all auxiliary structures.
5239 
5240   RETURN VALUE
5241     number of blocks in the key cache, if successful,
5242     0 - otherwise.
5243 
5244   NOTES.
5245     The function first calls prepare_resize_simple_key_cache for each simple
5246     key cache effectively flushing all dirty pages from it and destroying
5247     the key cache. Then init_partitioned_key_cache is called. This call builds
5248     a new array of simple key caches containing the same number of elements
5249     as the old one. After this the function calls the function
5250     finish_resize_simple_key_cache for each simple key cache from this array.
5251 
5252     This implementation doesn't block the calls and executions of other
5253     functions from the key cache interface. However it assumes that the
5254     calls of resize_partitioned_key_cache itself are serialized.
5255 */
5256 
5257 static
5258 int resize_partitioned_key_cache(PARTITIONED_KEY_CACHE_CB *keycache,
5259                                  uint key_cache_block_size,
5260 		                 size_t use_mem, uint division_limit,
5261 		                 uint age_threshold,
5262                                  uint changed_blocks_hash_size)
5263 {
5264   uint i;
5265   uint partitions= keycache->partitions;
5266   my_bool cleanup= use_mem == 0;
5267   int blocks= -1;
5268   int err= 0;
5269   DBUG_ENTER("partitioned_resize_key_cache");
5270   if (cleanup)
5271   {
5272     end_partitioned_key_cache(keycache, 0);
5273     DBUG_RETURN(-1);
5274   }
5275   for (i= 0; i < partitions; i++)
5276   {
5277     err|= prepare_resize_simple_key_cache(keycache->partition_array[i], 1);
5278   }
5279   if (!err)
5280     blocks= init_partitioned_key_cache(keycache, key_cache_block_size,
5281                                        use_mem, division_limit, age_threshold,
5282                                        changed_blocks_hash_size);
5283   if (blocks > 0)
5284   {
5285     for (i= 0; i < partitions; i++)
5286     {
5287       keycache_pthread_mutex_lock(&keycache->partition_array[i]->cache_lock);
5288       finish_resize_simple_key_cache(keycache->partition_array[i]);
5289     }
5290   }
5291   DBUG_RETURN(blocks);
5292 }
5293 
5294 
5295 /*
5296   Change key cache parameters of a partitioned key cache
5297 
5298   SYNOPSIS
5299     partitioned_change_key_cache_param()
5300     keycache            pointer to the control block of a partitioned key cache
5301     division_limit      new division limit (if not zero)
5302     age_threshold       new age threshold (if not zero)
5303 
5304   DESCRIPTION
5305     This function is the implementation of the change_key_cache_param interface
5306     function that is employed by partitioned key caches.
5307     The function takes the parameter keycache as a pointer to the
5308     control block structure of the type PARTITIONED_KEY_CACHE_CB for the simple
5309     key cache where new values of the division limit and the age threshold used
5310     for midpoint insertion strategy are to be set.  The parameters
5311     division_limit and age_threshold provide these new values.
5312 
5313   RETURN VALUE
5314     none
5315 
5316   NOTES
5317     The function just calls change_simple_key_cache_param for each element from
5318     the array of simple caches that comprise the partitioned key cache.
5319 */
5320 
5321 static
5322 void change_partitioned_key_cache_param(PARTITIONED_KEY_CACHE_CB *keycache,
5323                                         uint division_limit,
5324                                         uint age_threshold)
5325 {
5326   uint i;
5327   uint partitions= keycache->partitions;
5328   DBUG_ENTER("partitioned_change_key_cache_param");
5329   for (i= 0; i < partitions; i++)
5330   {
5331     change_simple_key_cache_param(keycache->partition_array[i], division_limit,
5332                                   age_threshold);
5333   }
5334   DBUG_VOID_RETURN;
5335 }
5336 
5337 
5338 /*
5339   Destroy a partitioned key cache
5340 
5341   SYNOPSIS
5342     end_partitioned_key_cache()
5343     keycache            pointer to the control block of a partitioned key cache
5344     cleanup             <=> complete free (free also control block structures
5345                             for all simple key caches)
5346 
5347   DESCRIPTION
5348     This function is the implementation of the end_key_cache interface
5349     function that is employed by partitioned key caches.
5350     The function takes the parameter keycache as a pointer to the
5351     control block structure of the type PARTITIONED_KEY_CACHE_CB for the
5352     partitioned key cache to be destroyed.
5353     The function frees the memory allocated for the cache blocks and
5354     auxiliary structures used by simple key caches that comprise the
5355     partitioned key cache. If the value of the parameter cleanup is TRUE
5356     then even the memory used for control blocks of the simple key caches
5357     and the array of pointers to them are freed.
5358 
5359   RETURN VALUE
5360     none
5361 */
5362 
5363 static
5364 void end_partitioned_key_cache(PARTITIONED_KEY_CACHE_CB *keycache,
5365                                my_bool cleanup)
5366 {
5367   uint i;
5368   uint partitions= keycache->partitions;
5369   DBUG_ENTER("partitioned_end_key_cache");
5370   DBUG_PRINT("enter", ("key_cache: %p",  keycache));
5371 
5372   for (i= 0; i < partitions; i++)
5373   {
5374     end_simple_key_cache(keycache->partition_array[i], cleanup);
5375   }
5376   if (cleanup)
5377   {
5378     for (i= 0; i < partitions; i++)
5379       my_free(keycache->partition_array[i]);
5380     my_free(keycache->partition_array);
5381     keycache->key_cache_inited= 0;
5382   }
5383   DBUG_VOID_RETURN;
5384 }
5385 
5386 
5387 /*
5388   Read a block of data from a partitioned key cache into a buffer
5389 
5390   SYNOPSIS
5391 
5392     partitioned_key_cache_read()
5393     keycache            pointer to the control block of a partitioned key cache
5394     file                handler for the file for the block of data to be read
5395     filepos             position of the block of data in the file
5396     level               determines the weight of the data
5397     buff                buffer to where the data must be placed
5398     length              length of the buffer
5399     block_length        length of the read data from a key cache block
5400     return_buffer       return pointer to the key cache buffer with the data
5401 
5402   DESCRIPTION
5403     This function is the implementation of the key_cache_read interface
5404     function that is employed by partitioned key caches.
5405     The function takes the parameter keycache as a pointer to the
5406     control block structure of the type PARTITIONED_KEY_CACHE_CB for a
5407     partitioned key cache.
5408     In a general case the function reads a block of data from the key cache
5409     into the buffer buff of the size specified by the parameter length. The
5410     beginning of the  block of data to be read is  specified by the parameters
5411     file and filepos. The length of the read data is the same as the length
5412     of the buffer. The data is read into the buffer in key_cache_block_size
    increments. To read each portion the function first finds out in what
    partition of the key cache this portion (page) is to be saved, and calls
    simple_key_cache_read with a pointer to the corresponding simple key
    cache as its first parameter.
5417     If the parameter return_buffer is not ignored and its value is TRUE, and
5418     the data to be read of the specified size block_length can be read from one
5419     key cache buffer, then the function returns a pointer to the data in the
5420     key cache buffer.
    The function takes the parameters block_length and return_buffer into
    account only in a single-threaded environment.
5423     The parameter 'level' is used only by the midpoint insertion strategy
5424     when the data or its portion cannot be found in the key cache.
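
  EXAMPLE
    A worked illustration of the increment logic (the numbers here are
    chosen for illustration only): with key_cache_block_size=1024,
    filepos=1536 and length=2048 the initial offset is 512, so the first
    portion read through simple_key_cache_read is 512 bytes (up to the end
    of the containing cache block), the second portion is 1024 bytes and
    the third is the remaining 512 bytes; each portion may be served by a
    different partition.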
5425 
5426   RETURN VALUE
5427     Returns address from where the data is placed if successful, 0 - otherwise.
5428 */
5429 
5430 static
5431 uchar *partitioned_key_cache_read(PARTITIONED_KEY_CACHE_CB *keycache,
5432                                   File file, my_off_t filepos, int level,
5433                                   uchar *buff, uint length,
5434                                   uint block_length __attribute__((unused)),
5435                                   int return_buffer __attribute__((unused)))
5436 {
5437   uint r_length;
5438   uint offset= (uint) (filepos % keycache->key_cache_block_size);
5439   uchar *start= buff;
5440   DBUG_ENTER("partitioned_key_cache_read");
5441   DBUG_PRINT("enter", ("fd: %u  pos: %lu  length: %u",
5442                (uint) file, (ulong) filepos, length));
5443 
5444 
5445   /* Read data in key_cache_block_size increments */
5446   do
5447   {
5448     SIMPLE_KEY_CACHE_CB *partition= get_key_cache_partition(keycache,
5449                                                             file, filepos);
5450     uchar *ret_buff= 0;
5451     r_length= length;
5452     set_if_smaller(r_length, keycache->key_cache_block_size - offset);
5453     ret_buff= simple_key_cache_read((void *) partition,
5454                                     file, filepos, level,
5455                                     buff, r_length,
5456                                     block_length, return_buffer);
5457     if (ret_buff == 0)
5458       DBUG_RETURN(0);
5459     filepos+= r_length;
5460     buff+= r_length;
5461     offset= 0;
5462   } while ((length-= r_length));
5463 
5464   DBUG_RETURN(start);
5465 }
5466 
5467 
5468 /*
5469   Insert a block of file data from a buffer into a partitioned key cache
5470 
5471   SYNOPSIS
5472     partitioned_key_cache_insert()
5473     keycache            pointer to the control block of a partitioned key cache
5474     file                handler for the file to insert data from
5475     filepos             position of the block of data in the file to insert
5476     level               determines the weight of the data
5477     buff                buffer to read data from
5478     length              length of the data in the buffer
5479 
5480   DESCRIPTION
5481     This function is the implementation of the key_cache_insert interface
5482     function that is employed by partitioned key caches.
5483     The function takes the parameter keycache as a pointer to the
5484     control block structure of the type PARTITIONED_KEY_CACHE_CB for a
5485     partitioned key cache.
5486     The function writes a block of file data from a buffer into the key cache.
5487     The buffer is specified with the parameters buff and length - the pointer
5488     to the beginning of the buffer and its size respectively. It's assumed
    that the buffer contains the data from 'file' starting from the position
5490     filepos. The data is copied from the buffer in key_cache_block_size
5491     increments. For every portion of data the function finds out in what simple
5492     key cache from the array of partitions the data must be stored, and after
5493     this calls simple_key_cache_insert to copy the data into a key buffer of
5494     this simple key cache.
5495     The parameter level is used to set one characteristic for the key buffers
5496     loaded with the data from buff. The characteristic is used only by the
5497     midpoint insertion strategy.
5498 
5499   RETURN VALUE
5500     0 if a success, 1 - otherwise.
5501 
5502   NOTES
    The function is used by MyISAM to move all blocks from an index file to
5504     the key cache. It can be performed in parallel with reading the file data
5505     from the key buffers by other threads.
5506 */
5507 
5508 static
5509 int partitioned_key_cache_insert(PARTITIONED_KEY_CACHE_CB *keycache,
5510                                  File file, my_off_t filepos, int level,
5511                                  uchar *buff, uint length)
5512 {
5513   uint w_length;
5514   uint offset= (uint) (filepos % keycache->key_cache_block_size);
5515   DBUG_ENTER("partitioned_key_cache_insert");
5516   DBUG_PRINT("enter", ("fd: %u  pos: %lu  length: %u",
5517                (uint) file,(ulong) filepos, length));
5518 
5519 
5520   /* Write data in key_cache_block_size increments */
5521   do
5522   {
5523     SIMPLE_KEY_CACHE_CB *partition= get_key_cache_partition(keycache,
5524                                                             file, filepos);
5525     w_length= length;
5526     set_if_smaller(w_length, keycache->key_cache_block_size - offset);
5527     if (simple_key_cache_insert((void *) partition,
5528                                 file, filepos, level,
5529                                 buff, w_length))
5530       DBUG_RETURN(1);
5531 
5532     filepos+= w_length;
5533     buff+= w_length;
    offset= 0;
5535   } while ((length-= w_length));
5536 
5537   DBUG_RETURN(0);
5538 }
5539 
5540 
5541 /*
5542   Write data from a buffer into a partitioned key cache
5543 
5544   SYNOPSIS
5545 
5546     partitioned_key_cache_write()
5547     keycache            pointer to the control block of a partitioned key cache
5548     file                handler for the file to write data to
5549     filepos             position in the file to write data to
5550     level               determines the weight of the data
5551     buff                buffer with the data
5552     length              length of the buffer
    dont_write          if it is 0 then all dirty pages involved in writing
5554                         should have been flushed from key cache
5555     file_extra          maps of key cache partitions containing
5556                         dirty pages from file
5557 
5558   DESCRIPTION
5559     This function is the implementation of the key_cache_write interface
5560     function that is employed by partitioned key caches.
5561     The function takes the parameter keycache as a pointer to the
5562     control block structure of the type PARTITIONED_KEY_CACHE_CB for a
5563     partitioned key cache.
5564     In a general case the function copies data from a buffer into the key
5565     cache. The buffer is specified with the parameters buff and length -
5566     the pointer to the beginning of the buffer and its size respectively.
5567     It's assumed the buffer contains the data to be written into 'file'
5568     starting from the position filepos. The data is copied from the buffer
5569     in key_cache_block_size increments. For every portion of data the
5570     function finds out in what simple key cache from the array of partitions
5571     the data must be stored, and after this calls simple_key_cache_write to
5572     copy the data into a key buffer of this simple key cache.
5573     If the value of the parameter dont_write is FALSE then the function
5574     also writes the data into file.
5575     The parameter level is used to set one characteristic for the key buffers
5576     filled with the data from buff. The characteristic is employed only by
5577     the midpoint insertion strategy.
    The parameter file_extra provides a pointer to the shared bitmap of
    the partitions that may contain dirty pages for the file. This bitmap
5580     is used to optimize the function flush_partitioned_key_cache_blocks.
5581 
5582   RETURN VALUE
5583     0 if a success, 1 - otherwise.
5584 
5585   NOTES
5586     This implementation exploits the fact that the function is called only
5587     when a thread has got an exclusive lock for the key file.
5588 */
5589 
5590 static
5591 int partitioned_key_cache_write(PARTITIONED_KEY_CACHE_CB *keycache,
5592                                 File file, void *file_extra,
5593                                 my_off_t filepos, int level,
5594                                 uchar *buff, uint length,
5595                                 uint block_length  __attribute__((unused)),
5596                                 int dont_write)
5597 {
5598   uint w_length;
5599   ulonglong *part_map= (ulonglong *) file_extra;
5600   uint offset= (uint) (filepos % keycache->key_cache_block_size);
5601   DBUG_ENTER("partitioned_key_cache_write");
5602   DBUG_PRINT("enter",
5603              ("fd: %u  pos: %lu  length: %u  block_length: %u"
5604               "  key_block_length: %u",
5605               (uint) file, (ulong) filepos, length, block_length,
5606               keycache ? keycache->key_cache_block_size : 0));
5607 
5608 
5609   /* Write data in key_cache_block_size increments */
5610   do
5611   {
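    /*
      get_key_cache_partition_for_write() is expected both to choose the
      partition for this page and to mark that partition in *part_map as
      possibly containing dirty pages for 'file', so that
      flush_partitioned_key_cache_blocks() can later skip clean partitions.
    */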
5612     SIMPLE_KEY_CACHE_CB *partition= get_key_cache_partition_for_write(keycache,
5613                                                                       file,
5614                                                                       filepos,
5615                                                                       part_map);
    w_length= length;
    set_if_smaller(w_length, keycache->key_cache_block_size - offset);
5618     if (simple_key_cache_write(partition,
5619                                file, 0, filepos, level,
5620                                buff, w_length, block_length,
5621                                dont_write))
5622       DBUG_RETURN(1);
5623 
5624     filepos+= w_length;
5625     buff+= w_length;
5626     offset= 0;
5627   } while ((length-= w_length));
5628 
5629   DBUG_RETURN(0);
5630 }
5631 
5632 
5633 /*
5634   Flush all blocks for a file from key buffers of a partitioned key cache
5635 
5636   SYNOPSIS
5637 
5638     flush_partitioned_key_cache_blocks()
5639     keycache            pointer to the control block of a partitioned key cache
5640     file                handler for the file to flush to
5641     file_extra          maps of key cache partitions containing
5642                         dirty pages from file (not used)
5643     flush_type          type of the flush operation
5644 
5645   DESCRIPTION
5646     This function is the implementation of the flush_key_blocks interface
5647     function that is employed by partitioned key caches.
5648     The function takes the parameter keycache as a pointer to the
5649     control block structure of the type PARTITIONED_KEY_CACHE_CB for a
5650     partitioned key cache.
5651     In a general case the function flushes the data from all dirty key
5652     buffers related to the file 'file' into this file. The function does
5653     exactly this if the value of the parameter type is FLUSH_KEEP. If the
5654     value of this parameter is FLUSH_RELEASE, the function additionally
5655     releases the key buffers containing data from 'file' for new usage.
5656     If the value of the parameter type is FLUSH_IGNORE_CHANGED the function
5657     just releases the key buffers containing data from 'file'.
5658     The function performs the operation by calling the function
5659     flush_simple_key_cache_blocks for the elements of the array of the
    simple key caches that comprise the partitioned key cache. If the value
    of the parameter type is FLUSH_KEEP or FLUSH_FORCE_WRITE then
    flush_simple_key_cache_blocks is called only for the partitions with
    possibly dirty pages marked in the bitmap pointed to by the parameter
    file_extra.
5664 
5665   RETURN
5666     0   ok
5667     1  error
5668 
5669   NOTES
5670     This implementation exploits the fact that the function is called only
5671     when a thread has got an exclusive lock for the key file.
5672 */
5673 
5674 static
5675 int flush_partitioned_key_cache_blocks(PARTITIONED_KEY_CACHE_CB *keycache,
5676                                        File file, void *file_extra,
5677                                        enum flush_type type)
5678 {
5679   uint i;
5680   uint partitions= keycache->partitions;
5681   int err= 0;
5682   ulonglong *dirty_part_map= (ulonglong *) file_extra;
5683   DBUG_ENTER("partitioned_flush_key_blocks");
5684   DBUG_PRINT("enter", ("keycache: %p",  keycache));
5685 
5686   for (i= 0; i < partitions; i++)
5687   {
5688     SIMPLE_KEY_CACHE_CB *partition= keycache->partition_array[i];
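    /*
      For FLUSH_KEEP and FLUSH_FORCE_WRITE a partition can be skipped if its
      bit is not set in the dirty partition map: bit i of *dirty_part_map is
      set only if partition i may contain dirty pages for this file.
    */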
5689     if ((type == FLUSH_KEEP || type == FLUSH_FORCE_WRITE) &&
5690         !((*dirty_part_map) & ((ulonglong) 1 << i)))
5691       continue;
5692     err|= MY_TEST(flush_simple_key_cache_blocks(partition, file, 0, type));
5693   }
5694   *dirty_part_map= 0;
5695 
5696   DBUG_RETURN(err);
5697 }
5698 
5699 
5700 /*
5701   Reset the counters of a partitioned key cache
5702 
5703   SYNOPSIS
5704     reset_partitioned_key_cache_counters()
5705     name                the name of a key cache
5706     keycache            pointer to the control block of a partitioned key cache
5707 
5708   DESCRIPTION
5709     This function is the implementation of the reset_key_cache_counters
5710     interface function that is employed by partitioned key caches.
5711     The function takes the parameter keycache as a pointer to the
5712     control block structure of the type PARTITIONED_KEY_CACHE_CB for a partitioned
5713     key cache.
5714     This function resets the values of the statistical counters of the simple
    key caches comprising the partitioned key cache to 0. It does this by
    calling reset_simple_key_cache_counters for each key cache partition.
5717     The parameter name is currently not used.
5718 
5719   RETURN
5720     0 on success (always because it can't fail)
5721 */
5722 
5723 static int
5724 reset_partitioned_key_cache_counters(const char *name __attribute__((unused)),
5725                                      PARTITIONED_KEY_CACHE_CB *keycache)
5726 {
5727   uint i;
5728   uint partitions= keycache->partitions;
5729   DBUG_ENTER("partitioned_reset_key_cache_counters");
5730 
5731   for (i = 0; i < partitions; i++)
5732   {
5733     reset_simple_key_cache_counters(name,  keycache->partition_array[i]);
5734   }
5735   DBUG_RETURN(0);
5736 }
5737 
5738 
5739 /*
  Get statistics for a partitioned key cache
5741 
5742   SYNOPSIS
5743     get_partitioned_key_cache_statistics()
5744     keycache            pointer to the control block of a partitioned key cache
5745     partition_no        partition number to get statistics for
5746     key_cache_stats OUT pointer to the structure for the returned statistics
5747 
5748   DESCRIPTION
5749     This function is the implementation of the get_key_cache_statistics
5750     interface function that is employed by partitioned key caches.
5751     The function takes the parameter keycache as a pointer to the
5752     control block structure of the type PARTITIONED_KEY_CACHE_CB for
5753     a partitioned key cache.
5754     If the value of the parameter partition_no is equal to 0 then aggregated
    statistics for all partitions are returned in the fields of the
    structure key_cache_stats of the type KEY_CACHE_STATISTICS. Otherwise
    the function returns data for the partition number partition_no of the
    key cache in the structure key_cache_stats. (Here partitions are numbered
5759     starting from 1.)
5760 
5761   RETURN
5762     none
5763 */
5764 
5765 static
5766 void
5767 get_partitioned_key_cache_statistics(PARTITIONED_KEY_CACHE_CB *keycache,
5768                                      uint partition_no,
5769                                      KEY_CACHE_STATISTICS *keycache_stats)
5770 {
5771   uint i;
5772   SIMPLE_KEY_CACHE_CB *partition;
5773   uint partitions= keycache->partitions;
5774   DBUG_ENTER("get_partitioned_key_cache_statistics");
5775 
5776   if (partition_no != 0)
5777   {
5778     partition= keycache->partition_array[partition_no-1];
5779     get_simple_key_cache_statistics((void *) partition, 0, keycache_stats);
5780     DBUG_VOID_RETURN;
5781   }
5782   bzero(keycache_stats, sizeof(KEY_CACHE_STATISTICS));
5783   keycache_stats->mem_size= (longlong) keycache->key_cache_mem_size;
5784   keycache_stats->block_size= (longlong) keycache->key_cache_block_size;
5785   for (i = 0; i < partitions; i++)
5786   {
5787     partition= keycache->partition_array[i];
5788     keycache_stats->blocks_used+= partition->blocks_used;
5789     keycache_stats->blocks_unused+= partition->blocks_unused;
5790     keycache_stats->blocks_changed+= partition->global_blocks_changed;
5791     keycache_stats->blocks_warm+= partition->warm_blocks;
5792     keycache_stats->read_requests+= partition->global_cache_r_requests;
5793     keycache_stats->reads+= partition->global_cache_read;
5794     keycache_stats->write_requests+= partition->global_cache_w_requests;
5795     keycache_stats->writes+= partition->global_cache_write;
5796   }
5797   DBUG_VOID_RETURN;
5798 }
5799 
5800 /*
5801   The array of pointers to the key cache interface functions used by
  partitioned key caches. Any partitioned key cache object exploits
  this array.
5804 
  The current implementation of these functions does not allow calling
  them from the MySQL server code directly. The key cache interface
5807   wrappers must be used for this purpose.
5808 */
5809 
5810 static KEY_CACHE_FUNCS partitioned_key_cache_funcs =
5811 {
5812   (INIT_KEY_CACHE) init_partitioned_key_cache,
5813   (RESIZE_KEY_CACHE) resize_partitioned_key_cache,
5814   (CHANGE_KEY_CACHE_PARAM) change_partitioned_key_cache_param,
5815   (KEY_CACHE_READ) partitioned_key_cache_read,
5816   (KEY_CACHE_INSERT) partitioned_key_cache_insert,
5817   (KEY_CACHE_WRITE) partitioned_key_cache_write,
5818   (FLUSH_KEY_BLOCKS) flush_partitioned_key_cache_blocks,
5819   (RESET_KEY_CACHE_COUNTERS) reset_partitioned_key_cache_counters,
5820   (END_KEY_CACHE) end_partitioned_key_cache,
5821   (GET_KEY_CACHE_STATISTICS) get_partitioned_key_cache_statistics,
5822 };
5823 
5824 
5825 /******************************************************************************
5826   Key Cache Interface Module
5827 
5828   The module contains wrappers for all key cache interface functions.
5829 
5830   Currently there are key caches of two types: simple key caches and
5831   partitioned key caches. Each type (class) has its own implementation of the
  basic key cache operations used by the MyISAM storage engine. The pointers
  to the implementation functions are stored in two static structures of the
  type KEY_CACHE_FUNCS: simple_key_cache_funcs - for simple key caches, and
  partitioned_key_cache_funcs - for partitioned key caches. When a key cache
  object is created the constructor procedure init_key_cache places a pointer
  to the corresponding table into one of its fields. The procedure also
  initializes a control block for the key cache object and saves the pointer
  to this block in another field of the key cache object.
  When a key cache wrapper function is invoked for a key cache object to
  perform a basic key cache operation it looks into the interface table
  associated with the key cache object and calls the corresponding
  implementation of the operation. It passes the saved key cache control
  block to this implementation. If, for some reason, the control block
  has not been fully initialized yet, the wrapper function either does
  nothing or, when it performs a read/write operation, does the operation
  directly through the system I/O functions.

  As we can see, the model with which the key cache interface is supported
  is quite conventional for interfaces in general.
5851 
5852 ******************************************************************************/
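
/*
  A hypothetical usage sketch of the interface wrappers defined in this
  module. The numeric values, the weight level 3 and the open file handle
  'file' are made up for illustration only and are not taken from the
  server code:

    KEY_CACHE cache;
    uchar page[1024];
    uchar *res= 0;

    bzero(&cache, sizeof(cache));
    if (init_key_cache(&cache, 1024, (size_t) 8*1024*1024, 100, 300,
                       512, 0) > 0)
      res= key_cache_read(&cache, file, (my_off_t) 0, 3,
                          page, 1024, 1024, 0);
    end_key_cache(&cache, TRUE);
*/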
5853 
5854 static
5855 int repartition_key_cache_internal(KEY_CACHE *keycache,
5856                                    uint key_cache_block_size, size_t use_mem,
5857                                    uint division_limit, uint age_threshold,
5858                                    uint changed_blocks_hash_size,
5859                                    uint partitions, my_bool use_op_lock);
5860 
5861 /*
5862   Initialize a key cache : internal
5863 
5864   SYNOPSIS
5865     init_key_cache_internal()
5866     keycache           pointer to the key cache to be initialized
5867     key_cache_block_size    size of blocks to keep cached data
5868     use_mem             total memory to use for cache buffers/structures
5869     division_limit      division limit (may be zero)
5870     age_threshold       age threshold (may be zero)
    changed_blocks_hash_size Number of hash buckets to hold links to different
                        files. Should be proportional to the number of
                        different files used.
5874     partitions          Number of partitions in the key cache
5875     use_op_lock         if TRUE use keycache->op_lock, otherwise - ignore it
5876 
5877   DESCRIPTION
5878     The function performs the actions required from init_key_cache().
5879     It has an additional parameter: use_op_lock. When the parameter
    is TRUE then the function initializes keycache->op_lock if needed,
5881     then locks it, and unlocks it before the return. Otherwise the actions
5882     with the lock are omitted.
5883 
5884   RETURN VALUE
5885     total number of blocks in key cache partitions, if successful,
5886     <= 0 - otherwise.
5887 
5888   NOTES
    If keycache->key_cache_inited != 0 we assume that the memory
    for the control block of the key cache has already been allocated.
5891 */
5892 
5893 static
5894 int init_key_cache_internal(KEY_CACHE *keycache, uint key_cache_block_size,
5895 		            size_t use_mem, uint division_limit,
5896 		            uint age_threshold, uint changed_blocks_hash_size,
5897                             uint partitions,
5898                             my_bool use_op_lock)
5899 {
5900   void *keycache_cb;
5901   int blocks;
5902   if (keycache->key_cache_inited)
5903   {
5904     if (use_op_lock)
5905       pthread_mutex_lock(&keycache->op_lock);
5906     keycache_cb= keycache->keycache_cb;
5907   }
5908   else
5909   {
5910     if (partitions == 0)
5911     {
5912       if (!(keycache_cb= (void *)  my_malloc(sizeof(SIMPLE_KEY_CACHE_CB),
5913                                              MYF(0))))
5914         return 0;
5915       ((SIMPLE_KEY_CACHE_CB *) keycache_cb)->key_cache_inited= 0;
5916       keycache->key_cache_type= SIMPLE_KEY_CACHE;
5917       keycache->interface_funcs= &simple_key_cache_funcs;
5918     }
5919     else
5920     {
5921       if (!(keycache_cb= (void *)  my_malloc(sizeof(PARTITIONED_KEY_CACHE_CB),
5922                                              MYF(0))))
5923         return 0;
5924       ((PARTITIONED_KEY_CACHE_CB *) keycache_cb)->key_cache_inited= 0;
5925       keycache->key_cache_type= PARTITIONED_KEY_CACHE;
5926       keycache->interface_funcs= &partitioned_key_cache_funcs;
5927     }
5928     /*
5929       Initialize op_lock if it's not initialized before.
5930       The mutex may have been initialized before if we are being called
5931       from repartition_key_cache_internal().
5932     */
5933     if (use_op_lock)
5934       pthread_mutex_init(&keycache->op_lock, MY_MUTEX_INIT_FAST);
5935     keycache->keycache_cb= keycache_cb;
5936     keycache->key_cache_inited= 1;
5937     if (use_op_lock)
5938       pthread_mutex_lock(&keycache->op_lock);
5939   }
5940 
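  /*
    For a partitioned key cache pass the requested number of partitions to
    the control block before calling init; the init implementation may
    adjust this number, so the final value is read back below.
  */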
5941   if (partitions != 0)
5942   {
5943     ((PARTITIONED_KEY_CACHE_CB *) keycache_cb)->partitions= partitions;
5944   }
5945   keycache->can_be_used= 0;
5946   blocks= keycache->interface_funcs->init(keycache_cb, key_cache_block_size,
5947                                           use_mem, division_limit,
5948                                           age_threshold, changed_blocks_hash_size);
5949   keycache->partitions= partitions ?
5950                         ((PARTITIONED_KEY_CACHE_CB *) keycache_cb)->partitions :
5951                         0;
5952   DBUG_ASSERT(partitions <= MAX_KEY_CACHE_PARTITIONS);
5953   keycache->key_cache_mem_size=
5954     keycache->partitions ?
5955     ((PARTITIONED_KEY_CACHE_CB *) keycache_cb)->key_cache_mem_size :
5956     ((SIMPLE_KEY_CACHE_CB *) keycache_cb)->key_cache_mem_size;
5957   if (blocks > 0)
5958     keycache->can_be_used= 1;
5959   if (use_op_lock)
5960     pthread_mutex_unlock(&keycache->op_lock);
5961   return blocks;
5962 }
5963 
5964 
5965 /*
5966   Initialize a key cache
5967 
5968   SYNOPSIS
5969     init_key_cache()
5970     keycache           pointer to the key cache to be initialized
5971     key_cache_block_size    size of blocks to keep cached data
5972     use_mem             total memory to use for cache buffers/structures
5973     division_limit      division limit (may be zero)
    age_threshold       age threshold (may be zero)
    changed_blocks_hash_size number of hash buckets to hold links to the
                        different files used
    partitions          number of partitions in the key cache
5976 
5977   DESCRIPTION
5978     The function creates a control block structure for a key cache and
5979     places the pointer to this block in the structure keycache.
5980     If the value of the parameter 'partitions' is 0 then a simple key cache
5981     is created. Otherwise a partitioned key cache with the specified number
5982     of partitions is created.
5983     The parameter key_cache_block_size specifies the size of the blocks in
5984     the key cache to be created. The parameters division_limit and
5985     age_threshold determine the initial values of those characteristics of
5986     the key cache that are used for midpoint insertion strategy. The parameter
5987     use_mem  specifies the total amount of memory to be allocated for the
5988     key cache buffers and for all auxiliary structures.
5989     The function calls init_key_cache_internal() to perform all these actions
5990     with the last parameter set to TRUE.
5991 
5992   RETURN VALUE
5993     total number of blocks in key cache partitions, if successful,
5994     <= 0 - otherwise.
5995 
5996   NOTES
5997     It's assumed that no two threads call this function simultaneously
5998     referring to the same key cache handle.
5999 */
6000 
6001 int init_key_cache(KEY_CACHE *keycache, uint key_cache_block_size,
6002 		   size_t use_mem, uint division_limit,
6003 		   uint age_threshold, uint changed_blocks_hash_size,
6004                    uint partitions)
6005 {
6006   return init_key_cache_internal(keycache,  key_cache_block_size, use_mem,
6007 				 division_limit, age_threshold,
6008                                  changed_blocks_hash_size, partitions, 1);
6009 }
6010 
6011 
6012 /*
6013   Resize a key cache
6014 
6015   SYNOPSIS
6016     resize_key_cache()
6017     keycache            pointer to the key cache to be resized
6018     key_cache_block_size    size of blocks to keep cached data
6019     use_mem             total memory to use for the new key cache
6020     division_limit      new division limit (if not zero)
    age_threshold       new age threshold (if not zero)
    changed_blocks_hash_size new number of hash buckets to hold links to the
                        different files used
6022 
6023   DESCRIPTION
    The function operates over the key cache keycache.
6025     The parameter key_cache_block_size specifies the new size of the block
6026     buffers in the key cache. The parameters division_limit and age_threshold
6027     determine the new initial values of those characteristics of the key cache
6028     that are used for midpoint insertion strategy. The parameter use_mem
6029     specifies the total amount of  memory to be allocated for the key cache
6030     buffers and for all auxiliary structures.
6031 
6032   RETURN VALUE
6033     number of blocks in the key cache, if successful,
6034     0 - otherwise.
6035 
6036   NOTES
6037     The function does not block the calls and executions of other functions
6038     from the key cache interface. However it assumes that the calls of
6039     resize_key_cache itself are serialized.
6040 
6041     Currently the function is called when the values of the variables
6042     key_buffer_size and/or key_cache_block_size are being reset for
6043     the key cache keycache.
6044 */
6045 
6046 int resize_key_cache(KEY_CACHE *keycache, uint key_cache_block_size,
6047 		     size_t use_mem, uint division_limit, uint age_threshold,
6048                      uint changed_blocks_hash_size)
6049 {
6050   int blocks= -1;
6051   if (keycache->key_cache_inited)
6052   {
6053     pthread_mutex_lock(&keycache->op_lock);
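    /*
      If the number of partitions requested through param_partitions differs
      from the current number of partitions, the resize is performed as a
      full repartition of the key cache; otherwise the existing cache is
      resized in place.
    */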
6054     if ((uint) keycache->param_partitions != keycache->partitions && use_mem)
6055       blocks= repartition_key_cache_internal(keycache,
6056                                              key_cache_block_size, use_mem,
6057                                              division_limit, age_threshold,
6058                                              changed_blocks_hash_size,
6059                                              (uint) keycache->param_partitions,
6060                                              0);
6061     else
6062     {
6063       blocks= keycache->interface_funcs->resize(keycache->keycache_cb,
6064                                                 key_cache_block_size,
6065                                                 use_mem, division_limit,
6066                                                 age_threshold,
6067                                                 changed_blocks_hash_size);
6068 
6069       if (keycache->partitions)
6070         keycache->partitions=
6071           ((PARTITIONED_KEY_CACHE_CB *)(keycache->keycache_cb))->partitions;
6072     }
6073 
6074     keycache->key_cache_mem_size=
6075     keycache->partitions ?
6076     ((PARTITIONED_KEY_CACHE_CB *)(keycache->keycache_cb))->key_cache_mem_size :
6077     ((SIMPLE_KEY_CACHE_CB *)(keycache->keycache_cb))->key_cache_mem_size;
6078 
6079     keycache->can_be_used= (blocks >= 0);
6080     pthread_mutex_unlock(&keycache->op_lock);
6081   }
6082   return blocks;
6083 }
6084 
6085 
6086 /*
6087   Change key cache parameters of a key cache
6088 
6089   SYNOPSIS
6090     change_key_cache_param()
6091     keycache            pointer to the key cache to change parameters for
6092     division_limit      new division limit (if not zero)
6093     age_threshold       new age threshold (if not zero)
6094 
6095   DESCRIPTION
6096     The function sets new values of the division limit and the age threshold
    used when the key cache keycache employs the midpoint insertion strategy.
6098     The parameters division_limit and age_threshold provide these new values.
6099 
6100   RETURN VALUE
6101     none
6102 
6103   NOTES
6104     Currently the function is called when the values of the variables
6105     key_cache_division_limit and/or key_cache_age_threshold are being reset
6106     for the key cache keycache.
6107 */
6108 
6109 void change_key_cache_param(KEY_CACHE *keycache, uint division_limit,
6110 			    uint age_threshold)
6111 {
6112   if (keycache->key_cache_inited)
6113   {
6114     pthread_mutex_lock(&keycache->op_lock);
6115     keycache->interface_funcs->change_param(keycache->keycache_cb,
6116                                             division_limit,
6117                                             age_threshold);
6118     pthread_mutex_unlock(&keycache->op_lock);
6119   }
6120 }
6121 
6122 
6123 /*
6124   Destroy a key cache : internal
6125 
6126   SYNOPSIS
6127     end_key_cache_internal()
6128     keycache            pointer to the key cache to be destroyed
6129     cleanup             <=> complete free
6130     use_op_lock         if TRUE use keycache->op_lock, otherwise - ignore it
6131 
6132   DESCRIPTION
6133     The function performs the actions required from end_key_cache().
6134     It has an additional parameter: use_op_lock. When the parameter
    is TRUE then the function destroys keycache->op_lock if cleanup is TRUE.
6136     Otherwise the action with the lock is omitted.
6137 
6138   RETURN VALUE
6139     none
6140 */
6141 
6142 static
6143 void end_key_cache_internal(KEY_CACHE *keycache, my_bool cleanup,
6144                             my_bool use_op_lock)
6145 {
6146   if (keycache->key_cache_inited)
6147   {
6148     keycache->interface_funcs->end(keycache->keycache_cb, cleanup);
6149     if (cleanup)
6150     {
6151       if (keycache->keycache_cb)
6152       {
6153         my_free(keycache->keycache_cb);
6154         keycache->keycache_cb= 0;
6155       }
6156       /*
6157         We do not destroy op_lock if we are going to reuse the same key cache.
6158         This happens if we are called from  repartition_key_cache_internal().
6159       */
6160       if (use_op_lock)
6161         pthread_mutex_destroy(&keycache->op_lock);
6162       keycache->key_cache_inited= 0;
6163     }
6164     keycache->can_be_used= 0;
6165   }
6166 }
6167 
6168 
6169 /*
6170   Destroy a key cache
6171 
6172   SYNOPSIS
6173     end_key_cache()
6174     keycache            pointer to the key cache to be destroyed
6175     cleanup             <=> complete free
6176 
6177   DESCRIPTION
6178     The function frees the memory allocated for the cache blocks and
6179     auxiliary structures used by the key cache keycache. If the value
6180     of the parameter cleanup is TRUE then all resources used by the key
6181     cache are to be freed.
6182     The function calls end_key_cache_internal() to perform all these actions
6183     with the last parameter set to TRUE.
6184 
6185   RETURN VALUE
6186     none
6187 */
6188 
6189 void end_key_cache(KEY_CACHE *keycache, my_bool cleanup)
6190 {
6191   end_key_cache_internal(keycache, cleanup, 1);
6192 }
6193 
6194 
6195 /*
6196   Read a block of data from a key cache into a buffer
6197 
6198   SYNOPSIS
6199 
6200     key_cache_read()
6201     keycache            pointer to the key cache to read data from
6202     file                handler for the file for the block of data to be read
6203     filepos             position of the block of data in the file
6204     level               determines the weight of the data
6205     buff                buffer to where the data must be placed
6206     length              length of the buffer
6207     block_length        length of the data read from a key cache block
6208     return_buffer       return pointer to the key cache buffer with the data
6209 
6210   DESCRIPTION
6211     The function operates over buffers of the key cache keycache.
6212     In a general case the function reads a block of data from the key cache
6213     into the buffer buff of the size specified by the parameter length. The
6214     beginning of the block of data to be read is specified by the parameters
6215     file and filepos. The length of the read data is the same as the length
6216     of the buffer.
6217     If the parameter return_buffer is not ignored and its value is TRUE, and
6218     the data to be read of the specified size block_length can be read from one
6219     key cache buffer, then the function returns a pointer to the data in the
6220     key cache buffer.
6221     The parameter 'level' is used only by the midpoint insertion strategy
6222     when the data or its portion cannot be found in the key cache.
6223     The function reads data into the buffer directly from file if the control
6224     block of the key cache has not been initialized yet.
6225 
6226   RETURN VALUE
6227     Returns address from where the data is placed if successful, 0 - otherwise.
6228 
  NOTES
6230     Filepos must be a multiple of 'block_length', but it doesn't
6231     have to be a multiple of key_cache_block_size;
6232 */
6233 
6234 uchar *key_cache_read(KEY_CACHE *keycache,
6235                       File file, my_off_t filepos, int level,
6236                       uchar *buff, uint length,
6237 		      uint block_length, int return_buffer)
6238 {
6239   if (keycache->can_be_used)
6240     return keycache->interface_funcs->read(keycache->keycache_cb,
6241                                            file, filepos, level,
6242                                            buff, length,
6243                                            block_length, return_buffer);
6244 
6245   /* We can't use mutex here as the key cache may not be initialized */
6246 
6247   if (my_pread(file, (uchar*) buff, length, filepos, MYF(MY_NABP)))
6248     return (uchar *) 0;
6249 
6250   return buff;
6251 }
6252 
6253 
6254 /*
6255   Insert a block of file data from a buffer into a key cache
6256 
6257   SYNOPSIS
6258     key_cache_insert()
6259     keycache            pointer to the key cache to insert data into
6260     file                handler for the file to insert data from
6261     filepos             position of the block of data in the file to insert
6262     level               determines the weight of the data
6263     buff                buffer to read data from
6264     length              length of the data in the buffer
6265 
6266   DESCRIPTION
6267     The function operates over buffers of the key cache keycache.
6268     The function writes a block of file data from a buffer into the key cache.
6269     The buffer is specified with the parameters buff and length - the pointer
6270     to the beginning of the buffer and its size respectively. It's assumed
    that the buffer contains the data from 'file' starting from the position
6272     filepos.
6273     The parameter level is used to set one characteristic for the key buffers
6274     loaded with the data from buff. The characteristic is used only by the
6275     midpoint insertion strategy.
6276 
6277   RETURN VALUE
6278     0 if a success, 1 - otherwise.
6279 
6280   NOTES
    The function is used by MyISAM to move all blocks from an index file to
6282     the key cache.
6283     It is assumed that it may be performed in parallel with reading the file
6284     data from the key buffers by other threads.
6285 */
6286 
6287 int key_cache_insert(KEY_CACHE *keycache,
6288                      File file, my_off_t filepos, int level,
6289                      uchar *buff, uint length)
6290 {
6291   if (keycache->can_be_used)
6292     return keycache->interface_funcs->insert(keycache->keycache_cb,
6293                                              file, filepos, level,
6294                                              buff, length);
6295   return 0;
6296 }
6297 
6298 
6299 /*
6300   Write data from a buffer into a key cache
6301 
6302   SYNOPSIS
6303 
6304     key_cache_write()
6305     keycache            pointer to the key cache to write data to
6306     file                handler for the file to write data to
6307     filepos             position in the file to write data to
6308     level               determines the weight of the data
6309     buff                buffer with the data
6310     length              length of the buffer
    dont_write          if it is 0 then all dirty pages involved in writing
6312                         should have been flushed from key cache
6313     file_extra          pointer to optional file attributes
6314 
6315   DESCRIPTION
6316     The function operates over buffers of the key cache keycache.
6317     In a general case the function writes data from a buffer into the key
6318     cache. The buffer is specified with the parameters buff and length -
6319     the pointer to the beginning of the buffer and its size respectively.
6320     It's assumed the buffer contains the data to be written into 'file'
6321     starting from the position filepos.
6322     If the value of the parameter dont_write is FALSE then the function
6323     also writes the data into file.
6324     The parameter level is used to set one characteristic for the key buffers
6325     filled with the data from buff. The characteristic is employed only by
6326     the midpoint insertion strategy.
    The parameter file_extra may point to additional file attributes used
6328     for optimization or other purposes.
6329     The function writes data from the buffer directly into file if the control
6330     block of the key cache has not been initialized yet.
6331 
6332   RETURN VALUE
6333     0 if a success, 1 - otherwise.
6334 
6335   NOTES
6336     This implementation may exploit the fact that the function is called only
6337     when a thread has got an exclusive lock for the key file.
6338 */
6339 
6340 int key_cache_write(KEY_CACHE *keycache,
6341                     File file, void *file_extra,
6342                     my_off_t filepos, int level,
6343                     uchar *buff, uint length,
6344 		    uint block_length, int force_write)
6345 {
6346   if (keycache->can_be_used)
6347     return keycache->interface_funcs->write(keycache->keycache_cb,
6348                                             file, file_extra,
6349                                             filepos, level,
6350                                             buff, length,
6351                                             block_length, force_write);
6352 
6353   /* We can't use mutex here as the key cache may not be initialized */
6354   if (my_pwrite(file, buff, length, filepos, MYF(MY_NABP | MY_WAIT_IF_FULL)))
6355     return 1;
6356 
6357   return 0;
6358 }
6359 
6360 
6361 /*
6362   Flush all blocks for a file from key buffers of a key cache
6363 
6364   SYNOPSIS
6365 
6366     flush_key_blocks()
6367     keycache            pointer to the key cache whose blocks are to be flushed
6368     file                handler for the file to flush to
    file_extra          map of key cache partitions containing dirty pages
                        from file (used only by partitioned key caches)
6370     flush_type          type of the flush operation
6371 
6372   DESCRIPTION
6373     The function operates over buffers of the key cache keycache.
6374     In a general case the function flushes the data from all dirty key
6375     buffers related to the file 'file' into this file. The function does
6376     exactly this if the value of the parameter type is FLUSH_KEEP. If the
6377     value of this parameter is FLUSH_RELEASE, the function additionally
6378     releases the key buffers containing data from 'file' for new usage.
6379     If the value of the parameter type is FLUSH_IGNORE_CHANGED the function
6380     just releases the key buffers containing data from 'file'.
6381     If the value of the parameter type is FLUSH_KEEP the function may use
6382     the value of the parameter file_extra pointing to possibly dirty
6383     partitions to optimize the operation for partitioned key caches.
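
  EXAMPLE
    A hypothetical call for a partitioned key cache, assuming the caller
    keeps a ulonglong map of possibly dirty partitions for the file and
    passes its address as file_extra (bits in the map would be expected to
    have been set by earlier key_cache_write calls through the same
    file_extra pointer):

      ulonglong dirty_part_map= 0;

      if (flush_key_blocks(keycache, file, &dirty_part_map, FLUSH_KEEP))
        error= 1;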
6384 
6385   RETURN
6386     0   ok
6387     1  error
6388 
6389   NOTES
6390     Any implementation of the function may exploit the fact that the function
6391     is called only when a thread has got an exclusive lock for the key file.
6392 */
6393 
6394 int flush_key_blocks(KEY_CACHE *keycache,
6395                      int file, void *file_extra,
6396                      enum flush_type type)
6397 {
6398   if (keycache->can_be_used)
6399     return keycache->interface_funcs->flush(keycache->keycache_cb,
6400                                             file, file_extra, type);
6401   return 0;
6402 }
6403 
6404 
6405 /*
6406   Reset the counters of a key cache
6407 
6408   SYNOPSIS
6409     reset_key_cache_counters()
6410     name          the name of a key cache (unused)
6411     keycache      pointer to the key cache for which to reset counters
6412 
6413   DESCRIPTION
6414     This function resets the values of the statistical counters for the key
6415     cache keycache.
6416     The parameter name is currently not used.
6417 
6418   RETURN
6419     0 on success (always because it can't fail)
6420 
6421   NOTES
6422    This procedure is used by process_key_caches() to reset the counters of all
6423    currently used key caches, both the default one and the named ones.
6424 */
6425 
6426 int reset_key_cache_counters(const char *name __attribute__((unused)),
6427                              KEY_CACHE *keycache,
6428                              void *unused __attribute__((unused)))
6429 {
6430   int rc= 0;
6431   if (keycache->key_cache_inited)
6432   {
6433     pthread_mutex_lock(&keycache->op_lock);
6434     rc= keycache->interface_funcs->reset_counters(name,
6435                                                   keycache->keycache_cb);
6436     pthread_mutex_unlock(&keycache->op_lock);
6437   }
6438   return rc;
6439 }
6440 
6441 
6442 /*
6443   Get statistics for a key cache
6444 
6445   SYNOPSIS
6446     get_key_cache_statistics()
6447     keycache            pointer to the key cache to get statistics for
6448     partition_no        partition number to get statistics for
6449     key_cache_stats OUT pointer to the structure for the returned statistics
6450 
6451   DESCRIPTION
    If the value of the parameter partition_no is equal to 0 then statistics
    for the whole key cache keycache (aggregated statistics) are returned in
    the fields of the structure key_cache_stats of the type KEY_CACHE_STATISTICS.
6455     Otherwise the value of the parameter partition_no makes sense only for
6456     a partitioned key cache. In this case the function returns statistics
6457     for the partition with the specified number partition_no.
6458 
6459   RETURN
6460     none
6461 */
6462 
6463 void get_key_cache_statistics(KEY_CACHE *keycache, uint partition_no,
6464                               KEY_CACHE_STATISTICS *key_cache_stats)
6465 {
6466   if (keycache->key_cache_inited)
6467   {
6468     pthread_mutex_lock(&keycache->op_lock);
6469     keycache->interface_funcs->get_stats(keycache->keycache_cb,
6470                                          partition_no, key_cache_stats);
6471     pthread_mutex_unlock(&keycache->op_lock);
6472   }
6473 }
6474 
6475 
6476 /*
6477   Repartition a key cache : internal
6478 
6479   SYNOPSIS
6480     repartition_key_cache_internal()
6481     keycache           pointer to the key cache to be repartitioned
6482     key_cache_block_size    size of blocks to keep cached data
6483     use_mem             total memory to use for the new key cache
6484     division_limit      new division limit (if not zero)
6485     age_threshold       new age threshold (if not zero)
    changed_blocks_hash_size new number of hash buckets to hold links to the
                        different files used
    partitions          new number of partitions in the key cache
6487     use_op_lock         if TRUE use keycache->op_lock, otherwise - ignore it
6488 
6489   DESCRIPTION
6490     The function performs the actions required from repartition_key_cache().
6491     It has an additional parameter: use_op_lock. When the parameter
6492     is TRUE then the function locks keycache->op_lock at start and
6493     unlocks it before the return. Otherwise the actions with the lock
6494     are omitted.
6495 
6496   RETURN VALUE
6497     number of blocks in the key cache, if successful,
6498     0 - otherwise.
6499 */
6500 
6501 static
6502 int repartition_key_cache_internal(KEY_CACHE *keycache,
6503                                    uint key_cache_block_size, size_t use_mem,
6504                                    uint division_limit, uint age_threshold,
6505                                    uint changed_blocks_hash_size,
6506                                    uint partitions, my_bool use_op_lock)
6507 {
  int blocks= -1;
6509   if (keycache->key_cache_inited)
6510   {
6511     if (use_op_lock)
6512       pthread_mutex_lock(&keycache->op_lock);
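    /*
      First shut down the existing cache: the resize call with use_mem == 0
      is used here to flush the old cache and release its buffers,
      end_key_cache_internal() then frees the control block while keeping
      op_lock, and init_key_cache_internal() builds a new cache with the
      requested number of partitions.
    */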
6513     keycache->interface_funcs->resize(keycache->keycache_cb,
6514                                       key_cache_block_size, 0,
6515                                       division_limit, age_threshold,
6516                                       changed_blocks_hash_size);
6517     end_key_cache_internal(keycache, 1, 0);
6518     blocks= init_key_cache_internal(keycache, key_cache_block_size, use_mem,
6519                                     division_limit, age_threshold,
6520                                     changed_blocks_hash_size, partitions,
6521                                     0);
6522     if (use_op_lock)
6523       pthread_mutex_unlock(&keycache->op_lock);
6524   }
6525   return blocks;
6526 }
6527 
6528 /*
6529   Repartition a key cache
6530 
6531   SYNOPSIS
6532     repartition_key_cache()
6533     keycache           pointer to the key cache to be repartitioned
6534     key_cache_block_size    size of blocks to keep cached data
6535     use_mem             total memory to use for the new key cache
6536     division_limit      new division limit (if not zero)
6537     age_threshold       new age threshold (if not zero)
    changed_blocks_hash_size new number of hash buckets to hold links to the
                        different files used
    partitions          new number of partitions in the key cache
6539 
6540   DESCRIPTION
6541     The function operates over the key cache keycache.
6542     The parameter partitions specifies the number of partitions in the key
6543     cache after repartitioning. If the value of this parameter is 0 then
6544     a simple key cache must be created instead of the old one.
6545     The parameter key_cache_block_size specifies the new size of the block
6546     buffers in the key cache. The parameters division_limit and age_threshold
6547     determine the new initial values of those characteristics of the key cache
6548     that are used for midpoint insertion strategy. The parameter use_mem
6549     specifies the total amount of  memory to be allocated for the new key
6550     cache buffers and for all auxiliary structures.
6551     The function calls repartition_key_cache_internal() to perform all these
6552     actions with the last parameter set to TRUE.
6553 
6554   RETURN VALUE
6555     number of blocks in the key cache, if successful,
6556     0 - otherwise.
6557 
6558   NOTES
6559     Currently the function is called when the value of the variable
6560     key_cache_partitions is being reset for the key cache keycache.
6561 */
6562 
6563 int repartition_key_cache(KEY_CACHE *keycache, uint key_cache_block_size,
6564 		          size_t use_mem, uint division_limit,
6565                           uint age_threshold, uint changed_blocks_hash_size,
6566                           uint partitions)
6567 {
6568   return repartition_key_cache_internal(keycache, key_cache_block_size, use_mem,
6569 			                division_limit, age_threshold,
6570                                         changed_blocks_hash_size,
6571                                         partitions, 1);
6572 }
6573 
6574