/* Copyright (C) 2006,2007 MySQL AB

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */

/*
  WL#3071 Maria checkpoint
  First version written by Guilhem Bichot on 2006-04-27.
*/

/* Here is the implementation of this module */

/** @todo RECOVERY BUG this is unreviewed code */
/*
  Summary:
  checkpoints are done either by a background thread (checkpoint every Nth
  second) or by a client.
  In ha_maria, it's not made available to clients, and will soon be done by a
  background thread (periodically taking checkpoints and flushing dirty
  pages).
*/

#include "maria_def.h"
#include "ma_pagecache.h"
#include "ma_blockrec.h"
#include "ma_checkpoint.h"
#include "ma_loghandler_lsn.h"
#include "ma_servicethread.h"
#include "ma_crypt.h"

41 /** @brief type of checkpoint currently running */
42 static CHECKPOINT_LEVEL checkpoint_in_progress= CHECKPOINT_NONE;
43 /** @brief protects checkpoint_in_progress */
44 static mysql_mutex_t LOCK_checkpoint;
45 /** @brief for killing the background checkpoint thread */
46 static mysql_cond_t  COND_checkpoint;
47 /** @brief control structure for checkpoint background thread */
48 static MA_SERVICE_THREAD_CONTROL checkpoint_control=
49   {0, FALSE, FALSE, &LOCK_checkpoint, &COND_checkpoint};
50 /* is ulong like pagecache->blocks_changed */
51 static uint pages_to_flush_before_next_checkpoint;
52 static PAGECACHE_FILE *dfiles, /**< data files to flush in background */
53   *dfiles_end; /**< list of data files ends here */
54 static PAGECACHE_FILE *kfiles, /**< index files to flush in background */
55   *kfiles_end; /**< list of index files ends here */
56 /* those two statistics below could serve in SHOW GLOBAL STATUS */
57 static uint checkpoints_total= 0, /**< all checkpoint requests made */
58   checkpoints_ok_total= 0; /**< all checkpoints which succeeded */
60 struct st_filter_param
61 {
62   LSN up_to_lsn; /**< only pages with rec_lsn < this LSN */
63   uint max_pages; /**< stop after flushing this number pages */
64 }; /**< information to determine which dirty pages should be flushed */
66 static enum pagecache_flush_filter_result
67 filter_flush_file_medium(enum pagecache_page_type type,
68                          pgcache_page_no_t page,
69                          LSN rec_lsn, void *arg);
70 static enum pagecache_flush_filter_result
71 filter_flush_file_full(enum pagecache_page_type type,
72                        pgcache_page_no_t page,
73                        LSN rec_lsn, void *arg);
74 static enum pagecache_flush_filter_result
75 filter_flush_file_evenly(enum pagecache_page_type type,
76                          pgcache_page_no_t pageno,
77                          LSN rec_lsn, void *arg);
78 static int really_execute_checkpoint(void);
79 pthread_handler_t ma_checkpoint_background(void *arg);
80 static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon);
82 /**
83    @brief Does a checkpoint
85    @param  level               what level of checkpoint to do
86    @param  no_wait             if another checkpoint of same or stronger level
87                                is already running, consider our job done
89    @note In ha_maria, there can never be two threads trying a checkpoint at
90    the same time.
92    @return Operation status
93     @retval 0 ok
94     @retval !=0 error
95 */
ma_checkpoint_execute(CHECKPOINT_LEVEL level,my_bool no_wait)97 int ma_checkpoint_execute(CHECKPOINT_LEVEL level, my_bool no_wait)
98 {
99   int result= 0;
100   DBUG_ENTER("ma_checkpoint_execute");
102   if (!checkpoint_control.inited)
103   {
104     /*
105       If ha_maria failed to start, maria_panic_hton is called, we come here.
106     */
107     DBUG_RETURN(0);
108   }
111   /* look for already running checkpoints */
112   mysql_mutex_lock(&LOCK_checkpoint);
113   while (checkpoint_in_progress != CHECKPOINT_NONE)
114   {
115     if (no_wait && (checkpoint_in_progress >= level))
116     {
117       /*
118         If we are the checkpoint background thread, we don't wait (it's
119         smarter to flush pages instead of waiting here while the other thread
120         finishes its checkpoint).
121       */
122       mysql_mutex_unlock(&LOCK_checkpoint);
123       goto end;
124     }
125     mysql_cond_wait(&COND_checkpoint, &LOCK_checkpoint);
126   }
128   checkpoint_in_progress= level;
129   mysql_mutex_unlock(&LOCK_checkpoint);
130   /* from then on, we are sure to be and stay the only checkpointer */
132   result= really_execute_checkpoint();
133   DBUG_EXECUTE_IF("maria_crash_after_checkpoint",
134                   { DBUG_PRINT("maria_crash", ("now")); DBUG_SUICIDE(); });
136   mysql_cond_broadcast(&COND_checkpoint);
137 end:
138   DBUG_RETURN(result);
139 }
142 /**
143    @brief Does a checkpoint, really; expects no other checkpoints
144    running.
146    Checkpoint level requested is read from checkpoint_in_progress.
148    @return Operation status
149     @retval 0   ok
150     @retval !=0 error
151 */
really_execute_checkpoint(void)153 static int really_execute_checkpoint(void)
154 {
155   uint i, error= 0;
156   /** @brief checkpoint_start_log_horizon will be stored there */
157   char *ptr;
158   LEX_STRING record_pieces[4]; /**< only malloc-ed pieces */
159   LSN min_page_rec_lsn, min_trn_rec_lsn, min_first_undo_lsn;
160   TRANSLOG_ADDRESS checkpoint_start_log_horizon;
161   char checkpoint_start_log_horizon_char[LSN_STORE_SIZE];
162   DBUG_ENTER("really_execute_checkpoint");
163   DBUG_PRINT("enter", ("level: %d", checkpoint_in_progress));
164   bzero(&record_pieces, sizeof(record_pieces));
166   /*
167     STEP 1: record current end-of-log position using log's lock. It is
168     critical for the correctness of Checkpoint (related to memory visibility
169     rules, the log's lock is a mutex).
170     "Horizon" is a lower bound of the LSN of the next log record.
171   */
172   checkpoint_start_log_horizon= translog_get_horizon();
173   DBUG_PRINT("info",("checkpoint_start_log_horizon " LSN_FMT "",
174                      LSN_IN_PARTS(checkpoint_start_log_horizon)));
175   lsn_store(checkpoint_start_log_horizon_char, checkpoint_start_log_horizon);
177   /*
178     STEP 2: fetch information about transactions.
179     We must fetch transactions before dirty pages. Indeed, a transaction
180     first sets its rec_lsn then sets the page's rec_lsn then sets its rec_lsn
181     to 0. If we fetched pages first, we may see no dirty page yet, then we
182     fetch transactions but the transaction has already reset its rec_lsn to 0
183     so we miss rec_lsn again.
184     For a similar reason (over-allocated bitmap pages) we have to fetch
185     transactions before flushing bitmap pages.
187     min_trn_rec_lsn will serve to lower the starting point of the REDO phase
188     (down from checkpoint_start_log_horizon).
189  */
190   if (unlikely(trnman_collect_transactions(&record_pieces[0],
191                                            &record_pieces[1],
192                                            &min_trn_rec_lsn,
193                                            &min_first_undo_lsn)))
194     goto err;
197   /* STEP 3: fetch information about table files */
198   if (unlikely(collect_tables(&record_pieces[2],
199                               checkpoint_start_log_horizon)))
200     goto err;
203   /* STEP 4: fetch information about dirty pages */
204   /*
205     It's better to do it _after_ having flushed some data pages (which
206     collect_tables() may have done), because those are now non-dirty and so we
207     have a more up-to-date dirty pages list to put into the checkpoint record,
208     and thus we will have less work at Recovery.
209   */
210   /* Using default pagecache for now */
211   if (unlikely(pagecache_collect_changed_blocks_with_lsn(maria_pagecache,
212                                                          &record_pieces[3],
213                                                          &min_page_rec_lsn)))
214     goto err;
217   /* LAST STEP: now write the checkpoint log record */
218   {
219     LSN lsn;
220     translog_size_t total_rec_length;
221     /*
222       the log handler is allowed to modify "str" and "length" (but not "*str")
223       of its argument, so we must not pass it record_pieces directly,
224       otherwise we would later not know what memory pieces to my_free().
225     */
227     log_array[TRANSLOG_INTERNAL_PARTS + 0].str=
228       (uchar*) checkpoint_start_log_horizon_char;
229     log_array[TRANSLOG_INTERNAL_PARTS + 0].length= total_rec_length=
230       sizeof(checkpoint_start_log_horizon_char);
231     for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++)
232     {
233       log_array[TRANSLOG_INTERNAL_PARTS + 1 + i].str= (uchar*)record_pieces[i].str;
234       log_array[TRANSLOG_INTERNAL_PARTS + 1 + i].length= record_pieces[i].length;
235       total_rec_length+= (translog_size_t) record_pieces[i].length;
236     }
237     if (unlikely(translog_write_record(&lsn, LOGREC_CHECKPOINT,
238                                        &dummy_transaction_object, NULL,
239                                        total_rec_length,
240                                        sizeof(log_array)/sizeof(log_array[0]),
241                                        log_array, NULL, NULL) ||
242                  translog_flush(lsn)))
243       goto err;
244     translog_lock();
245     /*
246       This cannot be done as a inwrite_rec_hook of LOGREC_CHECKPOINT, because
247       such hook would be called before translog_flush (and we must be sure
248       that log was flushed before we write to the control file).
249     */
250     if (unlikely(ma_control_file_write_and_force(lsn, last_logno,
251                                                  max_trid_in_control_file,
252                                                  recovery_failures)))
253     {
254       translog_unlock();
255       goto err;
256     }
257     translog_unlock();
258   }
260   /*
261     Note that we should not alter memory structures until we have successfully
262     written the checkpoint record and control file.
263   */
264   /* checkpoint succeeded */
265   ptr= record_pieces[3].str;
266   pages_to_flush_before_next_checkpoint= uint4korr(ptr);
267   DBUG_PRINT("checkpoint",("%u pages to flush before next checkpoint",
268                           pages_to_flush_before_next_checkpoint));
270   /* compute log's low-water mark */
271   {
272     TRANSLOG_ADDRESS log_low_water_mark= min_page_rec_lsn;
273     set_if_smaller(log_low_water_mark, min_trn_rec_lsn);
274     set_if_smaller(log_low_water_mark, min_first_undo_lsn);
275     set_if_smaller(log_low_water_mark, checkpoint_start_log_horizon);
276     /**
277        Now purge unneeded logs.
278        As some systems have an unreliable fsync (drive lying), we could try to
279        be robust against that: remember a few previous checkpoints in the
280        control file, and not purge logs immediately... Think about it.
281     */
282     if (translog_purge(log_low_water_mark))
283       ma_message_no_user(0, "log purging failed");
284   }
286   goto end;
288 err:
289   error= 1;
290   ma_message_no_user(0, "checkpoint failed");
291   /* we were possibly not able to determine what pages to flush */
292   pages_to_flush_before_next_checkpoint= 0;
294 end:
295   for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++)
296     my_free(record_pieces[i].str);
297   mysql_mutex_lock(&LOCK_checkpoint);
298   checkpoint_in_progress= CHECKPOINT_NONE;
299   checkpoints_total++;
300   checkpoints_ok_total+= !error;
301   mysql_mutex_unlock(&LOCK_checkpoint);
302   DBUG_RETURN(error);
303 }
306 /**
307    @brief Initializes the checkpoint module
309    @param  interval           If one wants the module to create a
310                               thread which will periodically do
311                               checkpoints, and flush dirty pages, in the
312                               background, it should specify a non-zero
313                               interval in seconds. The thread will then be
314                               created and will take checkpoints separated by
315                               approximately 'interval' second.
317    @note A checkpoint is taken only if there has been some significant
318    activity since the previous checkpoint. Between checkpoint N and N+1 the
319    thread flushes all dirty pages which were already dirty at the time of
320    checkpoint N.
322    @return Operation status
323     @retval 0   ok
324     @retval !=0 error
325 */
ma_checkpoint_init(ulong interval)327 int ma_checkpoint_init(ulong interval)
328 {
329   int res= 0;
330   DBUG_ENTER("ma_checkpoint_init");
331   if (ma_service_thread_control_init(&checkpoint_control))
332     res= 1;
333   else if (interval > 0)
334   {
335     size_t intv= interval;
336     compile_time_assert(sizeof(void *) >= sizeof(ulong));
337     if ((res= mysql_thread_create(key_thread_checkpoint,
338                                   &checkpoint_control.thread, NULL,
339                                   ma_checkpoint_background,
340                                   (void*) intv)))
341       checkpoint_control.killed= TRUE;
342   }
343   else
344     checkpoint_control.killed= TRUE;
345   DBUG_RETURN(res);
346 }
349 #ifndef DBUG_OFF
350 /**
351    Function used to test recovery: flush some table pieces and then caller
352    crashes.
354    @param  what_to_flush   0: current bitmap and all data pages
355                            1: state
356                            2: all bitmap pages
357 */
flush_all_tables(int what_to_flush)358 static void flush_all_tables(int what_to_flush)
359 {
360   int res= 0;
361   LIST *pos; /**< to iterate over open tables */
362   mysql_mutex_lock(&THR_LOCK_maria);
363   for (pos= maria_open_list; pos; pos= pos->next)
364   {
365     MARIA_HA *info= (MARIA_HA*)pos->data;
366     if (info->s->now_transactional)
367     {
368       switch (what_to_flush)
369       {
370       case 0:
371         res= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
372                                    FLUSH_KEEP, FLUSH_KEEP);
373         break;
374       case 1:
375         res= _ma_state_info_write(info->s,
376                                   MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET|
377                                   MA_STATE_INFO_WRITE_LOCK);
378         DBUG_PRINT("maria_flush_states",
379                    ("is_of_horizon: LSN " LSN_FMT,
380                     LSN_IN_PARTS(info->s->state.is_of_horizon)));
381         break;
382       case 2:
383         res= _ma_bitmap_flush_all(info->s);
384         break;
385       }
386     }
387     DBUG_ASSERT(res == 0);
388   }
389   mysql_mutex_unlock(&THR_LOCK_maria);
390 }
391 #endif
394 /**
395    @brief Destroys the checkpoint module
396 */
ma_checkpoint_end(void)398 void ma_checkpoint_end(void)
399 {
400   DBUG_ENTER("ma_checkpoint_end");
401   /*
402     Some intentional crash methods, usually triggered by
404   */
405   DBUG_EXECUTE_IF("maria_flush_bitmap",
406                   {
407                     DBUG_PRINT("maria_flush_bitmap", ("now"));
408                     flush_all_tables(2);
409                   });
410   DBUG_EXECUTE_IF("maria_flush_whole_page_cache",
411                   {
412                     DBUG_PRINT("maria_flush_whole_page_cache", ("now"));
413                     flush_all_tables(0);
414                   });
415   DBUG_EXECUTE_IF("maria_flush_whole_log",
416                   {
417                     DBUG_PRINT("maria_flush_whole_log", ("now"));
418                     translog_flush(translog_get_horizon());
419                   });
420   /*
421     Note that for WAL reasons, maria_flush_states requires
422     maria_flush_whole_log.
423   */
424   DBUG_EXECUTE_IF("maria_flush_states",
425                   {
426                     DBUG_PRINT("maria_flush_states", ("now"));
427                     flush_all_tables(1);
428                   });
429   DBUG_EXECUTE_IF("maria_crash",
430                   { DBUG_PRINT("maria_crash", ("now")); DBUG_SUICIDE(); });
432   if (checkpoint_control.inited)
433   {
434     ma_service_thread_control_end(&checkpoint_control);
435     my_free(dfiles);
436     my_free(kfiles);
437     dfiles= kfiles= NULL;
438   }
440 }
443 /**
444    @brief dirty-page filtering criteria for MEDIUM checkpoint.
446    We flush data/index pages which have been dirty since the previous
447    checkpoint (this is the two-checkpoint rule: the REDO phase will not have
448    to start from earlier than the next-to-last checkpoint).
449    Bitmap pages are handled by _ma_bitmap_flush_all().
451    @param  type                Page's type
452    @param  pageno              Page's number
453    @param  rec_lsn             Page's rec_lsn
454    @param  arg                 filter_param
455 */
457 static enum pagecache_flush_filter_result
filter_flush_file_medium(enum pagecache_page_type type,pgcache_page_no_t pageno,LSN rec_lsn,void * arg)458 filter_flush_file_medium(enum pagecache_page_type type,
459                          pgcache_page_no_t pageno __attribute__ ((unused)),
460                          LSN rec_lsn, void *arg)
461 {
462   struct st_filter_param *param= (struct st_filter_param *)arg;
463   return (type == PAGECACHE_LSN_PAGE) &&
464     (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0);
465 }
468 /**
469    @brief dirty-page filtering criteria for FULL checkpoint.
471    We flush all dirty data/index pages.
472    Bitmap pages are handled by _ma_bitmap_flush_all().
474    @param  type                Page's type
475    @param  pageno              Page's number
476    @param  rec_lsn             Page's rec_lsn
477    @param  arg                 filter_param
478 */
480 static enum pagecache_flush_filter_result
filter_flush_file_full(enum pagecache_page_type type,pgcache_page_no_t pageno,LSN rec_lsn,void * arg)481 filter_flush_file_full(enum pagecache_page_type type,
482                        pgcache_page_no_t pageno __attribute__ ((unused)),
483                        LSN rec_lsn __attribute__ ((unused)),
484                        void *arg __attribute__ ((unused)))
485 {
486   return (type == PAGECACHE_LSN_PAGE);
487 }
490 /**
491    @brief dirty-page filtering criteria for background flushing thread.
493    We flush data/index pages which have been dirty since the previous
494    checkpoint (this is the two-checkpoint rule: the REDO phase will not have
495    to start from earlier than the next-to-last checkpoint), and no
496    bitmap pages. But we flush no more than a certain number of pages (to have
497    an even flushing, no write burst).
498    The reason to not flush bitmap pages is that they may not be in a flushable
499    state at this moment and we don't want to wait for them.
501    @param  type                Page's type
502    @param  pageno              Page's number
503    @param  rec_lsn             Page's rec_lsn
504    @param  arg                 filter_param
505 */
507 static enum pagecache_flush_filter_result
filter_flush_file_evenly(enum pagecache_page_type type,pgcache_page_no_t pageno,LSN rec_lsn,void * arg)508 filter_flush_file_evenly(enum pagecache_page_type type,
509                          pgcache_page_no_t pageno __attribute__ ((unused)),
510                          LSN rec_lsn, void *arg)
511 {
512   struct st_filter_param *param= (struct st_filter_param *)arg;
513   if (unlikely(param->max_pages == 0)) /* all flushed already */
514     return FLUSH_FILTER_SKIP_ALL;
515   if ((type == PAGECACHE_LSN_PAGE) &&
516       (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0))
517   {
518     param->max_pages--;
519     return FLUSH_FILTER_OK;
520   }
522 }
525 /**
526    @brief Background thread which does checkpoints and flushes periodically.
528    Takes a checkpoint. After this, all pages dirty at the time of that
529    checkpoint are flushed evenly until it is time to take another checkpoint.
530    This ensures that the REDO phase starts at earliest (in LSN time) at the
531    next-to-last checkpoint record ("two-checkpoint rule").
533    @note MikaelR questioned why the same thread does two different jobs, the
534    risk could be that while a checkpoint happens no LRD flushing happens.
535 */
537 static ulong maria_checkpoint_min_cache_activity= 10*1024*1024;
538 /* Set in ha_maria.cc */
539 ulong maria_checkpoint_min_log_activity= 1*1024*1024;
ma_checkpoint_background(void * arg)541 pthread_handler_t ma_checkpoint_background(void *arg)
542 {
543   /** @brief At least this of log/page bytes written between checkpoints */
544   /*
545     If the interval could be changed by the user while we are in this thread,
546     it could be annoying: for example it could cause "case 2" to be executed
547     right after "case 0", thus having 'dfile' unset. So the thread cares only
548     about the interval's value when it started.
549   */
550   const size_t interval= (size_t)arg;
551   size_t sleeps, sleep_time;
552   TRANSLOG_ADDRESS log_horizon_at_last_checkpoint=
553     translog_get_horizon();
554   ulonglong pagecache_flushes_at_last_checkpoint=
555     maria_pagecache->global_cache_write;
556   uint UNINIT_VAR(pages_bunch_size);
557   struct st_filter_param filter_param;
558   PAGECACHE_FILE *UNINIT_VAR(dfile); /**< data file currently being flushed */
559   PAGECACHE_FILE *UNINIT_VAR(kfile); /**< index file currently being flushed */
561   my_thread_init();
562   DBUG_PRINT("info",("Maria background checkpoint thread starts"));
563   DBUG_ASSERT(interval > 0);
565   PSI_CALL_set_thread_user_host(0,0,0,0);
567   /*
568     Recovery ended with all tables closed and a checkpoint: no need to take
569     one immediately.
570   */
571   sleeps= 1;
572   pages_to_flush_before_next_checkpoint= 0;
574   for(;;) /* iterations of checkpoints and dirty page flushing */
575   {
576 #if 0 /* good for testing, to do a lot of checkpoints, finds a lot of bugs */
577     sleeps=0;
578 #endif
579     switch (sleeps % interval)
580     {
581     case 0:
582     {
583       /* If checkpoints are disabled, wait 1 second and try again */
584       if (maria_checkpoint_disabled)
585       {
586         sleep_time= 1;
587         break;
588       }
589       {
590         TRANSLOG_ADDRESS horizon= translog_get_horizon();
592         /*
593           With background flushing evenly distributed over the time
594           between two checkpoints, we should have only little flushing to do
595           in the checkpoint.
596         */
597         /*
598           No checkpoint if little work of interest for recovery was done
599           since last checkpoint. Such work includes log writing (lengthens
600           recovery, checkpoint would shorten it), page flushing (checkpoint
601           would decrease the amount of read pages in recovery).
602           In case of one short statement per minute (very low load), we don't
603           want to checkpoint every minute, hence the positive
604           maria_checkpoint_min_activity.
605         */
606         if ((ulonglong) (horizon - log_horizon_at_last_checkpoint) <=
607             maria_checkpoint_min_log_activity &&
608             ((ulonglong) (maria_pagecache->global_cache_write -
609                           pagecache_flushes_at_last_checkpoint) *
610              maria_pagecache->block_size) <=
611             maria_checkpoint_min_cache_activity)
612         {
613           /*
614             Not enough has happend since last checkpoint.
615             Sleep for a while and try again later
616           */
617           sleep_time= interval;
618           break;
619         }
620         sleep_time= 1;
621         ma_checkpoint_execute(CHECKPOINT_MEDIUM, TRUE);
622         /*
623           Snapshot this kind of "state" of the engine. Note that the value
624           below is possibly greater than last_checkpoint_lsn.
625         */
626         log_horizon_at_last_checkpoint= translog_get_horizon();
627         pagecache_flushes_at_last_checkpoint=
628           maria_pagecache->global_cache_write;
629         /*
630           If the checkpoint above succeeded it has set d|kfiles and
631           d|kfiles_end. If is has failed, it has set
632           pages_to_flush_before_next_checkpoint to 0 so we will skip flushing
633           and sleep until the next checkpoint.
634         */
635       }
636       break;
637     }
638     case 1:
639       /* set up parameters for background page flushing */
640       filter_param.up_to_lsn= last_checkpoint_lsn;
641       pages_bunch_size= pages_to_flush_before_next_checkpoint / (uint)interval;
642       dfile= dfiles;
643       kfile= kfiles;
644       /* fall through */
645     default:
646       if (pages_bunch_size > 0)
647       {
648         DBUG_PRINT("checkpoint",
649                    ("Maria background checkpoint thread: %u pages",
650                     pages_bunch_size));
651         /* flush a bunch of dirty pages */
652         filter_param.max_pages= pages_bunch_size;
653         while (dfile != dfiles_end)
654         {
655           /*
656             We use FLUSH_KEEP_LAZY: if a file is already in flush, it's
657             smarter to move to the next file than wait for this one to be
658             completely flushed, which may take long.
659             StaleFilePointersInFlush: notice how below we use "dfile" which
660             is an OS file descriptor plus some function and MARIA_SHARE
661             pointers; this data dates from a previous checkpoint; since then,
662             the table may have been closed (so MARIA_SHARE* became stale), and
663             the file descriptor reassigned to another table which does not
664             have the same CRC-read-set callbacks: it is thus important that
665             flush_pagecache_blocks_with_filter() does not use the pointers,
666             only the OS file descriptor.
667           */
668           int res=
669             flush_pagecache_blocks_with_filter(maria_pagecache,
670                                                dfile, FLUSH_KEEP_LAZY,
671                                                filter_flush_file_evenly,
672                                                &filter_param);
673           if (unlikely(res & PCFLUSH_ERROR))
674             ma_message_no_user(0, "background data page flush failed");
675           if (filter_param.max_pages == 0) /* bunch all flushed, sleep */
676             break; /* and we will continue with the same file */
677           dfile++; /* otherwise all this file is flushed, move to next file */
678           /*
679             MikaelR noted that he observed that Linux's file cache may never
680             fsync to  disk until this cache is full, at which point it decides
681             to empty the cache, making the machine very slow. A solution was
682             to fsync after writing 2 MB. So we might want to fsync() here if
683             we wrote enough pages.
684           */
685         }
686         while (kfile != kfiles_end)
687         {
688           int res=
689             flush_pagecache_blocks_with_filter(maria_pagecache,
690                                                kfile, FLUSH_KEEP_LAZY,
691                                                filter_flush_file_evenly,
692                                                &filter_param);
693           if (unlikely(res & PCFLUSH_ERROR))
694             ma_message_no_user(0, "background index page flush failed");
695           if (filter_param.max_pages == 0) /* bunch all flushed, sleep */
696             break; /* and we will continue with the same file */
697           kfile++; /* otherwise all this file is flushed, move to next file */
698         }
699         sleep_time= 1;
700       }
701       else
702       {
703         /* Can directly sleep until the next checkpoint moment */
704         sleep_time= interval - (sleeps % interval);
705       }
706     }
707     if (my_service_thread_sleep(&checkpoint_control,
708                                 sleep_time * 1000000000ULL))
709       break;
710     sleeps+= sleep_time;
711   }
712   DBUG_PRINT("info",("Maria background checkpoint thread ends"));
713   {
715     /*
716       That's the final one, which guarantees that a clean shutdown always ends
717       with a checkpoint.
718     */
719     DBUG_EXECUTE_IF("maria_checkpoint_indirect", level= CHECKPOINT_INDIRECT;);
720     ma_checkpoint_execute(level, FALSE);
721   }
722   my_thread_end();
723   return 0;
724 }
727 /**
728    @brief Allocates buffer and stores in it some info about open tables,
729    does some flushing on those.
731    Does the allocation because the caller cannot know the size itself.
732    Memory freeing is to be done by the caller (if the "str" member of the
733    LEX_STRING is not NULL).
734    The caller is taking a checkpoint.
736    @param[out]  str        pointer to where the allocated buffer,
737                            and its size, will be put; buffer will be filled
738                            with info about open tables
739    @param       checkpoint_start_log_horizon  Of the in-progress checkpoint
740                                               record.
742    @return Operation status
743      @retval 0      OK
744      @retval 1      Error
745 */
/*
  collect_tables(): builds the "open tables" part of a checkpoint record and
  flushes what the checkpoint needs from those tables.

  Steps:
  - under THR_LOCK_maria, makes a list of the distinct shares which are
    transactional, not temporary and not read-only, and pins them
    (MARIA_CHECKPOINT_LOOKS_AT_ME) so that they cannot be freed under us;
  - allocates str->str and, for each pinned share which is not ignored,
    stores: 2-byte short id, lsn_of_file_id, 0-terminated open_file_name;
    the first 4 bytes of the buffer receive the number of stored tables;
  - for each such share: possibly writes a copy of its state to the index
    file, flushes bitmap pages, flushes data/index pages through the filter
    selected from checkpoint_in_progress, and syncs both files;
  - un-pins every share, freeing those which maria_close() left to us
    (MARIA_CHECKPOINT_SHOULD_FREE_ME).

  @param  str                           [out] receives the allocated buffer
                                        and its real length. The buffer is not
                                        freed here, even on error (presumably
                                        the caller releases it - TODO confirm).
  @param  checkpoint_start_log_horizon  a share whose state.is_of_horizon is
                                        not older than this does not get its
                                        state written again.

  @return 0 on success, 1 on error (allocation, log flush, write or sync
          failure).

  NOTE(review): the declaration of 'filter', the three 'case' labels of the
  switch and the directive opening the debug-only 'else' were not readable in
  the reviewed text and have been restored; please compare with the tree.
*/
static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
{
  MARIA_SHARE **distinct_shares= NULL;
  char *ptr;
  uint error= 1, sync_error= 0, nb, nb_stored, i;
  my_bool unmark_tables= TRUE;
  size_t total_names_length;
  LIST *pos; /**< to iterate over open tables */
  struct st_state_copy {
    uint index;
    MARIA_STATE_INFO state;
  };
  struct st_state_copy *state_copies= NULL, /**< fixed-size cache of states */
    *state_copies_end, /**< cache ends here */
    *state_copy; /**< iterator in cache */
  TRANSLOG_ADDRESS UNINIT_VAR(state_copies_horizon); /**< horizon of states' _copies_ */
  struct st_filter_param filter_param;
  PAGECACHE_FLUSH_FILTER filter;
  PAGECACHE_FILE *new_dfiles, *new_kfiles; /**< results of my_realloc() */
  DBUG_ENTER("collect_tables");

  /* let's make a list of distinct shares */
  mysql_mutex_lock(&THR_LOCK_maria);
  for (nb= 0, pos= maria_open_list; pos; pos= pos->next)
  {
    MARIA_HA *info= (MARIA_HA*)pos->data;
    MARIA_SHARE *share= info->s;
    /* the first three variables below can never change */
    if (share->base.born_transactional && !share->temporary &&
        share->mode != O_RDONLY &&
        !(share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP))
    {
      /*
        Apart from us, only maria_close() reads/sets in_checkpoint but cannot
        run now as we hold THR_LOCK_maria.
      */
      /*
        This table is relevant for checkpoint and not already seen. Mark it,
        so that it is not seen again in the loop.
      */
      nb++;
      DBUG_ASSERT(share->in_checkpoint == 0);
      /* This flag ensures that we count only _distinct_ shares. */
      share->in_checkpoint= MARIA_CHECKPOINT_SEEN_IN_LOOP;
    }
  }
  distinct_shares= (MARIA_SHARE **)my_malloc(nb * sizeof(MARIA_SHARE *),
                                             MYF(MY_WME));
  if (unlikely(distinct_shares == NULL))
  {
    /*
      We cannot use the generic cleanup at 'err' here: it takes
      THR_LOCK_maria (which we still hold) and walks distinct_shares (which
      we do not have). So we take down the marks we have just set by walking
      the open list again, release the lock, and tell 'err' that there is
      nothing left to unmark.
    */
    for (pos= maria_open_list; pos; pos= pos->next)
    {
      MARIA_SHARE *share= ((MARIA_HA*)pos->data)->s;
      if (share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP)
        share->in_checkpoint= 0;
    }
    mysql_mutex_unlock(&THR_LOCK_maria);
    unmark_tables= FALSE;
    goto err;
  }
  for (total_names_length= 0, i= 0, pos= maria_open_list; pos; pos= pos->next)
  {
    MARIA_HA *info= (MARIA_HA*)pos->data;
    MARIA_SHARE *share= info->s;
    if (share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP)
    {
      distinct_shares[i++]= share;
      /*
        With this we prevent the share from going away while we later flush
        and force it without holding THR_LOCK_maria. For example if the share
        could be my_free()d by maria_close() we would have a problem when we
        access it to flush the table. We "pin" the share pointer.
        And we also take down MARIA_CHECKPOINT_SEEN_IN_LOOP, so that it is
        not seen again in the loop.
      */
      share->in_checkpoint= MARIA_CHECKPOINT_LOOKS_AT_ME;
      total_names_length+= share->open_file_name.length;
    }
  }

  DBUG_ASSERT(i == nb);
  mysql_mutex_unlock(&THR_LOCK_maria);
  DBUG_PRINT("info",("found %u table shares", nb));

  /* upper bound: some shares may turn out to be ignored below */
  str->length=
    4 +               /* number of tables */
    (2 +              /* short id */
     LSN_STORE_SIZE + /* first_log_write_at_lsn */
     1                /* end-of-name 0 */
     ) * nb + total_names_length;
  if (unlikely((str->str= my_malloc(str->length, MYF(MY_WME))) == NULL))
    goto err;

  ptr= str->str;
  ptr+= 4; /* real number of stored tables is not yet known */

  /* only possible checkpointer, so can do the read below without mutex */
  filter_param.up_to_lsn= last_checkpoint_lsn;
  switch(checkpoint_in_progress)
  {
  case CHECKPOINT_MEDIUM:
    filter= &filter_flush_file_medium;
    break;
  case CHECKPOINT_FULL:
    filter= &filter_flush_file_full;
    break;
  case CHECKPOINT_INDIRECT:
    filter= NULL;                               /* no page flushing */
    break;
  default:
    DBUG_ASSERT(0);
    goto err;
  }

  /*
    The principle of reading/writing the state below is explained in
    ma_recovery.c, look for "Recovery of the state".
  */
#define STATE_COPIES 1024
  state_copies= (struct st_state_copy *)
    my_malloc(STATE_COPIES * sizeof(struct st_state_copy), MYF(MY_WME));
  /*
    Grow the arrays through temporaries: if my_realloc() fails the old array
    is still allocated, and overwriting dfiles/kfiles with NULL would leak
    it.
  */
  new_dfiles= (PAGECACHE_FILE *)my_realloc((uchar *)dfiles,
                                           /* avoid size of 0 for my_realloc */
                                           MY_MAX(1, nb) *
                                           sizeof(PAGECACHE_FILE),
                                           MYF(MY_WME | MY_ALLOW_ZERO_PTR));
  if (new_dfiles != NULL)
    dfiles= new_dfiles;
  new_kfiles= (PAGECACHE_FILE *)my_realloc((uchar *)kfiles,
                                           /* avoid size of 0 for my_realloc */
                                           MY_MAX(1, nb) *
                                           sizeof(PAGECACHE_FILE),
                                           MYF(MY_WME | MY_ALLOW_ZERO_PTR));
  if (new_kfiles != NULL)
    kfiles= new_kfiles;
  /*
    Both lists are rebuilt from scratch below. Resetting the end pointers
    before the error check also guarantees that they never point into a
    buffer which my_realloc() has moved, should we bail out now.
  */
  dfiles_end= dfiles;
  kfiles_end= kfiles;
  if (unlikely((state_copies == NULL) ||
               (new_dfiles == NULL) || (new_kfiles == NULL)))
    goto err;
  state_copy= state_copies_end= NULL;

  for (nb_stored= 0, i= 0; i < nb; i++)
  {
    MARIA_SHARE *share= distinct_shares[i];
    PAGECACHE_FILE kfile, dfile;
    my_bool ignore_share;
    if (!(share->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME))
    {
      /*
        No need for a mutex to read the above, only we can write *this* bit
        of the in_checkpoint bitmap
      */
      continue;
    }
    /**
       @todo We should not look at tables which didn't change since last
       checkpoint.
    */
    DBUG_PRINT("info",("looking at table '%s'", share->open_file_name.str));
    if (state_copy == state_copies_end) /* we have no more cached states */
    {
      /*
        Collect and cache a bunch of states. We do this for many states at a
        time, to not lock/unlock the log's lock too often.
      */
      uint j, bound= MY_MIN(nb, i + STATE_COPIES);
      state_copy= state_copies;
      /* part of the state is protected by log's lock */
      translog_lock();
      state_copies_horizon= translog_get_horizon_no_lock();
      for (j= i; j < bound; j++)
      {
        MARIA_SHARE *share2= distinct_shares[j];
        if (!(share2->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME))
          continue;
        state_copy->index= j;
        state_copy->state= share2->state; /* we copy the state */
        state_copy++;
        /*
          data_file_length is not updated under log's lock by the bitmap
          code, but writing a wrong data_file_length is ok: a next
          maria_close() will correct it; if we crash before, Recovery will
          set it to the true physical size.
        */
      }
      translog_unlock();
      if (state_copy == state_copies)
        break;                                  /* Nothing to do */

      /**
         We are going to flush these states.
         Before, all records describing how to undo such state must be
         in the log (WAL). Usually this means UNDOs. In the special case of
         data|key_file_length, recovery just needs to open the table to fix the
         length, so any LOGREC_FILE_ID/REDO/UNDO allowing recovery to
         understand it must open a table, is enough; so as long as
         data|key_file_length is updated after writing any log record it's ok:
         if we copied new value above, it means the record was before
         state_copies_horizon and we flush such record below.
         Apart from data|key_file_length which are easily recoverable from the
         real file's size, all other state members must be updated only when
         writing the UNDO; otherwise, if updated before, if their new value is
         flushed by a checkpoint and there is a crash before UNDO is written,
         their REDO group will be missing or at least incomplete and skipped
         by recovery, so bad state value will stay. For example, setting
         key_root before writing the UNDO: the table would have old index
         pages (they were pinned at time of crash) and a new, thus wrong,
         key_root.
         @todo RECOVERY BUG check that all code honours that.
      */
      if (translog_flush(state_copies_horizon))
        goto err;
      /* now we have cached states and they are WAL-safe */
      /* state_copies_end is the last filled slot, not one past it */
      state_copies_end= state_copy-1;
      state_copy= state_copies;
    }

    /* locate our state among these cached ones */
    for ( ; state_copy->index != i; state_copy++)
      DBUG_ASSERT(state_copy <= state_copies_end);

    /* OS file descriptors are ints which we stored in 4 bytes */
    compile_time_assert(sizeof(int) <= 4);
    /*
      Protect against maria_close() (which does some memory freeing in
      MARIA_FILE_BITMAP) with close_lock. intern_lock is not
      sufficient as we, as well as maria_close(), are going to unlock
      intern_lock in the middle of manipulating the table. Serializing us and
      maria_close() should help avoid problems.
    */
    mysql_mutex_lock(&share->close_lock);
    mysql_mutex_lock(&share->intern_lock);
    /*
      Tables in a normal state have their two file descriptors open.
      In some rare cases like REPAIR, some descriptor may be closed or even
      -1. If that happened, the _ma_state_info_write() may fail. This is
      prevented by enclosing all places which close/change kfile.file with
      intern_lock.
    */
    kfile= share->kfile;
    dfile= share->bitmap.file;
    /*
      Ignore table which has no logged writes (all its future log records will
      be found naturally by Recovery). Ignore obsolete shares (_before_
      setting themselves to last_version=0 they already did all flush and
      sync; if we flush their state now we may be flushing an obsolete state
      onto a newer one (assuming the table has been reopened with a different
      share but of course same physical index file).
    */
    ignore_share= (share->id == 0) | (share->last_version == 0);
    DBUG_PRINT("info", ("ignore_share: %d", ignore_share));
    if (!ignore_share)
    {
      size_t open_file_name_len= share->open_file_name.length + 1;
      /* remember the descriptors for background flush */
      *(dfiles_end++)= dfile;
      *(kfiles_end++)= kfile;
      /* we will store this table in the record */
      nb_stored++;
      int2store(ptr, share->id);
      ptr+= 2;
      lsn_store(ptr, share->lsn_of_file_id);
      ptr+= LSN_STORE_SIZE;
      /*
        first_bitmap_with_space is not updated under log's lock, and is
        important. We would need the bitmap's lock to get it right. Recovery
        of this is not clear, so we just play safe: write it out as
        unknown: if crash, _ma_bitmap_init() at next open (for example in
        Recovery) will convert it to 0 and thus the first insertion will
        search for free space from the file's first bitmap (0) -
        under-optimal but safe.
        If no crash, maria_close() will write the exact value.
      */
      state_copy->state.first_bitmap_with_space= ~(ulonglong)0;
      /* the name is copied with its terminating 0 */
      memcpy(ptr, share->open_file_name.str, open_file_name_len);
      ptr+= open_file_name_len;
      if (cmp_translog_addr(share->state.is_of_horizon,
                            checkpoint_start_log_horizon) >= 0)
      {
        /*
          State was flushed recently, it does not hold down the log's
          low-water mark and will not give avoidable work to Recovery. So we
          needn't flush it. Also, it is possible that while we copied the
          state above (under log's lock, without intern_lock) it was being
          modified in memory or flushed to disk (without log's lock, under
          intern_lock, like in maria_extra()), so our copy may be incorrect
          and we should not flush it.
          It may also be a share which got last_version==0 since we checked
          last_version; in this case, it flushed its state and the LSN test
          above will catch it.
        */
      }
      else
      {
        /*
          We could do the state flush only if share->changed, but it's
          tricky.
          Consider a maria_write() which has written REDO,UNDO, and before it
          calls _ma_writeinfo() (setting share->changed=1), checkpoint
          happens and sees share->changed=0, does not flush state. It is
          possible that Recovery does not start from before the REDO and thus
          the state is not recovered. A solution may be to set
          share->changed=1 under log mutex when writing log records.

          The current solution is to keep a copy of the last saved state and
          not write the state if it was same as last time. It's ok if
          is_of_horizon would be different on disk if all other data is
          the same.
        */
        DBUG_ASSERT(share->last_version != 0);
        state_copy->state.is_of_horizon= share->state.is_of_horizon=
          share->checkpoint_state.is_of_horizon= state_copies_horizon;
        if (kfile.file >= 0 && memcmp(&share->checkpoint_state,
                                      &state_copy->state,
                                      sizeof(state_copy->state)))
        {
          sync_error|=
            _ma_state_info_write_sub(kfile.file, &state_copy->state,
                                     MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET);
          memcpy(&share->checkpoint_state,
                 &state_copy->state, sizeof(state_copy->state));
        }
        /*
          We don't set share->changed=0 because it may interfere with a
          concurrent _ma_writeinfo() doing share->changed=1 (cancel its
          effect). The sad consequence is that we will flush the same state at
          each checkpoint if the table was once written and then not anymore.
        */
      }
    }
    /*
      NOTE(review): the directive opening this conditional block was not
      readable in the reviewed text; EXTRA_DEBUG_BITMAP is assumed - confirm
      against the tree.
    */
#ifdef EXTRA_DEBUG_BITMAP
    else
    {
      DBUG_ASSERT(share->bitmap.changed == 0 &&
                  share->bitmap.changed_not_flushed == 0);
    }
#endif

    /*
      _ma_bitmap_flush_all() may wait, so don't keep intern_lock as
      otherwise this would deadlock with allocate_and_write_block_record()
      calling _ma_set_share_data_file_length()
    */
    mysql_mutex_unlock(&share->intern_lock);

    if (!ignore_share)
    {
      /*
        share->bitmap is valid because it's destroyed under close_lock which
        we hold.
      */
      if (_ma_bitmap_flush_all(share))
      {
        sync_error= 1;
        /** @todo all write failures should mark table corrupted */
        ma_message_no_user(0, "checkpoint bitmap page flush failed");
      }
      DBUG_ASSERT(share->pagecache == maria_pagecache);
    }
    /*
      Clean up any unused states.
      TODO: Only do this call if there has been # (10?) ended transactions
      since last call.
      We had to release intern_lock to respect lock order with LOCK_trn_list.
    */
    _ma_remove_not_visible_states_with_lock(share, FALSE);

    if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME)
    {
      /*
        maria_close() left us to free the share. When it ran it set share->id
        to 0. As it ran before we locked close_lock, we should have seen this
        and so this assertion should be true:
      */
      DBUG_ASSERT(ignore_share);
      mysql_mutex_destroy(&share->intern_lock);
      mysql_mutex_unlock(&share->close_lock);
      mysql_mutex_destroy(&share->close_lock);
      ma_crypt_free(share);
      my_free(share);
    }
    else
    {
      /* share goes back to normal state */
      share->in_checkpoint= 0;
      mysql_mutex_unlock(&share->close_lock);
    }
    /*
      From now on the share is either freed or no longer pinned (so
      maria_close() may free it at any time): forget the pointer, so that
      the cleanup at 'err' never touches it again. Only the local copies
      'dfile' and 'kfile' are used below.
    */
    distinct_shares[i]= NULL;

    /*
      We do the big disk writes out of intern_lock to not block other
      users of this table (intern_lock is taken at the start and end of
      every statement). This means that file descriptors may be invalid
      (files may have been closed for example by HA_EXTRA_PREPARE_FOR_*
      under Windows, or REPAIR). This should not be a problem as we use
      MY_IGNORE_BADFD. Descriptors may even point to other files but then
      the old blocks (of before the close) must have been flushed for sure,
      so our flush will flush new blocks (of after the latest open) and that
      should do no harm.
    */
    /*
      If CHECKPOINT_MEDIUM, this big flush below may result in a
      serious write burst. Realize that all pages dirtied between the
      last checkpoint and the one we are doing now, will be flushed at
      next checkpoint, except those evicted by LRU eviction (depending on
      the size of the page cache compared to the size of the working data
      set, eviction may be rare or frequent).
      We avoid that burst by anticipating: those pages are flushed
      in bunches spanned regularly over the time interval between now and
      the next checkpoint, by a background thread. Thus the next checkpoint
      will have only little flushing to do (CHECKPOINT_MEDIUM should thus be
      only a little slower than CHECKPOINT_INDIRECT).
    */

    /*
      PageCacheFlushConcurrencyBugs
      Inside the page cache, calls to flush_pagecache_blocks_int() on the same
      file are serialized. Examples of concurrency bugs which happened when we
      didn't have this serialization:
      - maria_chk_size() (via CHECK TABLE) happens concurrently with
      Checkpoint: Checkpoint is flushing a page: it pins the page and is
      pre-empted, maria_chk_size() wants to flush this page too so gets an
      error because Checkpoint pinned this page. Such error makes
      maria_chk_size() mark the table as corrupted.
      - maria_close() happens concurrently with Checkpoint:
      Checkpoint is flushing a page: it registers a request on the page, is
      pre-empted ; maria_close() flushes this page too with FLUSH_RELEASE:
      FLUSH_RELEASE will cause a free_block() which assumes the page is in the
      LRU, but it is not (as Checkpoint registered a request). Crash.
      - one thread is evicting a page of the file out of the LRU: it marks it
      iPC_BLOCK_IN_SWITCH and is pre-empted. Then two other threads do flushes
      of the same file concurrently (like above). Then one flusher sees the
      page is in switch, removes it from changed_blocks[] and puts it in its
      first_in_switch, so the other flusher will not see the page at all and
      return too early. If it's maria_close() which returns too early, then
      maria_close() may close the file descriptor, and the other flusher, and
      the evicter will fail to write their page: corruption.
    */

    if (!ignore_share)
    {
      if (filter != NULL)
      {
        if ((flush_pagecache_blocks_with_filter(maria_pagecache,
                                                &dfile, FLUSH_KEEP_LAZY,
                                                filter, &filter_param) &
             PCFLUSH_ERROR))
          ma_message_no_user(0, "checkpoint data page flush failed");
        if ((flush_pagecache_blocks_with_filter(maria_pagecache,
                                                &kfile, FLUSH_KEEP_LAZY,
                                                filter, &filter_param) &
             PCFLUSH_ERROR))
          ma_message_no_user(0, "checkpoint index page flush failed");
      }
      /*
        fsyncs the fd, that's the loooong operation (e.g. max 150 fsync
        per second, so if you have touched 1000 files it's 7 seconds).
      */
      sync_error|=
        mysql_file_sync(dfile.file, MYF(MY_WME | MY_IGNORE_BADFD)) |
        mysql_file_sync(kfile.file, MYF(MY_WME | MY_IGNORE_BADFD));
      /*
        in case of error, we continue because writing other tables to disk is
        still useful.
      */
    }
  }

  if (sync_error)
    goto err;
  /* We maybe over-estimated (due to share->id==0 or last_version==0) */
  DBUG_ASSERT(str->length >= (uint)(ptr - str->str));
  str->length= (uint)(ptr - str->str);
  /*
    As we support max 65k tables open at a time (2-byte short id), we
    assume uint is enough for the cumulated length of table names; and
    LEX_STRING::length is uint.
  */
  int4store(str->str, nb_stored);
  error= unmark_tables= 0;

err:
  if (unlikely(unmark_tables))
  {
    /*
      Un-pin the shares which the main loop has not released yet.
      maria_close() uses THR_LOCK_maria from start to end
    */
    mysql_mutex_lock(&THR_LOCK_maria);
    for (i= 0; i < nb; i++)
    {
      MARIA_SHARE *share= distinct_shares[i];
      if (share == NULL)
        continue;               /* already released (maybe freed) above */
      if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME)
      {
        /*
          maria_close() left us to free the share; destroy both mutexes, as
          is done when a share is freed in the main loop.
        */
        mysql_mutex_destroy(&share->intern_lock);
        mysql_mutex_destroy(&share->close_lock);
        ma_crypt_free(share);
        my_free(share);
      }
      else
      {
        /* share goes back to normal state */
        share->in_checkpoint= 0;
      }
    }
    mysql_mutex_unlock(&THR_LOCK_maria);
  }
  /* both pointers may be NULL here */
  my_free(distinct_shares);
  my_free(state_copies);
  DBUG_RETURN(error);
}