1 /*
2  * linux/fs/jbd/recovery.c
3  *
4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
5  *
6  * Copyright 1999-2000 Red Hat Software --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Journal recovery routines for the generic filesystem journaling code;
13  * part of the ext2fs journaling system.
14  */
15 
16 #ifndef __KERNEL__
17 #include "jfs_user.h"
18 #else
19 #include <linux/module.h>
20 #include <linux/time.h>
21 #include <linux/fs.h>
22 #include <linux/jbd.h>
23 #include <linux/errno.h>
24 #include <linux/slab.h>
25 #endif
26 
27 /*
28  * Maintain information about the progress of the recovery job, so that
29  * the different passes can carry information between them.
30  */
31 struct recovery_info
32 {
33     tid_t		start_transaction;
34     tid_t		end_transaction;
35 
36     int		nr_replays;
37     int		nr_revokes;
38     int		nr_revoke_hits;
39 };
40 
/* The recovery passes, listed in the order journal_recover() runs them. */
enum passtype {
    PASS_SCAN,      /* locate the end of the log */
    PASS_REVOKE,    /* collect the revoke records */
    PASS_REPLAY     /* write the un-revoked blocks back */
};
42 static int do_one_pass(journal_t *journal,
43                        struct recovery_info *info, enum passtype pass);
44 static int scan_revoke_records(journal_t *, struct buffer_head *,
45                                tid_t, struct recovery_info *);
46 
47 #ifdef __KERNEL__
48 
/*
 * Release (brelse) the first @n buffer heads stored in @b once readahead
 * has been submitted for them.  Entries are released from the last one
 * down to the first.
 */
static void journal_brelse_array(struct buffer_head *b[], int n)
{
    int i;

    for (i = n - 1; i >= 0; i--)
        brelse(b[i]);
}
55 
56 
57 /*
58  * When reading from the journal, we are going through the block device
59  * layer directly and so there is no readahead being done for us.  We
60  * need to implement any readahead ourselves if we want it to happen at
61  * all.  Recovery is basically one long sequential read, so make sure we
62  * do the IO in reasonably large chunks.
63  *
64  * This is not so critical that we need to be enormously clever about
65  * the readahead size, though.  128K is a purely arbitrary, good-enough
66  * fixed value.
67  */
68 
69 #define MAXBUF 8
do_readahead(journal_t * journal,unsigned int start)70 static int do_readahead(journal_t *journal, unsigned int start)
71 {
72     int err;
73     unsigned int max, nbufs, next;
74     unsigned long blocknr;
75     struct buffer_head *bh;
76 
77     struct buffer_head * bufs[MAXBUF];
78 
79     /* Do up to 128K of readahead */
80     max = start + (128 * 1024 / journal->j_blocksize);
81     if (max > journal->j_maxlen)
82         max = journal->j_maxlen;
83 
84     /* Do the readahead itself.  We'll submit MAXBUF buffer_heads at
85      * a time to the block device IO layer. */
86 
87     nbufs = 0;
88 
89     for (next = start; next < max; next++) {
90         err = journal_bmap(journal, next, &blocknr);
91 
92         if (err) {
93             printk (KERN_ERR "JBD: bad block at offset %u\n",
94                     next);
95             goto failed;
96         }
97 
98         bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
99         if (!bh) {
100             err = -ENOMEM;
101             goto failed;
102         }
103 
104         if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
105             bufs[nbufs++] = bh;
106             if (nbufs == MAXBUF) {
107                 ll_rw_block(READ, nbufs, bufs);
108                 journal_brelse_array(bufs, nbufs);
109                 nbufs = 0;
110             }
111         } else
112             brelse(bh);
113     }
114 
115     if (nbufs)
116         ll_rw_block(READ, nbufs, bufs);
117     err = 0;
118 
119 failed:
120     if (nbufs)
121         journal_brelse_array(bufs, nbufs);
122     return err;
123 }
124 
125 #endif /* __KERNEL__ */
126 
127 
128 /*
129  * Read a block from the journal
130  */
131 
jread(struct buffer_head ** bhp,journal_t * journal,unsigned int offset)132 static int jread(struct buffer_head **bhp, journal_t *journal,
133                  unsigned int offset)
134 {
135     int err;
136     unsigned long blocknr;
137     struct buffer_head *bh;
138 
139     *bhp = NULL;
140 
141     if (offset >= journal->j_maxlen) {
142         printk(KERN_ERR "JBD: corrupted journal superblock\n");
143         return -EIO;
144     }
145 
146     err = journal_bmap(journal, offset, &blocknr);
147 
148     if (err) {
149         printk (KERN_ERR "JBD: bad block at offset %u\n",
150                 offset);
151         return err;
152     }
153 
154     bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
155     if (!bh)
156         return -ENOMEM;
157 
158     if (!buffer_uptodate(bh)) {
159         /* If this is a brand new buffer, start readahead.
160                    Otherwise, we assume we are already reading it.  */
161         if (!buffer_req(bh))
162             do_readahead(journal, offset);
163         wait_on_buffer(bh);
164     }
165 
166     if (!buffer_uptodate(bh)) {
167         printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
168                 offset);
169         brelse(bh);
170         return -EIO;
171     }
172 
173     *bhp = bh;
174     return 0;
175 }
176 
177 
178 /*
179  * Count the number of in-use tags in a journal descriptor block.
180  */
181 
count_tags(struct buffer_head * bh,int size)182 static int count_tags(struct buffer_head *bh, int size)
183 {
184     char *			tagp;
185     journal_block_tag_t *	tag;
186     int			nr = 0;
187 
188     tagp = &bh->b_data[sizeof(journal_header_t)];
189 
190     while (((int)(tagp - bh->b_data) + (int)sizeof(journal_block_tag_t)) <= size) {
191         tag = (journal_block_tag_t *) tagp;
192 
193         nr++;
194         tagp += sizeof(journal_block_tag_t);
195         if (!(tag->t_flags & cpu_to_be32(JFS_FLAG_SAME_UUID)))
196             tagp += 16;
197 
198         if (tag->t_flags & cpu_to_be32(JFS_FLAG_LAST_TAG))
199             break;
200     }
201 
202     return nr;
203 }
204 
205 
/*
 * Keep a log block number inside the circular log: once it has reached
 * j_last, pull it back by the length of the log (j_last - j_first).
 * The second argument must be an lvalue; it is updated in place.
 */
#define wrap(jnl, blk)							\
do {									\
	if ((blk) >= (jnl)->j_last)					\
		(blk) -= (jnl)->j_last - (jnl)->j_first;		\
} while (0)
212 
213 /**
214  * journal_recover - recovers a on-disk journal
215  * @journal: the journal to recover
216  *
217  * The primary function for recovering the log contents when mounting a
218  * journaled device.
219  *
220  * Recovery is done in three passes.  In the first pass, we look for the
221  * end of the log.  In the second, we assemble the list of revoke
222  * blocks.  In the third and final pass, we replay any un-revoked blocks
223  * in the log.
224  */
journal_recover(journal_t * journal)225 int journal_recover(journal_t *journal)
226 {
227     int			err;
228     journal_superblock_t *	sb;
229 
230     struct recovery_info	info;
231 
232     memset(&info, 0, sizeof(info));
233     sb = journal->j_superblock;
234 
235     /*
236      * The journal superblock's s_start field (the current log head)
237      * is always zero if, and only if, the journal was cleanly
238      * unmounted.
239      */
240 
241     if (!sb->s_start) {
242         jbd_debug(1, "No recovery required, last transaction %d\n",
243                   be32_to_cpu(sb->s_sequence));
244         journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
245         return 0;
246     }
247 
248     err = do_one_pass(journal, &info, PASS_SCAN);
249     if (!err)
250         err = do_one_pass(journal, &info, PASS_REVOKE);
251     if (!err)
252         err = do_one_pass(journal, &info, PASS_REPLAY);
253 
254     jbd_debug(1, "JBD: recovery, exit status %d, "
255               "recovered transactions %u to %u\n",
256               err, info.start_transaction, info.end_transaction);
257     jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n",
258               info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
259 
260     /* Restart the log at the next transaction ID, thus invalidating
261      * any existing commit records in the log. */
262     journal->j_transaction_sequence = ++info.end_transaction;
263 
264     journal_clear_revoke(journal);
265     sync_blockdev(journal->j_fs_dev);
266     return err;
267 }
268 
269 /**
270  * journal_skip_recovery - Start journal and wipe exiting records
271  * @journal: journal to startup
272  *
273  * Locate any valid recovery information from the journal and set up the
274  * journal structures in memory to ignore it (presumably because the
275  * caller has evidence that it is out of date).
276  * This function does'nt appear to be exorted..
277  *
278  * We perform one pass over the journal to allow us to tell the user how
279  * much recovery information is being erased, and to let us initialise
280  * the journal transaction sequence numbers to the next unused ID.
281  */
journal_skip_recovery(journal_t * journal)282 int journal_skip_recovery(journal_t *journal)
283 {
284     int			err;
285     journal_superblock_t *	sb;
286 
287     struct recovery_info	info;
288 
289     memset (&info, 0, sizeof(info));
290     sb = journal->j_superblock;
291 
292     err = do_one_pass(journal, &info, PASS_SCAN);
293 
294     if (err) {
295         printk(KERN_ERR "JBD: error %d scanning journal\n", err);
296         ++journal->j_transaction_sequence;
297     } else {
298 #ifdef CONFIG_JBD_DEBUG
299         int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence);
300         jbd_debug(1,
301                   "JBD: ignoring %d transaction%s from the journal.\n",
302                   dropped, (dropped == 1) ? "" : "s");
303 #endif
304         journal->j_transaction_sequence = ++info.end_transaction;
305     }
306 
307     journal->j_tail = 0;
308     return err;
309 }
310 
/*
 * do_one_pass - walk the log once on behalf of a single recovery pass
 * @journal: journal being recovered
 * @info:    state shared between passes; PASS_SCAN fills in the
 *           transaction range, the other passes stop at end_transaction
 *           and update the statistics counters
 * @pass:    which pass to perform
 *
 * The walk starts at the sequence number and log block recorded in the
 * journal superblock and follows descriptor, commit and revoke blocks
 * until a block fails the magic, sequence or type checks (or, for the
 * non-scan passes, until end_transaction is reached).
 *
 *   PASS_SCAN   - only finds the end of the log.
 *   PASS_REVOKE - feeds every revoke block to scan_revoke_records().
 *   PASS_REPLAY - copies each logged, un-revoked block to its home
 *                 location on the filesystem device and marks it dirty.
 *
 * Returns 0 on success.  Failing to read a journal control block, a
 * scan_revoke_records() error, or running out of memory aborts the walk
 * and returns that error.  A failure to read a logged data block during
 * replay does not abort the walk; the most recent such error is returned
 * at the end.  -EIO is returned if a non-scan pass ends at a different
 * transaction than the scan pass did.
 */
static int do_one_pass(journal_t *journal,
                       struct recovery_info *info, enum passtype pass)
{
    unsigned int            first_commit_ID, next_commit_ID;
    unsigned long           next_log_block;
    int                     err, success = 0;
    journal_superblock_t   *sb;
    journal_header_t       *tmp;
    struct buffer_head     *bh;
    unsigned int            sequence;
    int                     blocktype;

    /*
     * First thing is to establish what we expect to find in the log
     * (in terms of transaction IDs), and where (in terms of log
     * block offsets): query the superblock.
     */

    sb = journal->j_superblock;
    next_commit_ID = be32_to_cpu(sb->s_sequence);
    next_log_block = be32_to_cpu(sb->s_start);

    first_commit_ID = next_commit_ID;
    if (pass == PASS_SCAN)
        info->start_transaction = first_commit_ID;

    jbd_debug(1, "Starting recovery pass %d\n", pass);

    /*
     * Now we walk through the log, transaction by transaction,
     * making sure that each transaction has a commit block in the
     * expected place.  Each complete transaction gets replayed back
     * into the main filesystem.
     */

    while (1) {
        int                     flags;
        char                   *tagp;
        journal_block_tag_t    *tag;
        struct buffer_head     *obh;
        struct buffer_head     *nbh;

        cond_resched();

        /* If we already know where to stop the log traversal,
         * check right now that we haven't gone past the end of
         * the log. */

        if (pass != PASS_SCAN)
            if (tid_geq(next_commit_ID, info->end_transaction))
                break;

        jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
                  next_commit_ID, next_log_block, journal->j_last);

        /* Skip over each chunk of the transaction looking for
         * either the next descriptor block or the final commit
         * record. */

        jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
        err = jread(&bh, journal, next_log_block);
        if (err)
            goto failed;

        next_log_block++;
        wrap(journal, next_log_block);

        /* What kind of buffer is it?
         *
         * If it is a descriptor block, check that it has the
         * expected sequence number.  Otherwise, we're all done
         * here. */

        tmp = (journal_header_t *)bh->b_data;

        if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) {
            brelse(bh);
            break;
        }

        blocktype = be32_to_cpu(tmp->h_blocktype);
        sequence = be32_to_cpu(tmp->h_sequence);
        jbd_debug(3, "Found magic %d, sequence %d\n",
                  blocktype, sequence);

        if (sequence != next_commit_ID) {
            brelse(bh);
            break;
        }

        /* OK, we have a valid descriptor block which matches
         * all of the sequence number checks.  What are we going
         * to do with it?  That depends on the pass... */

        switch (blocktype) {
        case JFS_DESCRIPTOR_BLOCK:
            /* If it is a valid descriptor block, replay it
             * in pass REPLAY; otherwise, just skip over the
             * blocks it describes. */
            if (pass != PASS_REPLAY) {
                next_log_block +=
                    count_tags(bh, journal->j_blocksize);
                wrap(journal, next_log_block);
                brelse(bh);
                continue;
            }

            /* A descriptor block: we can now write all of
             * the data blocks.  Yay, useful work is finally
             * getting done here! */

            tagp = &bh->b_data[sizeof(journal_header_t)];
            while (((int)(tagp - bh->b_data) + (int)sizeof(journal_block_tag_t))
                    <= journal->j_blocksize) {
                unsigned long io_block;

                tag = (journal_block_tag_t *) tagp;
                flags = be32_to_cpu(tag->t_flags);

                /* Each tag describes the next block in the log. */
                io_block = next_log_block++;
                wrap(journal, next_log_block);
                err = jread(&obh, journal, io_block);
                if (err) {
                    /* Recover what we can, but
                     * report failure at the end. */
                    success = err;
                    printk (KERN_ERR
                            "JBD: IO error %d recovering "
                            "block %ld in log\n",
                            err, io_block);
                } else {
                    unsigned long blocknr;

                    J_ASSERT(obh != NULL);
                    blocknr = be32_to_cpu(tag->t_blocknr);

                    /* If the block has been
                     * revoked, then we're all done
                     * here. */
                    if (journal_test_revoke
                            (journal, blocknr,
                             next_commit_ID)) {
                        brelse(obh);
                        ++info->nr_revoke_hits;
                        goto skip_write;
                    }

                    /* Find a buffer for the new
                     * data being restored */
                    nbh = __getblk(journal->j_fs_dev,
                                   blocknr,
                                   journal->j_blocksize);
                    if (nbh == NULL) {
                        printk(KERN_ERR
                               "JBD: Out of memory "
                               "during recovery.\n");
                        err = -ENOMEM;
                        brelse(bh);
                        brelse(obh);
                        goto failed;
                    }

                    lock_buffer(nbh);
                    memcpy(nbh->b_data, obh->b_data,
                           journal->j_blocksize);
                    if (flags & JFS_FLAG_ESCAPE) {
                        /* An escaped block must get the
                         * journal magic number put back
                         * as its first word.  That has
                         * to happen in the copy headed
                         * for the filesystem (nbh); the
                         * descriptor block (bh) must be
                         * left alone. */
                        *((__be32 *)nbh->b_data) =
                            cpu_to_be32(JFS_MAGIC_NUMBER);
                    }

                    BUFFER_TRACE(nbh, "marking dirty");
                    set_buffer_uptodate(nbh);
                    mark_buffer_dirty(nbh);
                    BUFFER_TRACE(nbh, "marking uptodate");
                    ++info->nr_replays;
                    /* ll_rw_block(WRITE, 1, &nbh); */
                    unlock_buffer(nbh);
                    brelse(obh);
                    brelse(nbh);
                }

skip_write:
                /* Advance to the next tag; without SAME_UUID
                 * the tag is followed by 16 extra bytes. */
                tagp += sizeof(journal_block_tag_t);
                if (!(flags & JFS_FLAG_SAME_UUID))
                    tagp += 16;

                if (flags & JFS_FLAG_LAST_TAG)
                    break;
            }

            brelse(bh);
            continue;

        case JFS_COMMIT_BLOCK:
            /* Found an expected commit block: not much to
             * do other than move on to the next sequence
             * number. */
            brelse(bh);
            next_commit_ID++;
            continue;

        case JFS_REVOKE_BLOCK:
            /* If we aren't in the REVOKE pass, then we can
             * just skip over this block. */
            if (pass != PASS_REVOKE) {
                brelse(bh);
                continue;
            }

            err = scan_revoke_records(journal, bh,
                                      next_commit_ID, info);
            brelse(bh);
            if (err)
                goto failed;
            continue;

        default:
            jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
                      blocktype);
            brelse(bh);
            goto done;
        }
    }

done:
    /*
     * We broke out of the log scan loop: either we came to the
     * known end of the log or we found an unexpected block in the
     * log.  If the latter happened, then we know that the "current"
     * transaction marks the end of the valid log.
     */

    if (pass == PASS_SCAN)
        info->end_transaction = next_commit_ID;
    else {
        /* It's really bad news if different passes end up at
         * different places (but possible due to IO errors). */
        if (info->end_transaction != next_commit_ID) {
            printk (KERN_ERR "JBD: recovery pass %d ended at "
                    "transaction %u, expected %u\n",
                    pass, next_commit_ID, info->end_transaction);
            if (!success)
                success = -EIO;
        }
    }

    return success;

failed:
    return err;
}
567 
568 
569 /* Scan a revoke record, marking all blocks mentioned as revoked. */
570 
scan_revoke_records(journal_t * journal,struct buffer_head * bh,tid_t sequence,struct recovery_info * info)571 static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
572                                tid_t sequence, struct recovery_info *info)
573 {
574     journal_revoke_header_t *header;
575     int offset, max;
576 
577     header = (journal_revoke_header_t *) bh->b_data;
578     offset = sizeof(journal_revoke_header_t);
579     max = be32_to_cpu(header->r_count);
580 
581     while (offset < max) {
582         unsigned long blocknr;
583         int err;
584 
585         blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
586         offset += 4;
587         err = journal_set_revoke(journal, blocknr, sequence);
588         if (err)
589             return err;
590         ++info->nr_revokes;
591     }
592     return 0;
593 }
594