xref: /dragonfly/sys/vfs/hammer/hammer_recover.c (revision 279dd846)
1 /*
2  * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 /*
36  * UNDO ALGORITHM:
37  *
38  *	The UNDO algorithm is trivial.  The nominal UNDO range in the
39  *	FIFO is determined by taking the first/next offset stored in
40  *	the volume header.  The next offset may not be correct since
41  *	UNDO flushes are not required to flush the volume header, so
42  *	the code also scans forward until it finds a discontinuous
43  *	sequence number.
44  *
45  *	The UNDOs are then scanned and executed in reverse order.  These
46  *	UNDOs are effectively just data restorations based on HAMMER offsets.
47  *
48  * REDO ALGORITHM:
49  *
50  *	REDO records are laid down in the UNDO/REDO FIFO for nominal
51  *	writes, truncations, and file extension ops.  On a per-inode
52  *	basis two types of REDO records are generated, REDO_WRITE
53  *	and REDO_TRUNC.
54  *
55  *	Essentially the recovery block will contain UNDO records backing
56  *	out partial operations and REDO records to regenerate those partial
57  *	operations guaranteed by the filesystem during recovery.
58  *
59  *	REDO generation is optional, and can also be started and then
60  *	later stopped due to excessive write()s in between fsyncs, or not
61  *	started at all.  Because of this the recovery code must determine
62  *	when REDOs are valid and when they are not.  Additional records are
63  *	generated to help figure it out.
64  *
65  *	The REDO_TERM_WRITE and REDO_TERM_TRUNC records are generated
66  *	during a flush cycle indicating which records the flush cycle
67  *	has synched meta-data for, and HAMMER_REDO_SYNC is generated in
68  *	each flush cycle to indicate how far back in the UNDO/REDO FIFO
69  *	the recovery code must go to find the earliest applicable REDO
70  *	record.  Applicable REDO records can be far outside the nominal
71  *	UNDO recovery range, for example if a write() lays down a REDO but
72  *	the related file is not flushed for several cycles.
73  *
74  *	The SYNC reference is to a point prior to the nominal UNDO FIFO
75  *	range, creating an extended REDO range which must be scanned.
76  *
77  *	Any REDO_WRITE/REDO_TRUNC encountered within the extended range
78  *	which have no matching REDO_TERM_WRITE/REDO_TERM_TRUNC records
79  *	prior to the start of the nominal UNDO range are applicable.
80  *	That is, any REDO_TERM_* records in the extended range but not in
81  *	the nominal undo range will mask any redo operations for prior REDO
82  *	records.  This is necessary because once the TERM is laid down
83  *	followup operations may make additional changes to the related
84  *	records but not necessarily record them as REDOs (because REDOs are
85  *	optional).
86  *
87  *	REDO_TERM_WRITE/REDO_TERM_TRUNC records in the nominal UNDO range
88  *	must be ignored since they represent meta-data flushes which are
89  *	undone by the UNDOs in that nominal UNDO range by the recovery
90  *	code.  Only REDO_TERM_* records in the extended range but not
91  *	in the nominal undo range are applicable.
92  *
93  *	The REDO_SYNC record itself always exists in the nominal UNDO range
94  *	(this is how the extended range is determined).  For recovery
95  *	purposes the most recent REDO_SYNC record is always used if several
96  *	are found.
97  *
98  * CRASHES DURING UNDO/REDO
99  *
100  *	A crash during the UNDO phase requires no additional effort.  The
101  *	UNDOs will simply be re-run again.  The state of the UNDO/REDO fifo
102  *	remains unchanged and has no re-crash issues.
103  *
104  *	A crash during the REDO phase is more complex because the REDOs
105  *	run normal filesystem ops and generate additional UNDO/REDO records.
106  *	REDO is disabled during REDO recovery and any SYNC records generated
107  *	by flushes during REDO recovery must continue to reference the
108  *	original extended range.
109  *
110  *	If multiple crashes occur and the UNDO/REDO FIFO wraps, REDO recovery
111  *	may become impossible.  This is detected when the start of the
112  *	extended range fails to have monotonically increasing sequence
113  *	numbers leading into the nominal undo range.
114  */
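/*
 * Call-order sketch (illustrative only; the real call sites live in the
 * HAMMER mount and RO->RW upgrade paths, not in this file).  Stage1 runs
 * the UNDOs during early mount, stage2 replays applicable REDOs once the
 * filesystem is able to go read-write:
 */
#if 0
	error = hammer_recover_stage1(hmp, root_volume);
	if (error == 0 && hmp->ronly == 0)
		error = hammer_recover_stage2(hmp, root_volume);
#endif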
115 
116 #include "hammer.h"
117 
118 /*
119  * Specify the way we want to handle stage2 errors.
120  *
121  * The following values are accepted:
122  *
123  * 0 - Run redo recovery normally and fail to mount if
124  *     the operation fails (default).
125  * 1 - Run redo recovery, but don't fail to mount if the
126  *     operation fails.
127  * 2 - Completely skip redo recovery (only for severe error
128  *     conditions and/or debugging).
129  */
130 static int hammer_skip_redo = 0;
131 TUNABLE_INT("vfs.hammer.skip_redo", &hammer_skip_redo);
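/*
 * The tunable is taken from the kernel environment, so it can be set
 * before the filesystem is mounted, e.g. via /boot/loader.conf:
 *
 *	vfs.hammer.skip_redo="2"	# skip redo recovery entirely
 */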
132 
133 /*
134  * Each rterm entry has a list of fifo offsets indicating termination
135  * points.  These are stripped as the scan progresses.
136  */
137 typedef struct hammer_rterm_entry {
138 	struct hammer_rterm_entry *next;
139 	hammer_off_t		fifo_offset;
140 } *hammer_rterm_entry_t;
141 
142 /*
143  * rterm entries sorted in RB tree are indexed by objid, flags, and offset.
144  * TRUNC entries ignore the offset.
145  */
146 typedef struct hammer_rterm {
147 	RB_ENTRY(hammer_rterm)	rb_node;
148 	int64_t			redo_objid;
149 	u_int32_t		redo_localization;
150 	u_int32_t		redo_flags;
151 	hammer_off_t		redo_offset;
152 	hammer_rterm_entry_t	term_list;
153 } *hammer_rterm_t;
154 
155 static int hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2);
156 struct hammer_rterm_rb_tree;
157 RB_HEAD(hammer_rterm_rb_tree, hammer_rterm);
158 RB_PROTOTYPE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp);
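/*
 * Lookup sketch (illustrative; it mirrors the real usage in
 * hammer_recover_redo_run() below): a REDO_WRITE record probes the tree
 * for its TERM flavor by building a key on the stack.
 */
#if 0
	struct hammer_rterm key;
	hammer_rterm_t rterm;

	bzero(&key, sizeof(key));
	key.redo_objid = redo->redo_objid;
	key.redo_localization = redo->redo_localization;
	key.redo_flags = HAMMER_REDO_TERM_WRITE;
	key.redo_offset = redo->redo_offset;
	rterm = RB_FIND(hammer_rterm_rb_tree, &rterm_root, &key);
	/* rterm == NULL means no TERM was recorded, the REDO applies */
#endif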
159 
160 static int hammer_check_tail_signature(hammer_fifo_tail_t tail,
161 			hammer_off_t end_off);
162 static int hammer_check_head_signature(hammer_fifo_head_t head,
163 			hammer_off_t beg_off);
164 static void hammer_recover_copy_undo(hammer_off_t undo_offset,
165 			char *src, char *dst, int bytes);
166 static hammer_fifo_any_t hammer_recover_scan_fwd(hammer_mount_t hmp,
167 			hammer_volume_t root_volume,
168 			hammer_off_t *scan_offsetp,
169 			int *errorp, struct hammer_buffer **bufferp);
170 static hammer_fifo_any_t hammer_recover_scan_rev(hammer_mount_t hmp,
171 			hammer_volume_t root_volume,
172 			hammer_off_t *scan_offsetp,
173 			int *errorp, struct hammer_buffer **bufferp);
174 #if 0
175 static void hammer_recover_debug_dump(int w, char *buf, int bytes);
176 #endif
177 static int hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
178 			hammer_fifo_undo_t undo);
179 static int hammer_recover_redo_rec(hammer_mount_t hmp,
180 			struct hammer_rterm_rb_tree *root,
181 			hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo);
182 static int hammer_recover_redo_run(hammer_mount_t hmp,
183 			struct hammer_rterm_rb_tree *root,
184 			hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo);
185 static void hammer_recover_redo_exec(hammer_mount_t hmp,
186 			hammer_fifo_redo_t redo);
187 
188 RB_GENERATE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp);
189 
190 /*
191  * Recover filesystem meta-data on mount.  This procedure figures out the
192  * UNDO FIFO range and runs the UNDOs backwards.  The FIFO pointers are not
193  * resynchronized by this procedure.
194  *
195  * This procedure is run near the beginning of the mount sequence, before
196  * any B-Tree or high-level accesses are enabled, and is responsible for
197  * restoring the meta-data to a consistent state.  High level HAMMER data
198  * structures (such as the B-Tree) cannot be accessed here.
199  *
200  * NOTE: No information from the root volume has been cached in the
201  *	 hammer_mount structure yet, so we need to access the root volume's
202  *	 buffer directly.
205  */
206 int
207 hammer_recover_stage1(hammer_mount_t hmp, hammer_volume_t root_volume)
208 {
209 	hammer_blockmap_t rootmap;
210 	hammer_buffer_t buffer;
211 	hammer_off_t scan_offset;
212 	hammer_off_t scan_offset_save;
213 	hammer_off_t bytes;
214 	hammer_fifo_any_t head;
215 	hammer_off_t first_offset;
216 	hammer_off_t last_offset;
217 	u_int32_t seqno;
218 	int error;
219 	int degenerate_case = 0;
220 
221 	/*
222 	 * Examine the UNDO FIFO indices in the volume header.
223 	 */
224 	rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
225 	first_offset = rootmap->first_offset;
226 	last_offset  = rootmap->next_offset;
227 	buffer = NULL;
228 	error = 0;
229 
230 	hmp->recover_stage2_offset = 0;
231 
232 	if (first_offset > rootmap->alloc_offset ||
233 	    last_offset > rootmap->alloc_offset) {
234 		kprintf("HAMMER(%s) Illegal UNDO FIFO index range "
235 			"%016jx, %016jx limit %016jx\n",
236 			root_volume->ondisk->vol_name,
237 			(intmax_t)first_offset,
238 			(intmax_t)last_offset,
239 			(intmax_t)rootmap->alloc_offset);
240 		error = EIO;
241 		goto done;
242 	}
243 
244 	/*
245 	 * In HAMMER version 4+ filesystems the volume header does NOT
246 	 * contain definitive UNDO FIFO state.  In particular, the
247 	 * rootmap->next_offset may not be indexed completely to the
248 	 * end of the active UNDO FIFO.
249 	 */
250 	if (hmp->version >= HAMMER_VOL_VERSION_FOUR) {
251 		/*
252 		 * To find the definitive range we must first scan backwards
253 		 * from first_offset to locate the first real record and
254 		 * extract the sequence number from it.  This record is not
255 		 * part of the active undo space.
256 		 */
257 		scan_offset = first_offset;
258 		seqno = 0;
259 
260 		for (;;) {
261 			head = hammer_recover_scan_rev(hmp, root_volume,
262 						       &scan_offset,
263 						       &error, &buffer);
264 			if (error)
265 				break;
266 			if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
267 				seqno = head->head.hdr_seq;
268 				break;
269 			}
270 		}
271 		if (error) {
272 			kprintf("HAMMER(%s) recovery failure "
273 				"during seqno backscan\n",
274 				root_volume->ondisk->vol_name);
275 			goto done;
276 		}
277 
278 		/*
279 		 * Scan forwards from first_offset and (seqno+1) looking
280 		 * for a sequence space discontinuity.  This denotes the
281 		 * end of the active FIFO area.
282 		 *
283 		 * NOTE: For the case where the FIFO is empty the very first
284 		 *	 record we find will be discontinuous.
285 		 *
286 		 * NOTE: Do not include trailing PADs in the scan range,
287 		 *	 and remember that the returned scan_offset after a
288 		 *	 fwd iteration points to the end of the returned
289 		 *	 record.
290 		 */
291 		kprintf("HAMMER(%s) recovery check seqno=%08x\n",
292 			root_volume->ondisk->vol_name,
293 			seqno);
294 
295 		scan_offset = first_offset;
296 		scan_offset_save = scan_offset;
297 		++seqno;
298 		hmp->recover_stage2_seqno = seqno;
299 
300 		for (;;) {
301 			head = hammer_recover_scan_fwd(hmp, root_volume,
302 						       &scan_offset,
303 						       &error, &buffer);
304 			if (error)
305 				break;
306 			if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
307 				if (seqno != head->head.hdr_seq) {
308 					scan_offset = scan_offset_save;
309 					break;
310 				}
311 				scan_offset_save = scan_offset;
312 				++seqno;
313 			}
314 
315 #if 0
316 			/*
317 			 * If the forward scan is grossly ahead of last_offset
318 			 * then something is wrong.  last_offset is supposed
319 			 * to be flushed out.
320 			 */
321 			if (last_offset >= scan_offset) {
322 				bytes = last_offset - scan_offset;
323 			} else {
324 				bytes = rootmap->alloc_offset - scan_offset +
325 					(last_offset & HAMMER_OFF_LONG_MASK);
326 			}
327 			if (bytes >
328 			    (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK) *
329 			    4 / 5) {
330 				kprintf("HAMMER(%s) recovery forward scan is "
331 					"grossly beyond the last_offset in "
332 					"the volume header, this can't be "
333 					"right.\n",
334 					root_volume->ondisk->vol_name);
335 				error = EIO;
336 				break;
337 			}
338 #endif
339 		}
340 
341 		/*
342 		 * Store the seqno.  This will be the next seqno we lay down
343 		 * when generating new UNDOs.
344 		 */
345 		hmp->undo_seqno = seqno;
346 		if (error) {
347 			kprintf("HAMMER(%s) recovery failure "
348 				"during seqno fwdscan\n",
349 				root_volume->ondisk->vol_name);
350 			goto done;
351 		}
352 		last_offset = scan_offset;
353 		kprintf("HAMMER(%s) recovery range %016jx-%016jx\n"
354 			"HAMMER(%s) recovery nexto %016jx endseqno=%08x\n",
355 			root_volume->ondisk->vol_name,
356 			(intmax_t)first_offset,
357 			(intmax_t)last_offset,
358 			root_volume->ondisk->vol_name,
359 			(intmax_t)rootmap->next_offset,
360 			seqno);
361 	}
362 
363 	/*
364 	 * Calculate the size of the active portion of the FIFO.  If the
365 	 * FIFO is empty the filesystem is clean and no further action is
366 	 * needed.
367 	 */
368 	if (last_offset >= first_offset) {
369 		bytes = last_offset - first_offset;
370 	} else {
371 		bytes = rootmap->alloc_offset - first_offset +
372 			(last_offset & HAMMER_OFF_LONG_MASK);
373 	}
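	/*
	 * Worked example (illustrative figures, ignoring the zone bits
	 * encoded in the offsets): with a 16MB FIFO (alloc 0x1000000),
	 * first 0xf00000 and last 0x100000 the FIFO has wrapped, giving
	 * bytes = (0x1000000 - 0xf00000) + 0x100000 = 0x200000 (2MB).
	 */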
374 	if (bytes == 0) {
375 		degenerate_case = 1;
376 		error = 0;
377 		goto done;
378 	}
379 
380 	kprintf("HAMMER(%s) recovery undo  %016jx-%016jx (%jd bytes)%s\n",
381 		root_volume->ondisk->vol_name,
382 		(intmax_t)first_offset,
383 		(intmax_t)last_offset,
384 		(intmax_t)bytes,
385 		(hmp->ronly ? " (RO)" : " (RW)"));
386 	if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
387 		kprintf("Undo size is absurd, unable to mount\n");
388 		error = EIO;
389 		goto done;
390 	}
391 
392 	/*
393 	 * Scan the UNDOs backwards.
394 	 */
395 	scan_offset = last_offset;
396 
397 	while ((int64_t)bytes > 0) {
398 		KKASSERT(scan_offset != first_offset);
399 		head = hammer_recover_scan_rev(hmp, root_volume,
400 					       &scan_offset, &error, &buffer);
401 		if (error)
402 			break;
403 
404 		/*
405 		 * Normal UNDO
406 		 */
407 		error = hammer_recover_undo(hmp, root_volume, &head->undo);
408 		if (error) {
409 			kprintf("HAMMER(%s) UNDO record at %016jx failed\n",
410 				root_volume->ondisk->vol_name,
411 				(intmax_t)scan_offset - head->head.hdr_size);
412 			break;
413 		}
414 
415 		/*
416 		 * The first REDO_SYNC record encountered (scanning backwards)
417 		 * enables REDO processing.
418 		 */
419 		if (head->head.hdr_type == HAMMER_HEAD_TYPE_REDO &&
420 		    head->redo.redo_flags == HAMMER_REDO_SYNC) {
421 			if (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) {
422 				kprintf("HAMMER(%s) Ignoring extra REDO_SYNC "
423 					"records in UNDO/REDO FIFO.\n",
424 					root_volume->ondisk->vol_name
425 				);
426 			} else {
427 				hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_REQ;
428 				hmp->recover_stage2_offset =
429 					head->redo.redo_offset;
430 				kprintf("HAMMER(%s) Found REDO_SYNC %016jx\n",
431 					root_volume->ondisk->vol_name,
432 					(intmax_t)head->redo.redo_offset);
433 			}
434 		}
435 
436 		bytes -= head->head.hdr_size;
437 
438 		/*
439 		 * If too many dirty buffers have built up we have to flush'm
440 		 * out.  As long as we do not flush out the volume header
441 		 * a crash here should not cause any problems.
442 		 *
443 		 * buffer must be released so the flush can assert that
444 		 * all buffers are idle.
445 		 */
446 		if (hammer_flusher_meta_limit(hmp)) {
447 			if (buffer) {
448 				hammer_rel_buffer(buffer, 0);
449 				buffer = NULL;
450 			}
451 			if (hmp->ronly == 0) {
452 				hammer_recover_flush_buffers(hmp, root_volume,
453 							     0);
454 				kprintf("HAMMER(%s) Continuing recovery\n",
455 					root_volume->ondisk->vol_name);
456 			} else {
457 				kprintf("HAMMER(%s) Recovery failure: "
458 					"Insufficient buffer cache to hold "
459 					"dirty buffers on read-only mount!\n",
460 					root_volume->ondisk->vol_name);
461 				error = EIO;
462 				break;
463 			}
464 		}
465 	}
466 	KKASSERT(error || bytes == 0);
467 done:
468 	if (buffer) {
469 		hammer_rel_buffer(buffer, 0);
470 		buffer = NULL;
471 	}
472 
473 	/*
474 	 * After completely flushing all the recovered buffers the volume
475 	 * header will also be flushed.
476 	 */
477 	if (root_volume->io.recovered == 0) {
478 		hammer_ref_volume(root_volume);
479 		root_volume->io.recovered = 1;
480 	}
481 
482 	/*
483 	 * Finish up flushing (or discarding) recovered buffers.  FIFO
484 	 * indices in the volume header are updated to the actual undo
485 	 * range but will not be collapsed until stage 2.
486 	 */
487 	if (error == 0) {
488 		hammer_modify_volume_noundo(NULL, root_volume);
489 		rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
490 		rootmap->first_offset = first_offset;
491 		rootmap->next_offset = last_offset;
492 		hammer_modify_volume_done(root_volume);
493 		if (hmp->ronly == 0)
494 			hammer_recover_flush_buffers(hmp, root_volume, 1);
495 	} else {
496 		hammer_recover_flush_buffers(hmp, root_volume, -1);
497 	}
498 	if (degenerate_case == 0) {
499 		kprintf("HAMMER(%s) recovery complete\n",
500 			root_volume->ondisk->vol_name);
501 	} else {
502 		kprintf("HAMMER(%s) mounted clean, no recovery needed\n",
503 			root_volume->ondisk->vol_name);
504 	}
505 	return (error);
506 }
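/*
 * Stage1 hands the following state to stage2: hmp->undo_seqno is the
 * next sequence number to lay down, hmp->recover_stage2_seqno is the
 * sequence number expected at first_offset, and, when the
 * HAMMER_MOUNT_REDO_RECOVERY_REQ flag was set above,
 * hmp->recover_stage2_offset is the REDO_SYNC target that begins the
 * extended range.
 */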
507 
508 /*
509  * Execute redo operations
510  *
511  * This procedure is run at the end of the mount sequence, after the hammer
512  * mount structure has been completely initialized but before the filesystem
513  * goes live.  It can access standard cursors, the B-Tree, flush the
514  * filesystem, and so forth.
515  *
516  * This code may only be called for read-write mounts or when a mount
517  * switches from read-only to read-write.  vnodes may or may not be present.
518  *
519  * The stage1 code will have already calculated the correct FIFO range
520  * for the nominal UNDO FIFO and stored it in the rootmap.  The extended
521  * range for REDO is stored in hmp->recover_stage2_offset.
522  */
523 int
524 hammer_recover_stage2(hammer_mount_t hmp, hammer_volume_t root_volume)
525 {
526 	hammer_blockmap_t rootmap;
527 	hammer_buffer_t buffer;
528 	hammer_off_t scan_offset;
529 	hammer_off_t oscan_offset;
530 	hammer_off_t bytes;
531 	hammer_off_t ext_bytes;
532 	hammer_fifo_any_t head;
533 	hammer_off_t first_offset;
534 	hammer_off_t last_offset;
535 	hammer_off_t ext_offset;
536 	struct hammer_rterm_rb_tree rterm_root;
537 	u_int32_t seqno;
538 	int error;
539 	int verbose = 0;
540 	int dorscan;
541 
542 	/*
543 	 * Stage 2 can only be run on a RW mount, or when the mount is
544 	 * switched from RO to RW.
545 	 */
546 	KKASSERT(hmp->ronly == 0);
547 	RB_INIT(&rterm_root);
548 
549 	if (hammer_skip_redo == 1)
550 		kprintf("HAMMER(%s) recovery redo marked as optional\n",
551 		    root_volume->ondisk->vol_name);
552 
553 	if (hammer_skip_redo == 2) {
554 		kprintf("HAMMER(%s) recovery redo skipped.\n",
555 		    root_volume->ondisk->vol_name);
556 		return (0);
557 	}
558 
559 	/*
560 	 * Examine the UNDO FIFO.  If it is empty the filesystem is clean
561 	 * and no action need be taken.
562 	 */
563 	rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
564 	first_offset = rootmap->first_offset;
565 	last_offset  = rootmap->next_offset;
566 	if (first_offset == last_offset) {
567 		KKASSERT((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0);
568 		return(0);
569 	}
570 
571 	/*
572 	 * Stage2 must only be run once, and will not be run at all
573 	 * if Stage1 did not find a REDO_SYNC record.
574 	 */
575 	error = 0;
576 	buffer = NULL;
577 
578 	if ((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0)
579 		goto done;
580 	hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_REQ;
581 	hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_RUN;
582 	ext_offset = hmp->recover_stage2_offset;
583 	if (ext_offset == 0) {
584 		kprintf("HAMMER(%s) REDO stage specified but no REDO_SYNC "
585 			"offset, ignoring\n",
586 			root_volume->ondisk->vol_name);
587 		goto done;
588 	}
589 
590 	/*
591 	 * Calculate nominal UNDO range (this is not yet the extended
592 	 * range).
593 	 */
594 	if (last_offset >= first_offset) {
595 		bytes = last_offset - first_offset;
596 	} else {
597 		bytes = rootmap->alloc_offset - first_offset +
598 			(last_offset & HAMMER_OFF_LONG_MASK);
599 	}
600 	kprintf("HAMMER(%s) recovery redo  %016jx-%016jx (%jd bytes)%s\n",
601 		root_volume->ondisk->vol_name,
602 		(intmax_t)first_offset,
603 		(intmax_t)last_offset,
604 		(intmax_t)bytes,
605 		(hmp->ronly ? " (RO)" : " (RW)"));
606 	verbose = 1;
607 	if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
608 		kprintf("Undo size is absurd, unable to mount\n");
609 		error = EIO;
610 		goto fatal;
611 	}
612 
613 	/*
614 	 * Scan the REDOs backwards collecting REDO_TERM_* information.
615 	 * This information is only collected for the extended range,
616 	 * non-inclusive of any TERMs in the nominal UNDO range.
617 	 *
618 	 * If the stage2 extended range is inside the nominal undo range
619 	 * we have nothing to scan.
620 	 *
621 	 * This must fit in memory!
622 	 */
623 	if (first_offset < last_offset) {
624 		/*
625 		 * [      first_offset........last_offset      ]
626 		 */
627 		if (ext_offset < first_offset) {
628 			dorscan = 1;
629 			ext_bytes = first_offset - ext_offset;
630 		} else if (ext_offset > last_offset) {
631 			dorscan = 1;
632 			ext_bytes = (rootmap->alloc_offset - ext_offset) +
633 				    (first_offset & HAMMER_OFF_LONG_MASK);
634 		} else {
635 			ext_bytes = -(ext_offset - first_offset);
636 			dorscan = 0;
637 		}
638 	} else {
639 		/*
640 		 * [......last_offset         first_offset.....]
641 		 */
642 		if (ext_offset < last_offset) {
643 			ext_bytes = -((rootmap->alloc_offset - first_offset) +
644 				    (ext_offset & HAMMER_OFF_LONG_MASK));
645 			dorscan = 0;
646 		} else if (ext_offset > first_offset) {
647 			ext_bytes = -(ext_offset - first_offset);
648 			dorscan = 0;
649 		} else {
650 			ext_bytes = first_offset - ext_offset;
651 			dorscan = 1;
652 		}
653 	}
654 
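	/*
	 * Worked example (illustrative): in the non-wrapped layout with
	 * first_offset 0x400000, last_offset 0x500000 and a REDO_SYNC
	 * pointing at ext_offset 0x300000, ext_offset sits before
	 * first_offset, so dorscan is set and ext_bytes = 0x100000 of
	 * extended range must be reverse-scanned for REDO_TERM_* records.
	 */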
655 	if (dorscan) {
656 		scan_offset = first_offset;
657 		kprintf("HAMMER(%s) Find extended redo  %016jx, %jd extbytes\n",
658 			root_volume->ondisk->vol_name,
659 			(intmax_t)ext_offset,
660 			(intmax_t)ext_bytes);
661 		seqno = hmp->recover_stage2_seqno - 1;
662 		for (;;) {
663 			head = hammer_recover_scan_rev(hmp, root_volume,
664 						       &scan_offset,
665 						       &error, &buffer);
666 			if (error)
667 				break;
668 			if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
669 				if (head->head.hdr_seq != seqno) {
670 					error = ERANGE;
671 					break;
672 				}
673 				error = hammer_recover_redo_rec(
674 						hmp, &rterm_root,
675 						scan_offset, &head->redo);
676 				--seqno;
677 			}
678 			if (scan_offset == ext_offset)
679 				break;
680 		}
681 		if (error) {
682 			kprintf("HAMMER(%s) Find extended redo failed %d, "
683 				"unable to run REDO\n",
684 				root_volume->ondisk->vol_name,
685 				error);
686 			goto done;
687 		}
688 	} else {
689 		kprintf("HAMMER(%s) Embedded extended redo %016jx, "
690 			"%jd extbytes\n",
691 			root_volume->ondisk->vol_name,
692 			(intmax_t)ext_offset,
693 			(intmax_t)ext_bytes);
694 	}
695 
696 	/*
697 	 * Scan the REDO forwards through the entire extended range.
698 	 * Anything with a previously recorded matching TERM is discarded.
699 	 */
700 	scan_offset = ext_offset;
701 	bytes += ext_bytes;
702 
703 	/*
704 	 * NOTE: when doing a forward scan the returned scan_offset is
705 	 *	 for the record following the returned record, so we
706 	 *	 have to play a bit.
707 	 */
708 	while ((int64_t)bytes > 0) {
709 		KKASSERT(scan_offset != last_offset);
710 
711 		oscan_offset = scan_offset;
712 		head = hammer_recover_scan_fwd(hmp, root_volume,
713 					       &scan_offset, &error, &buffer);
714 		if (error)
715 			break;
716 
717 		error = hammer_recover_redo_run(hmp, &rterm_root,
718 						oscan_offset, &head->redo);
719 		if (error) {
720 			kprintf("HAMMER(%s) REDO record at %016jx failed\n",
721 				root_volume->ondisk->vol_name,
722 				(intmax_t)scan_offset - head->head.hdr_size);
723 			break;
724 		}
725 		bytes -= head->head.hdr_size;
726 	}
727 	KKASSERT(error || bytes == 0);
728 
729 done:
730 	if (buffer) {
731 		hammer_rel_buffer(buffer, 0);
732 		buffer = NULL;
733 	}
734 
735 	/*
736 	 * Cleanup rterm tree
737 	 */
738 	{
739 		hammer_rterm_t rterm;
740 		hammer_rterm_entry_t rte;
741 
742 		while ((rterm = RB_ROOT(&rterm_root)) != NULL) {
743 			RB_REMOVE(hammer_rterm_rb_tree, &rterm_root, rterm);
744 			while ((rte = rterm->term_list) != NULL) {
745 				rterm->term_list = rte->next;
746 				kfree(rte, hmp->m_misc);
747 			}
748 			kfree(rterm, hmp->m_misc);
749 		}
750 	}
751 
752 	/*
753 	 * Finish up flushing (or discarding) recovered buffers by executing
754 	 * a normal flush cycle.  Setting HMNT_UNDO_DIRTY bypasses degenerate
755 	 * case tests and forces the flush in order to update the FIFO indices.
756 	 *
757 	 * If a crash occurs during the flush the entire undo/redo will be
758 	 * re-run during recovery on the next mount.
759 	 */
760 	if (error == 0) {
761 		if (rootmap->first_offset != rootmap->next_offset)
762 			hmp->hflags |= HMNT_UNDO_DIRTY;
763 		hammer_flusher_sync(hmp);
764 	}
765 fatal:
766 	hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_RUN;
767 	if (verbose) {
768 		kprintf("HAMMER(%s) End redo recovery\n",
769 			root_volume->ondisk->vol_name);
770 	}
771 
772 	if (error && hammer_skip_redo == 1)
773 		kprintf("HAMMER(%s) recovery redo error %d, "
774 		    "skipping.\n", root_volume->ondisk->vol_name,
775 		    error);
776 
777 	return (hammer_skip_redo ? 0 : error);
778 }
779 
780 /*
781  * Scan backwards from *scan_offsetp, return the FIFO record prior to the
782  * record at *scan_offsetp, or NULL if an error occurred.
783  *
784  * On return *scan_offsetp will be the offset of the returned record.
785  */
786 hammer_fifo_any_t
787 hammer_recover_scan_rev(hammer_mount_t hmp, hammer_volume_t root_volume,
788 			hammer_off_t *scan_offsetp,
789 			int *errorp, struct hammer_buffer **bufferp)
790 {
791 	hammer_off_t scan_offset;
792 	hammer_blockmap_t rootmap;
793 	hammer_fifo_any_t head;
794 	hammer_fifo_tail_t tail;
795 
796 	rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
797 	scan_offset = *scan_offsetp;
798 
799 	if (hammer_debug_general & 0x0080)
800 		kprintf("rev scan_offset %016jx\n", (intmax_t)scan_offset);
801 	if (scan_offset == HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0))
802 		scan_offset = rootmap->alloc_offset;
803 	if (scan_offset - sizeof(*tail) <
804 	    HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0)) {
805 		kprintf("HAMMER(%s) UNDO record at %016jx FIFO underflow\n",
806 			root_volume->ondisk->vol_name,
807 			(intmax_t)scan_offset);
808 		*errorp = EIO;
809 		return (NULL);
810 	}
811 	tail = hammer_bread(hmp, scan_offset - sizeof(*tail),
812 			    errorp, bufferp);
813 	if (*errorp) {
814 		kprintf("HAMMER(%s) Unable to read UNDO TAIL "
815 			"at %016jx\n",
816 			root_volume->ondisk->vol_name,
817 			(intmax_t)scan_offset - sizeof(*tail));
818 		return (NULL);
819 	}
820 
821 	if (hammer_check_tail_signature(tail, scan_offset) != 0) {
822 		kprintf("HAMMER(%s) Illegal UNDO TAIL signature "
823 			"at %016jx\n",
824 			root_volume->ondisk->vol_name,
825 			(intmax_t)scan_offset - sizeof(*tail));
826 		*errorp = EIO;
827 		return (NULL);
828 	}
829 	head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
830 	*scan_offsetp = scan_offset - head->head.hdr_size;
831 
832 	return (head);
833 }
834 
835 /*
836  * Scan forwards from *scan_offsetp, return the FIFO record or NULL if
837  * an error occurred.
838  *
839  * On return *scan_offsetp will be the offset of the record following
840  * the returned record.
841  */
842 hammer_fifo_any_t
843 hammer_recover_scan_fwd(hammer_mount_t hmp, hammer_volume_t root_volume,
844 			hammer_off_t *scan_offsetp,
845 			int *errorp, struct hammer_buffer **bufferp)
846 {
847 	hammer_off_t scan_offset;
848 	hammer_blockmap_t rootmap;
849 	hammer_fifo_any_t head;
850 
851 	rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
852 	scan_offset = *scan_offsetp;
853 
854 	if (hammer_debug_general & 0x0080)
855 		kprintf("fwd scan_offset %016jx\n", (intmax_t)scan_offset);
856 	if (scan_offset == rootmap->alloc_offset)
857 		scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
858 
859 	head = hammer_bread(hmp, scan_offset, errorp, bufferp);
860 	if (*errorp) {
861 		kprintf("HAMMER(%s) Unable to read UNDO HEAD at %016jx\n",
862 			root_volume->ondisk->vol_name,
863 			(intmax_t)scan_offset);
864 		return (NULL);
865 	}
866 
867 	if (hammer_check_head_signature(&head->head, scan_offset) != 0) {
868 		kprintf("HAMMER(%s) Illegal UNDO HEAD signature "
869 			"at %016jx\n",
870 			root_volume->ondisk->vol_name,
871 			(intmax_t)scan_offset);
872 		*errorp = EIO;
873 		return (NULL);
874 	}
875 	scan_offset += head->head.hdr_size;
876 	if (scan_offset == rootmap->alloc_offset)
877 		scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
878 	*scan_offsetp = scan_offset;
879 
880 	return (head);
881 }
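/*
 * Usage sketch (illustrative, assuming locals like those in the stage1
 * and stage2 scan loops above): walking the FIFO forward.  The returned
 * *scan_offsetp already points past the record, so the loop needs no
 * explicit advance.
 */
#if 0
	scan_offset = first_offset;
	while (scan_offset != last_offset) {
		head = hammer_recover_scan_fwd(hmp, root_volume,
					       &scan_offset, &error, &buffer);
		if (error)
			break;
		/* inspect head->head.hdr_type and head->head.hdr_seq */
	}
#endif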
882 
883 /*
884  * Helper function for hammer_check_{head,tail}_signature().  Checks the
885  * record once the head and tail have been established.
886  *
887  * This function validates the entire FIFO record wrapper.
888  */
889 static __inline
890 int
891 _hammer_check_signature(hammer_fifo_head_t head, hammer_fifo_tail_t tail,
892 			hammer_off_t beg_off)
893 {
894 	hammer_off_t end_off;
895 	u_int32_t crc;
896 	int bytes;
897 
898 	/*
899 	 * Check signatures.  The tail signature is allowed to be the
900 	 * head signature only for 8-byte PADs.
901 	 */
902 	if (head->hdr_signature != HAMMER_HEAD_SIGNATURE) {
903 		kprintf("HAMMER: FIFO record bad head signature "
904 			"%04x at %016jx\n",
905 			head->hdr_signature,
906 			(intmax_t)beg_off);
907 		return(2);
908 	}
909 	if (head->hdr_size < HAMMER_HEAD_ALIGN ||
910 	    (head->hdr_size & HAMMER_HEAD_ALIGN_MASK)) {
911 		kprintf("HAMMER: FIFO record unaligned or bad size "
912 			"%04x at %016jx\n",
913 			head->hdr_size,
914 			(intmax_t)beg_off);
915 		return(2);
916 	}
917 	end_off = beg_off + head->hdr_size;
918 
919 	if (head->hdr_type != HAMMER_HEAD_TYPE_PAD ||
920 	    (size_t)(end_off - beg_off) != sizeof(*tail)) {
921 		if (head->hdr_type != tail->tail_type) {
922 			kprintf("HAMMER: FIFO record head/tail type mismatch "
923 				"%04x %04x at %016jx\n",
924 				head->hdr_type, tail->tail_type,
925 				(intmax_t)beg_off);
926 			return(2);
927 		}
928 		if (head->hdr_size != tail->tail_size) {
929 			kprintf("HAMMER: FIFO record head/tail size mismatch "
930 				"%04x %04x at %016jx\n",
931 				head->hdr_size, tail->tail_size,
932 				(intmax_t)beg_off);
933 			return(2);
934 		}
935 		if (tail->tail_signature != HAMMER_TAIL_SIGNATURE) {
936 			kprintf("HAMMER: FIFO record bad tail signature "
937 				"%04x at %016jx\n",
938 				tail->tail_signature,
939 				(intmax_t)beg_off);
940 			return(3);
941 		}
942 	}
943 
944 	/*
945 	 * Non-PAD records must have a CRC and must be sized at
946 	 * least large enough to fit the head and tail.
947 	 */
948 	if (head->hdr_type != HAMMER_HEAD_TYPE_PAD) {
949 		crc = crc32(head, HAMMER_FIFO_HEAD_CRCOFF) ^
950 		      crc32(head + 1, head->hdr_size - sizeof(*head));
951 		if (head->hdr_crc != crc) {
952 			kprintf("HAMMER: FIFO record CRC failed %08x %08x "
953 				"at %016jx\n",
954 				head->hdr_crc, crc,
955 				(intmax_t)beg_off);
956 			return(EIO);
957 		}
958 		if (head->hdr_size < sizeof(*head) + sizeof(*tail)) {
959 			kprintf("HAMMER: FIFO record too small "
960 				"%04x at %016jx\n",
961 				head->hdr_size,
962 				(intmax_t)beg_off);
963 			return(EIO);
964 		}
965 	}
966 
967 	/*
968 	 * Check the tail
969 	 */
970 	bytes = head->hdr_size;
971 	tail = (void *)((char *)head + bytes - sizeof(*tail));
972 	if (tail->tail_size != head->hdr_size) {
973 		kprintf("HAMMER: Bad tail size %04x vs %04x at %016jx\n",
974 			tail->tail_size, head->hdr_size,
975 			(intmax_t)beg_off);
976 		return(EIO);
977 	}
978 	if (tail->tail_type != head->hdr_type) {
979 		kprintf("HAMMER: Bad tail type %04x vs %04x at %016jx\n",
980 			tail->tail_type, head->hdr_type,
981 			(intmax_t)beg_off);
982 		return(EIO);
983 	}
984 
985 	return(0);
986 }
987 
988 /*
989  * Check that the FIFO record is in-bounds given the head and the
990  * hammer offset.
991  *
992  * Also checks that the head and tail structures agree with each other,
993  * but does not check beyond the signature, type, and size.
994  */
995 static int
996 hammer_check_head_signature(hammer_fifo_head_t head, hammer_off_t beg_off)
997 {
998 	hammer_fifo_tail_t tail;
999 	hammer_off_t end_off;
1000 
1001 	/*
1002 	 * head overlaps buffer boundary.  This could be a PAD so only
1003 	 * check the minimum PAD size here.
1004 	 */
1005 	if (((beg_off + sizeof(*tail) - 1) ^ (beg_off)) & ~HAMMER_BUFMASK64)
1006 		return(1);
1007 
1008 	/*
1009 	 * Calculate the ending offset and make sure the record does
1010 	 * not cross a buffer boundary.
1011 	 */
1012 	end_off = beg_off + head->hdr_size;
1013 	if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
1014 		return(1);
1015 	tail = (void *)((char *)head + head->hdr_size - sizeof(*tail));
1016 	return (_hammer_check_signature(head, tail, beg_off));
1017 }
1018 
1019 /*
1020  * Check that the FIFO record is in-bounds given the tail and the
1021  * hammer offset.  The offset is pointing at the ending boundary of the
1022  * record.
1023  *
1024  * Also checks that the head and tail structures agree with each other,
1025  * but does not check beyond the signature, type, and size.
1026  */
1027 static int
1028 hammer_check_tail_signature(hammer_fifo_tail_t tail, hammer_off_t end_off)
1029 {
1030 	hammer_fifo_head_t head;
1031 	hammer_off_t beg_off;
1032 
1033 	/*
1034 	 * tail overlaps buffer boundary
1035 	 */
1036 	if (((end_off - sizeof(*tail)) ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
1037 		return(1);
1038 
1039 	/*
1040 	 * Calculate the beginning offset and make sure the record does
1041 	 * not cross a buffer boundary.
1042 	 */
1043 	beg_off = end_off - tail->tail_size;
1044 	if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
1045 		return(1);
1046 	head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
1047 	return (_hammer_check_signature(head, tail, beg_off));
1048 }
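/*
 * Boundary-test example (illustrative): with 16KB buffers
 * (HAMMER_BUFMASK64 == 0x3fff) a record with beg_off 0x3ff0 and
 * end_off 0x4010 gives (0x3ff0 ^ 0x400f) & ~0x3fff != 0, i.e. the
 * record would straddle a buffer boundary and the two checks above
 * reject it.
 */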
1049 
1050 static int
1051 hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
1052 		    hammer_fifo_undo_t undo)
1053 {
1054 	hammer_volume_t volume;
1055 	hammer_buffer_t buffer;
1056 	hammer_off_t buf_offset;
1057 	int zone;
1058 	int error;
1059 	int vol_no;
1060 	int bytes;
1061 	u_int32_t offset;
1062 
1063 	/*
1064 	 * Only process UNDO records.  Other record types (PADs, REDOs)
1065 	 * are skipped here; the stage1 scan loop inspects REDO_SYNC itself.
1066 	 */
1067 	if (undo->head.hdr_type != HAMMER_HEAD_TYPE_UNDO)
1068 		return(0);
1069 
1070 	/*
1071 	 * Validate the UNDO record.
1072 	 */
1073 	bytes = undo->head.hdr_size - sizeof(*undo) -
1074 		sizeof(struct hammer_fifo_tail);
1075 	if (bytes < 0 || undo->undo_data_bytes < 0 ||
1076 	    undo->undo_data_bytes > bytes) {
1077 		kprintf("HAMMER: Corrupt UNDO record, undo_data_bytes %d/%d\n",
1078 			undo->undo_data_bytes, bytes);
1079 		return(EIO);
1080 	}
1081 
1082 	bytes = undo->undo_data_bytes;
1083 
1084 	/*
1085 	 * The undo offset may only be a zone-1 or zone-2 offset.
1086 	 *
1087 	 * Currently we only support a zone-1 offset representing the
1088 	 * volume header.
1089 	 */
1090 	zone = HAMMER_ZONE_DECODE(undo->undo_offset);
1091 	offset = undo->undo_offset & HAMMER_BUFMASK;
1092 
1093 	if (offset + bytes > HAMMER_BUFSIZE) {
1094 		kprintf("HAMMER: Corrupt UNDO record, bad offset\n");
1095 		return (EIO);
1096 	}
1097 
1098 	switch(zone) {
1099 	case HAMMER_ZONE_RAW_VOLUME_INDEX:
1100 		vol_no = HAMMER_VOL_DECODE(undo->undo_offset);
1101 		volume = hammer_get_volume(hmp, vol_no, &error);
1102 		if (volume == NULL) {
1103 			kprintf("HAMMER: UNDO record, "
1104 				"cannot access volume %d\n", vol_no);
1105 			break;
1106 		}
1107 		hammer_modify_volume_noundo(NULL, volume);
1108 		hammer_recover_copy_undo(undo->undo_offset,
1109 					 (char *)(undo + 1),
1110 					 (char *)volume->ondisk + offset,
1111 					 bytes);
1112 		hammer_modify_volume_done(volume);
1113 
1114 		/*
1115 		 * Multiple modifications may be made to the same buffer.
1116 		 * Also, the volume header cannot be written out until
1117 		 * everything else has been flushed.  This also
1118 		 * covers the read-only case by preventing the kernel from
1119 		 * flushing the buffer.
1120 		 */
1121 		if (volume->io.recovered == 0)
1122 			volume->io.recovered = 1;
1123 		else
1124 			hammer_rel_volume(volume, 0);
1125 		break;
1126 	case HAMMER_ZONE_RAW_BUFFER_INDEX:
1127 		buf_offset = undo->undo_offset & ~HAMMER_BUFMASK64;
1128 		buffer = hammer_get_buffer(hmp, buf_offset, HAMMER_BUFSIZE,
1129 					   0, &error);
1130 		if (buffer == NULL) {
1131 			kprintf("HAMMER: UNDO record, "
1132 				"cannot access buffer %016jx\n",
1133 				(intmax_t)undo->undo_offset);
1134 			break;
1135 		}
1136 		hammer_modify_buffer_noundo(NULL, buffer);
1137 		hammer_recover_copy_undo(undo->undo_offset,
1138 					 (char *)(undo + 1),
1139 					 (char *)buffer->ondisk + offset,
1140 					 bytes);
1141 		hammer_modify_buffer_done(buffer);
1142 
1143 		/*
1144 		 * Multiple modifications may be made to the same buffer,
1145 		 * improve performance by delaying the flush.  This also
1146 		 * covers the read-only case by preventing the kernel from
1147 		 * flushing the buffer.
1148 		 */
1149 		if (buffer->io.recovered == 0)
1150 			buffer->io.recovered = 1;
1151 		else
1152 			hammer_rel_buffer(buffer, 0);
1153 		break;
1154 	default:
1155 		kprintf("HAMMER: Corrupt UNDO record\n");
1156 		error = EIO;
1157 	}
1158 	return (error);
1159 }
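/*
 * On-media layout handled above (for reference): an UNDO record is
 *
 *	struct hammer_fifo_undo		header, undo_offset/data_bytes
 *	saved data			undo_data_bytes, at (undo + 1)
 *	alignment padding
 *	struct hammer_fifo_tail		tail
 *
 * which is why the payload limit is hdr_size minus the header and
 * tail sizes.
 */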
1160 
1161 static void
1162 hammer_recover_copy_undo(hammer_off_t undo_offset,
1163 			 char *src, char *dst, int bytes)
1164 {
1165 	if (hammer_debug_general & 0x0080) {
1166 		kprintf("UNDO %016jx: %d\n",
1167 			(intmax_t)undo_offset, bytes);
1168 	}
1169 #if 0
1170 	kprintf("UNDO %016jx:", (intmax_t)undo_offset);
1171 	hammer_recover_debug_dump(22, dst, bytes);
1172 	kprintf("%22s", "to:");
1173 	hammer_recover_debug_dump(22, src, bytes);
1174 #endif
1175 	bcopy(src, dst, bytes);
1176 }
1177 
1178 /*
1179  * Record HAMMER_REDO_TERM_WRITE and HAMMER_REDO_TERM_TRUNC operations
1180  * during the backwards scan of the extended UNDO/REDO FIFO.  This scan
1181  * does not include the nominal UNDO range, just the extended range.
1182  */
1183 int
1184 hammer_recover_redo_rec(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root,
1185 			hammer_off_t scan_offset, hammer_fifo_redo_t redo)
1186 {
1187 	hammer_rterm_t rterm;
1188 	hammer_rterm_t nrterm;
1189 	hammer_rterm_entry_t rte;
1190 
1191 	if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO)
1192 		return(0);
1193 	if (redo->redo_flags != HAMMER_REDO_TERM_WRITE &&
1194 	    redo->redo_flags != HAMMER_REDO_TERM_TRUNC) {
1195 		return(0);
1196 	}
1197 
1198 	nrterm = kmalloc(sizeof(*nrterm), hmp->m_misc, M_WAITOK|M_ZERO);
1199 	nrterm->redo_objid = redo->redo_objid;
1200 	nrterm->redo_localization = redo->redo_localization;
1201 	nrterm->redo_flags = redo->redo_flags;
1202 	nrterm->redo_offset = redo->redo_offset;
1203 
1204 	rterm = RB_INSERT(hammer_rterm_rb_tree, root, nrterm);
1205 	if (rterm)
1206 		kfree(nrterm, hmp->m_misc);
1207 	else
1208 		rterm = nrterm;
1209 
1210 	if (bootverbose) {
1211 		kprintf("record record %016jx objid %016jx "
1212 			"offset %016jx flags %08x\n",
1213 			(intmax_t)scan_offset,
1214 			(intmax_t)redo->redo_objid,
1215 			(intmax_t)redo->redo_offset,
1216 			(int)redo->redo_flags);
1217 	}
1218 
1219 	/*
1220 	 * Scan in reverse order, rte prepended, so the rte list will be
1221 	 * in forward order.
1222 	 */
1223 	rte = kmalloc(sizeof(*rte), hmp->m_misc, M_WAITOK|M_ZERO);
1224 	rte->fifo_offset = scan_offset;
1225 	rte->next = rterm->term_list;
1226 	rterm->term_list = rte;
1227 
1228 	return(0);
1229 }
1230 
1231 /*
1232  * Execute HAMMER_REDO_WRITE and HAMMER_REDO_TRUNC operations during
1233  * the forwards scan of the entire extended UNDO/REDO FIFO range.
1234  *
1235  * Records matching previously recorded TERMs have already been committed
1236  * and are ignored.
1237  */
1238 int
1239 hammer_recover_redo_run(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root,
1240 			hammer_off_t scan_offset, hammer_fifo_redo_t redo)
1241 {
1242 	struct hammer_rterm rtval;
1243 	hammer_rterm_t rterm;
1244 	hammer_rterm_entry_t rte;
1245 
1246 	if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO)
1247 		return(0);
1248 
1249 	switch(redo->redo_flags) {
1250 	case HAMMER_REDO_WRITE:
1251 	case HAMMER_REDO_TRUNC:
1252 		/*
1253 		 * We hit a REDO request.  The REDO request is only executed
1254 		 * if there is no matching TERM.
1255 		 */
1256 		bzero(&rtval, sizeof(rtval));
1257 		rtval.redo_objid = redo->redo_objid;
1258 		rtval.redo_localization = redo->redo_localization;
1259 		rtval.redo_offset = redo->redo_offset;
1260 		rtval.redo_flags = (redo->redo_flags == HAMMER_REDO_WRITE) ?
1261 				   HAMMER_REDO_TERM_WRITE :
1262 				   HAMMER_REDO_TERM_TRUNC;
1263 
1264 		rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval);
1265 		if (rterm) {
1266 			if (bootverbose) {
1267 				kprintf("ignore record %016jx objid %016jx "
1268 					"offset %016jx flags %08x\n",
1269 					(intmax_t)scan_offset,
1270 					(intmax_t)redo->redo_objid,
1271 					(intmax_t)redo->redo_offset,
1272 					(int)redo->redo_flags);
1273 			}
1274 			break;
1275 		}
1276 		if (bootverbose) {
1277 			kprintf("run    record %016jx objid %016jx "
1278 				"offset %016jx flags %08x\n",
1279 				(intmax_t)scan_offset,
1280 				(intmax_t)redo->redo_objid,
1281 				(intmax_t)redo->redo_offset,
1282 				(int)redo->redo_flags);
1283 		}
1284 
1285 		/*
1286 		 * Redo stage2 can access a live filesystem, acquire the
1287 		 * vnode.
1288 		 */
1289 		hammer_recover_redo_exec(hmp, redo);
1290 		break;
1291 	case HAMMER_REDO_TERM_WRITE:
1292 	case HAMMER_REDO_TERM_TRUNC:
1293 		/*
1294 		 * As we encounter TERMs in the forward scan we remove
1295 		 * them.  Once the forward scan hits the nominal undo range
1296 		 * there will be no more recorded TERMs.
1297 		 */
1298 		bzero(&rtval, sizeof(rtval));
1299 		rtval.redo_objid = redo->redo_objid;
1300 		rtval.redo_localization = redo->redo_localization;
1301 		rtval.redo_flags = redo->redo_flags;
1302 		rtval.redo_offset = redo->redo_offset;
1303 
1304 		rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval);
1305 		if (rterm) {
1306 			if ((rte = rterm->term_list) != NULL) {
1307 				KKASSERT(rte->fifo_offset == scan_offset);
1308 				rterm->term_list = rte->next;
1309 				kfree(rte, hmp->m_misc);
1310 			}
1311 		}
1312 		break;
1313 	}
1314 	return(0);
1315 }
1316 
1317 static void
1318 hammer_recover_redo_exec(hammer_mount_t hmp, hammer_fifo_redo_t redo)
1319 {
1320 	struct hammer_transaction trans;
1321 	struct vattr va;
1322 	struct hammer_inode *ip;
1323 	struct vnode *vp = NULL;
1324 	int error;
1325 
1326 	hammer_start_transaction(&trans, hmp);
1327 
1328 	ip = hammer_get_inode(&trans, NULL, redo->redo_objid,
1329 			      HAMMER_MAX_TID, redo->redo_localization,
1330 			      0, &error);
1331 	if (ip == NULL) {
1332 		kprintf("unable to find objid %016jx:%08x\n",
1333 			(intmax_t)redo->redo_objid, redo->redo_localization);
1334 		goto done2;
1335 	}
1336 	error = hammer_get_vnode(ip, &vp);
1337 	if (error) {
1338 		kprintf("unable to acquire vnode for %016jx:%08x\n",
1339 			(intmax_t)redo->redo_objid, redo->redo_localization);
1340 		goto done1;
1341 	}
1342 
1343 	switch(redo->redo_flags) {
1344 	case HAMMER_REDO_WRITE:
1345 		error = VOP_OPEN(vp, FREAD|FWRITE, proc0.p_ucred, NULL);
1346 		if (error) {
1347 			kprintf("vn_rdwr open %016jx:%08x returned %d\n",
1348 				(intmax_t)redo->redo_objid,
1349 				redo->redo_localization, error);
1350 			break;
1351 		}
1352 		vn_unlock(vp);
1353 		error = vn_rdwr(UIO_WRITE, vp, (void *)(redo + 1),
1354 				redo->redo_data_bytes,
1355 				redo->redo_offset, UIO_SYSSPACE,
1356 				0, proc0.p_ucred, NULL);
1357 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1358 		if (error) {
1359 			kprintf("write %016jx:%08x returned %d\n",
1360 				(intmax_t)redo->redo_objid,
1361 				redo->redo_localization, error);
1362 		}
1363 		VOP_CLOSE(vp, FREAD|FWRITE, NULL);
1364 		break;
1365 	case HAMMER_REDO_TRUNC:
1366 		VATTR_NULL(&va);
1367 		va.va_size = redo->redo_offset;
1368 		error = VOP_SETATTR(vp, &va, proc0.p_ucred);
1369 		if (error) {
1370 			kprintf("setattr offset %016jx error %d\n",
1371 				(intmax_t)redo->redo_offset, error);
1372 		}
1373 		break;
1374 	}
1375 	vput(vp);
1376 done1:
1377 	hammer_rel_inode(ip, 0);
1378 done2:
1379 	hammer_done_transaction(&trans);
1380 }
1381 
1382 /*
1383  * RB tree compare function.  Note that REDO_TERM_TRUNC ops ignore
1384  * the offset.
1385  *
1386  * WRITE@0 TERM@0 WRITE@0 .... (no TERM@0) etc.
1387  */
1388 static int
1389 hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2)
1390 {
1391 	if (rt1->redo_objid < rt2->redo_objid)
1392 		return(-1);
1393 	if (rt1->redo_objid > rt2->redo_objid)
1394 		return(1);
1395 	if (rt1->redo_localization < rt2->redo_localization)
1396 		return(-1);
1397 	if (rt1->redo_localization > rt2->redo_localization)
1398 		return(1);
1399 	if (rt1->redo_flags < rt2->redo_flags)
1400 		return(-1);
1401 	if (rt1->redo_flags > rt2->redo_flags)
1402 		return(1);
1403 	if (rt1->redo_flags != HAMMER_REDO_TERM_TRUNC) {
1404 		if (rt1->redo_offset < rt2->redo_offset)
1405 			return(-1);
1406 		if (rt1->redo_offset > rt2->redo_offset)
1407 			return(1);
1408 	}
1409 	return(0);
1410 }
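/*
 * Matching example (illustrative): a REDO_WRITE at file offset 0 pairs
 * only with a recorded REDO_TERM_WRITE for the same objid/localization
 * at the same offset, while REDO_TERM_TRUNC keys compare equal at any
 * offset, so one recorded TERM_TRUNC covers truncations on the inode
 * regardless of the truncation point.
 */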
1411 
1412 #if 0
1413 
1414 static void
1415 hammer_recover_debug_dump(int w, char *buf, int bytes)
1416 {
1417 	int i;
1418 
1419 	for (i = 0; i < bytes; ++i) {
1420 		if (i && (i & 15) == 0)
1421 			kprintf("\n%*.*s", w, w, "");
1422 		kprintf(" %02x", (unsigned char)buf[i]);
1423 	}
1424 	kprintf("\n");
1425 }
1426 
1427 #endif
1428 
1429 /*
1430  * Flush recovered buffers from recovery operations.  The call to this
1431  * routine may be delayed if a read-only mount was made and then later
1432  * upgraded to read-write.  This routine is also called when unmounting
1433  * a read-only mount to clean out recovered (dirty) buffers which we
1434  * couldn't flush (because the mount is read-only).
1435  *
1436  * The volume header is always written last.  The UNDO FIFO will be forced
1437  * to zero-length by setting next_offset to first_offset.  This leaves the
1438  * (now stale) UNDO information used to recover the disk available for
1439  * forensic analysis.
1440  *
1441  * final is typically 0 or 1.  The volume header is only written if final
1442  * is 1.  If final is -1 the recovered buffers are discarded instead of
1443  * written and root_volume can also be passed as NULL in that case.
1444  */
1445 static int hammer_recover_flush_volume_callback(hammer_volume_t, void *);
1446 static int hammer_recover_flush_buffer_callback(hammer_buffer_t, void *);
1447 
1448 void
1449 hammer_recover_flush_buffers(hammer_mount_t hmp, hammer_volume_t root_volume,
1450 			     int final)
1451 {
1452 	/*
1453 	 * Flush the buffers out asynchronously, wait for all the I/O to
1454 	 * complete, then do it again to destroy the buffer cache buffer
1455 	 * so it doesn't alias something later on.
1456 	 */
1457 	RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
1458 		hammer_recover_flush_buffer_callback, &final);
1459 	hammer_io_wait_all(hmp, "hmrrcw", 1);
1460 	RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
1461 		hammer_recover_flush_buffer_callback, &final);
1462 
1463 	/*
1464 	 * Flush all volume headers except the root volume.  If final < 0
1465 	 * we discard all volume headers including the root volume.
1466 	 */
1467 	if (final >= 0) {
1468 		RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
1469 			hammer_recover_flush_volume_callback, root_volume);
1470 	} else {
1471 		RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
1472 			hammer_recover_flush_volume_callback, NULL);
1473 	}
1474 
1475 	/*
1476 	 * Finalize the root volume header.
1477 	 *
1478 	 * No interlock is needed, volume buffers are not
1479 	 * messed with by bioops.
1480 	 */
1481 	if (root_volume && root_volume->io.recovered && final > 0) {
1482 		hammer_io_wait_all(hmp, "hmrflx", 1);
1483 		root_volume->io.recovered = 0;
1484 		hammer_io_flush(&root_volume->io, 0);
1485 		hammer_rel_volume(root_volume, 0);
1486 		hammer_io_wait_all(hmp, "hmrfly", 1);
1487 	}
1488 }
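/*
 * Caller sketch (for reference, per the stage1 code above): stage1
 * invokes this with final == 0 to make room when too many dirty buffers
 * accumulate mid-recovery, with final == 1 to finalize a successful
 * read-write recovery (the volume header going out last), and with
 * final == -1 to discard the recovered buffers after a failure.
 */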
1489 
1490 /*
1491  * Callback to flush volume headers.  If discarding, data will be NULL and
1492  * all volume headers (including the root volume) will be discarded.
1493  * Otherwise data is the root_volume and we flush all volume headers
1494  * EXCEPT the root_volume.
1495  *
1496  * Clear any I/O error or modified condition when discarding buffers to
1497  * clean up the reference count, otherwise the buffer may have extra refs
1498  * on it.
1499  */
1500 static
1501 int
1502 hammer_recover_flush_volume_callback(hammer_volume_t volume, void *data)
1503 {
1504 	hammer_volume_t root_volume = data;
1505 
1506 	if (volume->io.recovered && volume != root_volume) {
1507 		volume->io.recovered = 0;
1508 		if (root_volume != NULL) {
1509 			/*
1510 			 * No interlock is needed, volume buffers are not
1511 			 * messed with by bioops.
1512 			 */
1513 			hammer_io_flush(&volume->io, 0);
1514 		} else {
1515 			hammer_io_clear_error(&volume->io);
1516 			hammer_io_clear_modify(&volume->io, 1);
1517 		}
1518 		hammer_rel_volume(volume, 0);
1519 	}
1520 	return(0);
1521 }
1522 
1523 /*
1524  * Flush or discard recovered I/O buffers.
1525  *
1526  * Clear any I/O error or modified condition when discarding buffers to
1527  * clean up the reference count, otherwise the buffer may have extra refs
1528  * on it.
1529  */
1530 static
1531 int
1532 hammer_recover_flush_buffer_callback(hammer_buffer_t buffer, void *data)
1533 {
1534 	int final = *(int *)data;
1535 	int flush;
1536 
1537 	if (buffer->io.recovered) {
1538 		buffer->io.recovered = 0;
1539 		buffer->io.reclaim = 1;
1540 		if (final < 0) {
1541 			hammer_io_clear_error(&buffer->io);
1542 			hammer_io_clear_modify(&buffer->io, 1);
1543 		} else {
1544 			hammer_io_write_interlock(&buffer->io);
1545 			hammer_io_flush(&buffer->io, 0);
1546 			hammer_io_done_interlock(&buffer->io);
1547 		}
1548 		hammer_rel_buffer(buffer, 0);
1549 	} else {
1550 		flush = hammer_ref_interlock(&buffer->io.lock);
1551 		if (flush)
1552 			atomic_add_int(&hammer_count_refedbufs, 1);
1553 
1554 		if (final < 0) {
1555 			hammer_io_clear_error(&buffer->io);
1556 			hammer_io_clear_modify(&buffer->io, 1);
1557 		}
1558 		KKASSERT(hammer_oneref(&buffer->io.lock));
1559 		buffer->io.reclaim = 1;
1560 		hammer_rel_buffer(buffer, flush);
1561 	}
1562 	return(0);
1563 }
1564 
1565