xref: /dragonfly/sys/vfs/hammer/hammer_recover.c (revision 73610d44)
/*
 * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * UNDO ALGORITHM:
 *
 *	The UNDO algorithm is trivial.  The nominal UNDO range in the
 *	FIFO is determined by taking the first/next offset stored in
 *	the volume header.  The next offset may not be correct since
 *	UNDO flushes are not required to flush the volume header, so
 *	the code also scans forward until it finds a discontinuous
 *	sequence number.
 *
 *	The UNDOs are then scanned and executed in reverse order.  These
 *	UNDOs are effectively just data restorations based on HAMMER offsets.
 *
 * REDO ALGORITHM:
 *
 *	REDO records are laid down in the UNDO/REDO FIFO for nominal
 *	writes, truncations, and file extension ops.  On a per-inode
 *	basis two types of REDO records are generated, REDO_WRITE
 *	and REDO_TRUNC.
 *
 *	Essentially the recovery block will contain UNDO records backing
 *	out partial operations and REDO records to regenerate those partial
 *	operations guaranteed by the filesystem during recovery.
 *
 *	REDO generation is optional, and can also be started and then
 *	later stopped due to excessive write()s between fsyncs, or not
 *	started at all.  Because of this the recovery code must determine
 *	when REDOs are valid and when they are not.  Additional records are
 *	generated to help figure it out.
 *
 *	The REDO_TERM_WRITE and REDO_TERM_TRUNC records are generated
 *	during a flush cycle indicating which records the flush cycle
 *	has synched meta-data for, and HAMMER_REDO_SYNC is generated in
 *	each flush cycle to indicate how far back in the UNDO/REDO FIFO
 *	the recovery code must go to find the earliest applicable REDO
 *	record.  Applicable REDO records can be far outside the nominal
 *	UNDO recovery range, for example if a write() lays down a REDO but
 *	the related file is not flushed for several cycles.
 *
 *	The SYNC reference is to a point prior to the nominal UNDO FIFO
 *	range, creating an extended REDO range which must be scanned.
 *
 *	Any REDO_WRITE/REDO_TRUNC encountered within the extended range
 *	which have no matching REDO_TERM_WRITE/REDO_TERM_TRUNC records
 *	prior to the start of the nominal UNDO range are applicable.
 *	That is, any REDO_TERM_* records in the extended range but not in
 *	the nominal undo range will mask any redo operations for prior REDO
 *	records.  This is necessary because once the TERM is laid down
 *	followup operations may make additional changes to the related
 *	records but not necessarily record them as REDOs (because REDOs are
 *	optional).
 *
 *	REDO_TERM_WRITE/REDO_TERM_TRUNC records in the nominal UNDO range
 *	must be ignored since they represent meta-data flushes which are
 *	undone by the UNDOs in that nominal UNDO range by the recovery
 *	code.  Only REDO_TERM_* records in the extended range but not
 *	in the nominal undo range are applicable.
 *
 *	The REDO_SYNC record itself always exists in the nominal UNDO range
 *	(this is how the extended range is determined).  For recovery
 *	purposes the most recent REDO_SYNC record is always used if several
 *	are found.
 *
 * CRASHES DURING UNDO/REDO
 *
 *	A crash during the UNDO phase requires no additional effort.  The
 *	UNDOs will simply be re-run again.  The state of the UNDO/REDO fifo
 *	remains unchanged and has no re-crash issues.
 *
 *	A crash during the REDO phase is more complex because the REDOs
 *	run normal filesystem ops and generate additional UNDO/REDO records.
 *	REDO is disabled during REDO recovery and any SYNC records generated
 *	by flushes during REDO recovery must continue to reference the
 *	original extended range.
 *
 *	If multiple crashes occur and the UNDO/REDO FIFO wraps, REDO recovery
 *	may become impossible.  This is detected when the start of the
 *	extended range fails to have monotonically increasing sequence
 *	numbers leading into the nominal undo range.
 */
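
/*
 * Illustrative layout (simple case, no FIFO wrap): the forward REDO scan
 * in stage 2 covers the extended range followed by the nominal UNDO range.
 *
 *           extended REDO range        nominal UNDO range
 *         |<----------------------->|<----------------------->|
 *    -----+-------------------------+-------------------------+-----
 *         ^                         ^                         ^
 *    REDO_SYNC's              first_offset               last_offset
 *    redo_offset
 */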

#include "hammer.h"

/*
 * Specify the way we want to handle stage2 errors.
 *
 * The following values are accepted:
 *
 * 0 - Run redo recovery normally and fail to mount if
 *     the operation fails (default).
 * 1 - Run redo recovery, but don't fail to mount if the
 *     operation fails.
 * 2 - Completely skip redo recovery (only for severe error
 *     conditions and/or debugging).
 */
static int hammer_skip_redo = 0;
TUNABLE_INT("vfs.hammer.skip_redo", &hammer_skip_redo);
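
/*
 * As a boot tunable this can be set from the loader, e.g. in
 * /boot/loader.conf (illustrative):
 *
 *	vfs.hammer.skip_redo="2"
 */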

/*
 * Each rterm entry has a list of fifo offsets indicating termination
 * points.  These are stripped as the scan progresses.
 */
typedef struct hammer_rterm_entry {
	struct hammer_rterm_entry *next;
	hammer_off_t		fifo_offset;
} *hammer_rterm_entry_t;

/*
 * rterm entries sorted in RB tree are indexed by objid, flags, and offset.
 * TRUNC entries ignore the offset.
 */
typedef struct hammer_rterm {
	RB_ENTRY(hammer_rterm)	rb_node;
	int64_t			redo_objid;
	u_int32_t		redo_localization;
	u_int32_t		redo_flags;
	hammer_off_t		redo_offset;
	hammer_rterm_entry_t	term_list;
} *hammer_rterm_t;

static int hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2);
struct hammer_rterm_rb_tree;
RB_HEAD(hammer_rterm_rb_tree, hammer_rterm);
RB_PROTOTYPE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp);

static int hammer_check_tail_signature(hammer_fifo_tail_t tail,
			hammer_off_t end_off);
static int hammer_check_head_signature(hammer_fifo_head_t head,
			hammer_off_t beg_off);
static void hammer_recover_copy_undo(hammer_off_t undo_offset,
			char *src, char *dst, int bytes);
static hammer_fifo_any_t hammer_recover_scan_fwd(hammer_mount_t hmp,
			hammer_volume_t root_volume,
			hammer_off_t *scan_offsetp,
			int *errorp, struct hammer_buffer **bufferp);
static hammer_fifo_any_t hammer_recover_scan_rev(hammer_mount_t hmp,
			hammer_volume_t root_volume,
			hammer_off_t *scan_offsetp,
			int *errorp, struct hammer_buffer **bufferp);
#if 0
static void hammer_recover_debug_dump(int w, char *buf, int bytes);
#endif
static int hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
			hammer_fifo_undo_t undo);
static int hammer_recover_redo_rec(hammer_mount_t hmp,
			struct hammer_rterm_rb_tree *root,
			hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo);
static int hammer_recover_redo_run(hammer_mount_t hmp,
			struct hammer_rterm_rb_tree *root,
			hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo);
static void hammer_recover_redo_exec(hammer_mount_t hmp,
			hammer_fifo_redo_t redo);

RB_GENERATE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp);

/*
 * Recover filesystem meta-data on mount.  This procedure figures out the
 * UNDO FIFO range and runs the UNDOs backwards.  The FIFO pointers are not
 * resynchronized by this procedure.
 *
 * This procedure is run near the beginning of the mount sequence, before
 * any B-Tree or high-level accesses are enabled, and is responsible for
 * restoring the meta-data to a consistent state.  High level HAMMER data
 * structures (such as the B-Tree) cannot be accessed here.
 *
 * NOTE: No information from the root volume has been cached in the
 *	 hammer_mount structure yet, so we need to access the root volume's
 *	 buffer directly.
 */
int
hammer_recover_stage1(hammer_mount_t hmp, hammer_volume_t root_volume)
{
	hammer_blockmap_t rootmap;
	hammer_buffer_t buffer;
	hammer_off_t scan_offset;
	hammer_off_t scan_offset_save;
	hammer_off_t bytes;
	hammer_fifo_any_t head;
	hammer_off_t first_offset;
	hammer_off_t last_offset;
	u_int32_t seqno;
	int error;
	int degenerate_case = 0;

	/*
	 * Examine the UNDO FIFO indices in the volume header.
	 */
	rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
	first_offset = rootmap->first_offset;
	last_offset  = rootmap->next_offset;
	buffer = NULL;
	error = 0;

	hmp->recover_stage2_offset = 0;

	if (first_offset > rootmap->alloc_offset ||
	    last_offset > rootmap->alloc_offset) {
		hvkprintf(root_volume,
			"Illegal UNDO FIFO index range "
			"%016jx, %016jx limit %016jx\n",
			(intmax_t)first_offset,
			(intmax_t)last_offset,
			(intmax_t)rootmap->alloc_offset);
		error = EIO;
		goto done;
	}

	/*
	 * In HAMMER version 4+ filesystems the volume header does NOT
	 * contain definitive UNDO FIFO state.  In particular, the
	 * rootmap->next_offset may not be indexed completely to the
	 * end of the active UNDO FIFO.
	 */
	if (hmp->version >= HAMMER_VOL_VERSION_FOUR) {
		/*
		 * To find the definitive range we must first scan backwards
		 * from first_offset to locate the first real record and
		 * extract the sequence number from it.  This record is not
		 * part of the active undo space.
		 */
		scan_offset = first_offset;
		seqno = 0;

		for (;;) {
			head = hammer_recover_scan_rev(hmp, root_volume,
						       &scan_offset,
						       &error, &buffer);
			if (error)
				break;
			if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
				seqno = head->head.hdr_seq;
				break;
			}
		}
		if (error) {
			hvkprintf(root_volume,
				"recovery failure during seqno backscan\n");
			goto done;
		}

		/*
		 * Scan forwards from first_offset and (seqno+1) looking
		 * for a sequence space discontinuity.  This denotes the
		 * end of the active FIFO area.
		 *
		 * NOTE: For the case where the FIFO is empty the very first
		 *	 record we find will be discontinuous.
		 *
		 * NOTE: Do not include trailing PADs in the scan range,
		 *	 and remember the returned scan_offset after a
		 *	 fwd iteration points to the end of the returned
		 *	 record.
		 */
		hvkprintf(root_volume, "recovery check seqno=%08x\n", seqno);

		scan_offset = first_offset;
		scan_offset_save = scan_offset;
		++seqno;
		hmp->recover_stage2_seqno = seqno;

		for (;;) {
			head = hammer_recover_scan_fwd(hmp, root_volume,
						       &scan_offset,
						       &error, &buffer);
			if (error)
				break;
			if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
				if (seqno != head->head.hdr_seq) {
					scan_offset = scan_offset_save;
					break;
				}
				scan_offset_save = scan_offset;
				++seqno;
			}

#if 0
			/*
			 * If the forward scan is grossly ahead of last_offset
			 * then something is wrong.  last_offset is supposed
			 * to have been flushed out by this point.
			 */
			if (last_offset >= scan_offset) {
				bytes = last_offset - scan_offset;
			} else {
				bytes = rootmap->alloc_offset - scan_offset +
					(last_offset & HAMMER_OFF_LONG_MASK);
			}
			if (bytes >
			    (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK) *
			    4 / 5) {
				hvkprintf(root_volume,
					"recovery forward scan is "
					"grossly beyond the last_offset in "
					"the volume header, this can't be "
					"right.\n");
				error = EIO;
				break;
			}
#endif
		}

		/*
		 * Store the seqno.  This will be the next seqno we lay down
		 * when generating new UNDOs.
		 */
		hmp->undo_seqno = seqno;
		if (error) {
			hvkprintf(root_volume,
				"recovery failure during seqno fwdscan\n");
			goto done;
		}
		last_offset = scan_offset;
		hvkprintf(root_volume,
			"recovery range %016jx-%016jx\n",
			(intmax_t)first_offset,
			(intmax_t)last_offset);
		hvkprintf(root_volume,
			"recovery nexto %016jx endseqno=%08x\n",
			(intmax_t)rootmap->next_offset,
			seqno);
	}

	/*
	 * Calculate the size of the active portion of the FIFO.  If the
	 * FIFO is empty the filesystem is clean and no further action is
	 * needed.
	 */
	if (last_offset >= first_offset) {
		bytes = last_offset - first_offset;
	} else {
		bytes = rootmap->alloc_offset - first_offset +
			(last_offset & HAMMER_OFF_LONG_MASK);
	}
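	/*
	 * Worked example (illustrative, treating offsets as plain byte
	 * indices and ignoring the zone encoding in the upper bits): with
	 * a 1GB UNDO space, first_offset at 900MB and last_offset wrapped
	 * around to 100MB, the active region is
	 * (1024MB - 900MB) + 100MB = 224MB.
	 */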
	if (bytes == 0) {
		degenerate_case = 1;
		error = 0;
		goto done;
	}

	hvkprintf(root_volume,
		"recovery undo  %016jx-%016jx (%jd bytes)%s\n",
		(intmax_t)first_offset,
		(intmax_t)last_offset,
		(intmax_t)bytes,
		(hmp->ronly ? " (RO)" : " (RW)"));
	if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
		hkprintf("Undo size is absurd, unable to mount\n");
		error = EIO;
		goto done;
	}

	/*
	 * Scan the UNDOs backwards.
	 */
	scan_offset = last_offset;

	while ((int64_t)bytes > 0) {
		KKASSERT(scan_offset != first_offset);
		head = hammer_recover_scan_rev(hmp, root_volume,
					       &scan_offset, &error, &buffer);
		if (error)
			break;

		/*
		 * Normal UNDO
		 */
		error = hammer_recover_undo(hmp, root_volume, &head->undo);
		if (error) {
			hvkprintf(root_volume,
				"UNDO record at %016jx failed\n",
				(intmax_t)scan_offset - head->head.hdr_size);
			break;
		}

		/*
		 * The first REDO_SYNC record encountered (scanning backwards)
		 * enables REDO processing.
		 */
		if (head->head.hdr_type == HAMMER_HEAD_TYPE_REDO &&
		    head->redo.redo_flags == HAMMER_REDO_SYNC) {
			if (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) {
				hvkprintf(root_volume,
					"Ignoring extra REDO_SYNC "
					"records in UNDO/REDO FIFO.\n");
			} else {
				hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_REQ;
				hmp->recover_stage2_offset =
					head->redo.redo_offset;
				hvkprintf(root_volume,
					"Found REDO_SYNC %016jx\n",
					(intmax_t)head->redo.redo_offset);
			}
		}

		bytes -= head->head.hdr_size;

		/*
		 * If too many dirty buffers have built up we have to flush
		 * them out.  As long as we do not flush out the volume
		 * header a crash here should not cause any problems.
		 *
		 * The buffer must be released so the flush can assert that
		 * all buffers are idle.
		 */
		if (hammer_flusher_meta_limit(hmp)) {
			if (buffer) {
				hammer_rel_buffer(buffer, 0);
				buffer = NULL;
			}
			if (hmp->ronly == 0) {
				hammer_recover_flush_buffers(hmp, root_volume,
							     0);
				hvkprintf(root_volume, "Continuing recovery\n");
			} else {
				hvkprintf(root_volume,
					"Recovery failure: "
					"Insufficient buffer cache to hold "
					"dirty buffers on read-only mount!\n");
				error = EIO;
				break;
			}
		}
	}
	KKASSERT(error || bytes == 0);
done:
	if (buffer) {
		hammer_rel_buffer(buffer, 0);
		buffer = NULL;
	}

	/*
	 * After completely flushing all the recovered buffers the volume
	 * header will also be flushed.
	 */
	if (root_volume->io.recovered == 0) {
		hammer_ref_volume(root_volume);
		root_volume->io.recovered = 1;
	}

	/*
	 * Finish up flushing (or discarding) recovered buffers.  FIFO
	 * indices in the volume header are updated to the actual undo
	 * range but will not be collapsed until stage 2.
	 */
	if (error == 0) {
		hammer_modify_volume_noundo(NULL, root_volume);
		rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
		rootmap->first_offset = first_offset;
		rootmap->next_offset = last_offset;
		hammer_modify_volume_done(root_volume);
		if (hmp->ronly == 0)
			hammer_recover_flush_buffers(hmp, root_volume, 1);
	} else {
		hammer_recover_flush_buffers(hmp, root_volume, -1);
	}
	if (degenerate_case == 0) {
		hvkprintf(root_volume, "recovery complete\n");
	} else {
		hvkprintf(root_volume, "mounted clean, no recovery needed\n");
	}
	return (error);
}
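
/*
 * Illustrative only (not compiled): a sketch of the expected calling
 * order, assuming a caller resembling the HAMMER VFS mount path (the
 * actual call sites live in the VFS code and may differ).  Stage 1 runs
 * early in the mount, before the B-Tree is usable; stage 2 runs late,
 * and only on (or upon upgrade to) a read-write mount.
 */
#if 0
	error = hammer_recover_stage1(hmp, root_volume);
	/* ... complete mount initialization ... */
	if (error == 0 && hmp->ronly == 0)
		error = hammer_recover_stage2(hmp, root_volume);
#endif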

/*
 * Execute redo operations
 *
 * This procedure is run at the end of the mount sequence, after the hammer
 * mount structure has been completely initialized but before the filesystem
 * goes live.  It can access standard cursors, the B-Tree, flush the
 * filesystem, and so forth.
 *
 * This code may only be called for read-write mounts or when a mount
 * switches from read-only to read-write.  vnodes may or may not be present.
 *
 * The stage1 code will have already calculated the correct FIFO range
 * for the nominal UNDO FIFO and stored it in the rootmap.  The extended
 * range for REDO is stored in hmp->recover_stage2_offset.
 */
int
hammer_recover_stage2(hammer_mount_t hmp, hammer_volume_t root_volume)
{
	hammer_blockmap_t rootmap;
	hammer_buffer_t buffer;
	hammer_off_t scan_offset;
	hammer_off_t oscan_offset;
	hammer_off_t bytes;
	hammer_off_t ext_bytes;
	hammer_fifo_any_t head;
	hammer_off_t first_offset;
	hammer_off_t last_offset;
	hammer_off_t ext_offset;
	struct hammer_rterm_rb_tree rterm_root;
	u_int32_t seqno;
	int error;
	int verbose = 0;
	int dorscan;

	/*
	 * Stage 2 can only be run on a RW mount, or when the mount is
	 * switched from RO to RW.
	 */
	KKASSERT(hmp->ronly == 0);
	RB_INIT(&rterm_root);

	if (hammer_skip_redo == 1)
		hvkprintf(root_volume, "recovery redo marked as optional\n");

	if (hammer_skip_redo == 2) {
		hvkprintf(root_volume, "recovery redo skipped.\n");
		return (0);
	}

	/*
	 * Examine the UNDO FIFO.  If it is empty the filesystem is clean
	 * and no action need be taken.
	 */
	rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
	first_offset = rootmap->first_offset;
	last_offset  = rootmap->next_offset;
	if (first_offset == last_offset) {
		KKASSERT((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0);
		return(0);
	}

	/*
	 * Stage2 must only be run once, and will not be run at all
	 * if Stage1 did not find a REDO_SYNC record.
	 */
	error = 0;
	buffer = NULL;

	if ((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0)
		goto done;
	hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_REQ;
	hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_RUN;
	ext_offset = hmp->recover_stage2_offset;
	if (ext_offset == 0) {
		hvkprintf(root_volume,
			"REDO stage specified but no REDO_SYNC "
			"offset, ignoring\n");
		goto done;
	}

	/*
	 * Calculate nominal UNDO range (this is not yet the extended
	 * range).
	 */
	if (last_offset >= first_offset) {
		bytes = last_offset - first_offset;
	} else {
		bytes = rootmap->alloc_offset - first_offset +
			(last_offset & HAMMER_OFF_LONG_MASK);
	}
	hvkprintf(root_volume,
		"recovery redo  %016jx-%016jx (%jd bytes)%s\n",
		(intmax_t)first_offset,
		(intmax_t)last_offset,
		(intmax_t)bytes,
		(hmp->ronly ? " (RO)" : " (RW)"));
	verbose = 1;
	if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
		hkprintf("Undo size is absurd, unable to mount\n");
		error = EIO;
		goto fatal;
	}

	/*
	 * Scan the REDOs backwards collecting REDO_TERM_* information.
	 * This information is only collected for the extended range,
	 * non-inclusive of any TERMs in the nominal UNDO range.
	 *
	 * If the stage2 extended range is inside the nominal undo range
	 * we have nothing to scan.
	 *
	 * This must fit in memory!
	 */
	if (first_offset < last_offset) {
		/*
		 * [      first_offset........last_offset      ]
		 */
		if (ext_offset < first_offset) {
			dorscan = 1;
			ext_bytes = first_offset - ext_offset;
		} else if (ext_offset > last_offset) {
			dorscan = 1;
			ext_bytes = (rootmap->alloc_offset - ext_offset) +
				    (first_offset & HAMMER_OFF_LONG_MASK);
		} else {
			ext_bytes = -(ext_offset - first_offset);
			dorscan = 0;
		}
	} else {
		/*
		 * [......last_offset         first_offset.....]
		 */
		if (ext_offset < last_offset) {
			ext_bytes = -((rootmap->alloc_offset - first_offset) +
				    (ext_offset & HAMMER_OFF_LONG_MASK));
			dorscan = 0;
		} else if (ext_offset > first_offset) {
			ext_bytes = -(ext_offset - first_offset);
			dorscan = 0;
		} else {
			ext_bytes = first_offset - ext_offset;
			dorscan = 1;
		}
	}
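
	/*
	 * Worked example (illustrative, plain byte indices, no wrap): with
	 * first_offset=300MB, last_offset=500MB, and a REDO_SYNC pointing
	 * at ext_offset=200MB, the extended region prepends 100MB to the
	 * nominal range, so dorscan=1 and ext_bytes=100MB.  Had ext_offset
	 * fallen inside [300MB,500MB], ext_bytes would be <= 0 and no
	 * reverse scan would be needed.
	 */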

	if (dorscan) {
		scan_offset = first_offset;
		hvkprintf(root_volume,
			"Find extended redo  %016jx, %jd extbytes\n",
			(intmax_t)ext_offset,
			(intmax_t)ext_bytes);
		seqno = hmp->recover_stage2_seqno - 1;
		for (;;) {
			head = hammer_recover_scan_rev(hmp, root_volume,
						       &scan_offset,
						       &error, &buffer);
			if (error)
				break;
			if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
				if (head->head.hdr_seq != seqno) {
					error = ERANGE;
					break;
				}
				error = hammer_recover_redo_rec(
						hmp, &rterm_root,
						scan_offset, &head->redo);
				--seqno;
			}
			if (scan_offset == ext_offset)
				break;
		}
		if (error) {
			hvkprintf(root_volume,
				"Find extended redo failed %d, "
				"unable to run REDO\n",
				error);
			goto done;
		}
	} else {
		hvkprintf(root_volume,
			"Embedded extended redo %016jx, %jd extbytes\n",
			(intmax_t)ext_offset,
			(intmax_t)ext_bytes);
	}

	/*
	 * Scan the REDO forwards through the entire extended range.
	 * Anything with a previously recorded matching TERM is discarded.
	 */
	scan_offset = ext_offset;
	bytes += ext_bytes;

	/*
	 * NOTE: when doing a forward scan the returned scan_offset is
	 *	 for the record following the returned record, so we
	 *	 must save the starting offset (oscan_offset) ourselves.
	 */
	while ((int64_t)bytes > 0) {
		KKASSERT(scan_offset != last_offset);

		oscan_offset = scan_offset;
		head = hammer_recover_scan_fwd(hmp, root_volume,
					       &scan_offset, &error, &buffer);
		if (error)
			break;

		error = hammer_recover_redo_run(hmp, &rterm_root,
						oscan_offset, &head->redo);
		if (error) {
			hvkprintf(root_volume,
				"REDO record at %016jx failed\n",
				(intmax_t)scan_offset - head->head.hdr_size);
			break;
		}
		bytes -= head->head.hdr_size;
	}
	KKASSERT(error || bytes == 0);

done:
	if (buffer) {
		hammer_rel_buffer(buffer, 0);
		buffer = NULL;
	}

	/*
	 * Cleanup rterm tree
	 */
	{
		hammer_rterm_t rterm;
		hammer_rterm_entry_t rte;

		while ((rterm = RB_ROOT(&rterm_root)) != NULL) {
			RB_REMOVE(hammer_rterm_rb_tree, &rterm_root, rterm);
			while ((rte = rterm->term_list) != NULL) {
				rterm->term_list = rte->next;
				kfree(rte, hmp->m_misc);
			}
			kfree(rterm, hmp->m_misc);
		}
	}

	/*
	 * Finish up flushing (or discarding) recovered buffers by executing
	 * a normal flush cycle.  Setting HMNT_UNDO_DIRTY bypasses degenerate
	 * case tests and forces the flush in order to update the FIFO indices.
	 *
	 * If a crash occurs during the flush the entire undo/redo will be
	 * re-run during recovery on the next mount.
	 */
	if (error == 0) {
		if (rootmap->first_offset != rootmap->next_offset)
			hmp->hflags |= HMNT_UNDO_DIRTY;
		hammer_flusher_sync(hmp);
	}
fatal:
	hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_RUN;
	if (verbose) {
		hvkprintf(root_volume, "End redo recovery\n");
	}

	if (error && hammer_skip_redo == 1)
		hvkprintf(root_volume,
			"recovery redo error %d, skipping.\n",
			error);

	return (hammer_skip_redo ? 0 : error);
}

/*
 * Scan backwards from *scan_offsetp, return the FIFO record prior to the
 * record at *scan_offsetp or NULL if an error occurred.
 *
 * On return *scan_offsetp will be the offset of the returned record.
 */
hammer_fifo_any_t
hammer_recover_scan_rev(hammer_mount_t hmp, hammer_volume_t root_volume,
			hammer_off_t *scan_offsetp,
			int *errorp, struct hammer_buffer **bufferp)
{
	hammer_off_t scan_offset;
	hammer_blockmap_t rootmap;
	hammer_fifo_any_t head;
	hammer_fifo_tail_t tail;

	rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
	scan_offset = *scan_offsetp;

	if (hammer_debug_general & 0x0080)
		hdkprintf("rev scan_offset %016jx\n", (intmax_t)scan_offset);
	if (scan_offset == HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0))
		scan_offset = rootmap->alloc_offset;
	if (scan_offset - sizeof(*tail) <
	    HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0)) {
		hvkprintf(root_volume,
			"UNDO record at %016jx FIFO underflow\n",
			(intmax_t)scan_offset);
		*errorp = EIO;
		return (NULL);
	}
	tail = hammer_bread(hmp, scan_offset - sizeof(*tail),
			    errorp, bufferp);
	if (*errorp) {
		hvkprintf(root_volume,
			"Unable to read UNDO TAIL at %016jx\n",
			(intmax_t)scan_offset - sizeof(*tail));
		return (NULL);
	}

	if (hammer_check_tail_signature(tail, scan_offset) != 0) {
		hvkprintf(root_volume,
			"Illegal UNDO TAIL signature at %016jx\n",
			(intmax_t)scan_offset - sizeof(*tail));
		*errorp = EIO;
		return (NULL);
	}
	head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
	*scan_offsetp = scan_offset - head->head.hdr_size;

	return (head);
}

/*
 * Scan forwards from *scan_offsetp, return the FIFO record or NULL if
 * an error occurred.
 *
 * On return *scan_offsetp will be the offset of the record following
 * the returned record.
 */
hammer_fifo_any_t
hammer_recover_scan_fwd(hammer_mount_t hmp, hammer_volume_t root_volume,
			hammer_off_t *scan_offsetp,
			int *errorp, struct hammer_buffer **bufferp)
{
	hammer_off_t scan_offset;
	hammer_blockmap_t rootmap;
	hammer_fifo_any_t head;

	rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
	scan_offset = *scan_offsetp;

	if (hammer_debug_general & 0x0080)
		hdkprintf("fwd scan_offset %016jx\n", (intmax_t)scan_offset);
	if (scan_offset == rootmap->alloc_offset)
		scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);

	head = hammer_bread(hmp, scan_offset, errorp, bufferp);
	if (*errorp) {
		hvkprintf(root_volume,
			"Unable to read UNDO HEAD at %016jx\n",
			(intmax_t)scan_offset);
		return (NULL);
	}

	if (hammer_check_head_signature(&head->head, scan_offset) != 0) {
		hvkprintf(root_volume,
			"Illegal UNDO HEAD signature at %016jx\n",
			(intmax_t)scan_offset);
		*errorp = EIO;
		return (NULL);
	}
	scan_offset += head->head.hdr_size;
	if (scan_offset == rootmap->alloc_offset)
		scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
	*scan_offsetp = scan_offset;

	return (head);
}

/*
 * Helper function for hammer_check_{head,tail}_signature().  Once the
 * head and tail have been located, this function validates the entire
 * FIFO record wrapper.
 */
static __inline
int
_hammer_check_signature(hammer_fifo_head_t head, hammer_fifo_tail_t tail,
			hammer_off_t beg_off)
{
	hammer_off_t end_off;
	u_int32_t crc;
	int bytes;

	/*
	 * Check signatures.  The tail signature is allowed to be the
	 * head signature only for 8-byte PADs.
	 */
	if (head->hdr_signature != HAMMER_HEAD_SIGNATURE) {
		hkprintf("FIFO record bad head signature %04x at %016jx\n",
			head->hdr_signature,
			(intmax_t)beg_off);
		return(2);
	}
	if (head->hdr_size < HAMMER_HEAD_ALIGN ||
	    (head->hdr_size & HAMMER_HEAD_ALIGN_MASK)) {
		hkprintf("FIFO record unaligned or bad size %04x at %016jx\n",
			head->hdr_size,
			(intmax_t)beg_off);
		return(2);
	}
	end_off = beg_off + head->hdr_size;

	if (head->hdr_type != HAMMER_HEAD_TYPE_PAD ||
	    (size_t)(end_off - beg_off) != sizeof(*tail)) {
		if (head->hdr_type != tail->tail_type) {
			hkprintf("FIFO record head/tail type mismatch "
				"%04x %04x at %016jx\n",
				head->hdr_type, tail->tail_type,
				(intmax_t)beg_off);
			return(2);
		}
		if (head->hdr_size != tail->tail_size) {
			hkprintf("FIFO record head/tail size mismatch "
				"%04x %04x at %016jx\n",
				head->hdr_size, tail->tail_size,
				(intmax_t)beg_off);
			return(2);
		}
		if (tail->tail_signature != HAMMER_TAIL_SIGNATURE) {
			hkprintf("FIFO record bad tail signature "
				"%04x at %016jx\n",
				tail->tail_signature,
				(intmax_t)beg_off);
			return(3);
		}
	}

	/*
	 * Non-PAD records must have a CRC and must be sized at
	 * least large enough to fit the head and tail.
	 */
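	/*
	 * (Sketch of the CRC coverage, as implied by the calculation
	 * below: the stored CRC appears to be the CRC of the header
	 * fields up to the CRC field itself, XORed with the CRC of
	 * everything following the header structure, i.e. the payload
	 * plus the tail.)
	 */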
	if (head->hdr_type != HAMMER_HEAD_TYPE_PAD) {
		crc = crc32(head, HAMMER_FIFO_HEAD_CRCOFF) ^
		      crc32(head + 1, head->hdr_size - sizeof(*head));
		if (head->hdr_crc != crc) {
			hkprintf("FIFO record CRC failed %08x %08x at %016jx\n",
				head->hdr_crc, crc,
				(intmax_t)beg_off);
			return(EIO);
		}
		if (head->hdr_size < sizeof(*head) + sizeof(*tail)) {
			hkprintf("FIFO record too small %04x at %016jx\n",
				head->hdr_size,
				(intmax_t)beg_off);
			return(EIO);
		}
	}

	/*
	 * Check the tail
	 */
	bytes = head->hdr_size;
	tail = (void *)((char *)head + bytes - sizeof(*tail));
	if (tail->tail_size != head->hdr_size) {
		hkprintf("Bad tail size %04x vs %04x at %016jx\n",
			tail->tail_size, head->hdr_size,
			(intmax_t)beg_off);
		return(EIO);
	}
	if (tail->tail_type != head->hdr_type) {
		hkprintf("Bad tail type %04x vs %04x at %016jx\n",
			tail->tail_type, head->hdr_type,
			(intmax_t)beg_off);
		return(EIO);
	}

	return(0);
}

/*
 * Check that the FIFO record is in-bounds given the head and the
 * hammer offset.
 *
 * Also checks that the head and tail structures agree with each other,
 * but does not check beyond the signature, type, and size.
 */
static int
hammer_check_head_signature(hammer_fifo_head_t head, hammer_off_t beg_off)
{
	hammer_fifo_tail_t tail;
	hammer_off_t end_off;

	/*
	 * head overlaps buffer boundary.  This could be a PAD so only
	 * check the minimum PAD size here.
	 */
	if (((beg_off + sizeof(*tail) - 1) ^ (beg_off)) & ~HAMMER_BUFMASK64)
		return(1);
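
	/*
	 * (Illustrative note: the XOR test detects a boundary crossing
	 * because two offsets fall in the same HAMMER_BUFSIZE buffer iff
	 * they agree in all bits above the buffer mask.  E.g. with 16KB
	 * buffers, 0x3ff0 ^ 0x3fff == 0x000f, which masks to 0 (same
	 * buffer), while 0x3fff ^ 0x4000 == 0x7fff, which intersects
	 * ~HAMMER_BUFMASK64 (crossing).)
	 */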

	/*
	 * Calculate the ending offset and make sure the record does
	 * not cross a buffer boundary.
	 */
	end_off = beg_off + head->hdr_size;
	if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
		return(1);
	tail = (void *)((char *)head + head->hdr_size - sizeof(*tail));
	return (_hammer_check_signature(head, tail, beg_off));
}

/*
 * Check that the FIFO record is in-bounds given the tail and the
 * hammer offset.  The offset is pointing at the ending boundary of the
 * record.
 *
 * Also checks that the head and tail structures agree with each other,
 * but does not check beyond the signature, type, and size.
 */
static int
hammer_check_tail_signature(hammer_fifo_tail_t tail, hammer_off_t end_off)
{
	hammer_fifo_head_t head;
	hammer_off_t beg_off;

	/*
	 * tail overlaps buffer boundary
	 */
	if (((end_off - sizeof(*tail)) ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
		return(1);

	/*
	 * Calculate the beginning offset and make sure the record does
	 * not cross a buffer boundary.
	 */
	beg_off = end_off - tail->tail_size;
	if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
		return(1);
	head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
	return (_hammer_check_signature(head, tail, beg_off));
}

static int
hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
		    hammer_fifo_undo_t undo)
{
	hammer_volume_t volume;
	hammer_buffer_t buffer;
	hammer_off_t buf_offset;
	int zone;
	int error;
	int vol_no;
	int bytes;
	u_int32_t offset;

	/*
	 * Only process UNDO records.  Flag if we find other records to
	 * optimize stage2 recovery.
	 */
	if (undo->head.hdr_type != HAMMER_HEAD_TYPE_UNDO)
		return(0);

	/*
	 * Validate the UNDO record.
	 */
	bytes = undo->head.hdr_size - sizeof(*undo) -
		sizeof(struct hammer_fifo_tail);
	if (bytes < 0 || undo->undo_data_bytes < 0 ||
	    undo->undo_data_bytes > bytes) {
		hkprintf("Corrupt UNDO record, undo_data_bytes %d/%d\n",
			undo->undo_data_bytes, bytes);
		return(EIO);
	}

	bytes = undo->undo_data_bytes;

	/*
	 * The undo offset may only be a zone-1 or zone-2 offset.
	 *
	 * Currently we only support a zone-1 offset representing the
	 * volume header.
	 */
	zone = HAMMER_ZONE_DECODE(undo->undo_offset);
	offset = undo->undo_offset & HAMMER_BUFMASK;

	if (offset + bytes > HAMMER_BUFSIZE) {
		hkprintf("Corrupt UNDO record, bad offset\n");
		return (EIO);
	}

	switch(zone) {
	case HAMMER_ZONE_RAW_VOLUME_INDEX:
		vol_no = HAMMER_VOL_DECODE(undo->undo_offset);
		volume = hammer_get_volume(hmp, vol_no, &error);
		if (volume == NULL) {
			hkprintf("UNDO record, cannot access volume %d\n",
				vol_no);
			break;
		}
		hammer_modify_volume_noundo(NULL, volume);
		hammer_recover_copy_undo(undo->undo_offset,
					 (char *)(undo + 1),
					 (char *)volume->ondisk + offset,
					 bytes);
		hammer_modify_volume_done(volume);

		/*
		 * Multiple modifications may be made to the same buffer.
		 * Also, the volume header cannot be written out until
		 * everything else has been flushed.  This also
		 * covers the read-only case by preventing the kernel from
		 * flushing the buffer.
		 */
		if (volume->io.recovered == 0)
			volume->io.recovered = 1;
		else
			hammer_rel_volume(volume, 0);
		break;
	case HAMMER_ZONE_RAW_BUFFER_INDEX:
		buf_offset = undo->undo_offset & ~HAMMER_BUFMASK64;
		buffer = hammer_get_buffer(hmp, buf_offset, HAMMER_BUFSIZE,
					   0, &error);
		if (buffer == NULL) {
			hkprintf("UNDO record, cannot access buffer %016jx\n",
				(intmax_t)undo->undo_offset);
			break;
		}
		hammer_modify_buffer_noundo(NULL, buffer);
		hammer_recover_copy_undo(undo->undo_offset,
					 (char *)(undo + 1),
					 (char *)buffer->ondisk + offset,
					 bytes);
		hammer_modify_buffer_done(buffer);

		/*
		 * Multiple modifications may be made to the same buffer,
		 * improve performance by delaying the flush.  This also
		 * covers the read-only case by preventing the kernel from
		 * flushing the buffer.
		 */
		if (buffer->io.recovered == 0)
			buffer->io.recovered = 1;
		else
			hammer_rel_buffer(buffer, 0);
		break;
	default:
		hkprintf("Corrupt UNDO record\n");
		error = EIO;
	}
	return (error);
}

static void
hammer_recover_copy_undo(hammer_off_t undo_offset,
			 char *src, char *dst, int bytes)
{
	if (hammer_debug_general & 0x0080) {
		hdkprintf("UNDO %016jx: %d\n",
			(intmax_t)undo_offset, bytes);
	}
#if 0
	hkprintf("UNDO %016jx:", (intmax_t)undo_offset);
	hammer_recover_debug_dump(22, dst, bytes);
	kprintf("%22s", "to:");
	hammer_recover_debug_dump(22, src, bytes);
#endif
	bcopy(src, dst, bytes);
}

/*
 * Record HAMMER_REDO_TERM_WRITE and HAMMER_REDO_TERM_TRUNC operations
 * during the backwards scan of the extended UNDO/REDO FIFO.  This scan
 * does not include the nominal UNDO range, just the extended range.
 */
int
hammer_recover_redo_rec(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root,
			hammer_off_t scan_offset, hammer_fifo_redo_t redo)
{
	hammer_rterm_t rterm;
	hammer_rterm_t nrterm;
	hammer_rterm_entry_t rte;

	if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO)
		return(0);
	if (redo->redo_flags != HAMMER_REDO_TERM_WRITE &&
	    redo->redo_flags != HAMMER_REDO_TERM_TRUNC) {
		return(0);
	}

	nrterm = kmalloc(sizeof(*nrterm), hmp->m_misc, M_WAITOK|M_ZERO);
	nrterm->redo_objid = redo->redo_objid;
	nrterm->redo_localization = redo->redo_localization;
	nrterm->redo_flags = redo->redo_flags;
	nrterm->redo_offset = redo->redo_offset;

	rterm = RB_INSERT(hammer_rterm_rb_tree, root, nrterm);
	if (rterm)
		kfree(nrterm, hmp->m_misc);
	else
		rterm = nrterm;

	if (bootverbose) {
		hkprintf("record record %016jx objid %016jx "
			"offset %016jx flags %08x\n",
			(intmax_t)scan_offset,
			(intmax_t)redo->redo_objid,
			(intmax_t)redo->redo_offset,
			(int)redo->redo_flags);
	}

	/*
	 * Scan in reverse order, rte prepended, so the rte list will be
	 * in forward order.
	 */
	rte = kmalloc(sizeof(*rte), hmp->m_misc, M_WAITOK|M_ZERO);
	rte->fifo_offset = scan_offset;
	rte->next = rterm->term_list;
	rterm->term_list = rte;

	return(0);
}

/*
 * Execute HAMMER_REDO_WRITE and HAMMER_REDO_TRUNC operations during
 * the forwards scan of the entire extended UNDO/REDO FIFO range.
 *
 * Records matching previously recorded TERMs have already been committed
 * and are ignored.
 */
int
hammer_recover_redo_run(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root,
			hammer_off_t scan_offset, hammer_fifo_redo_t redo)
{
	struct hammer_rterm rtval;
	hammer_rterm_t rterm;
	hammer_rterm_entry_t rte;

	if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO)
		return(0);

	switch(redo->redo_flags) {
	case HAMMER_REDO_WRITE:
	case HAMMER_REDO_TRUNC:
		/*
		 * We hit a REDO request.  The REDO request is only executed
		 * if there is no matching TERM.
		 */
		bzero(&rtval, sizeof(rtval));
		rtval.redo_objid = redo->redo_objid;
		rtval.redo_localization = redo->redo_localization;
		rtval.redo_offset = redo->redo_offset;
		rtval.redo_flags = (redo->redo_flags == HAMMER_REDO_WRITE) ?
				   HAMMER_REDO_TERM_WRITE :
				   HAMMER_REDO_TERM_TRUNC;

		rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval);
		if (rterm) {
			if (bootverbose) {
				hkprintf("ignore record %016jx objid %016jx "
					"offset %016jx flags %08x\n",
					(intmax_t)scan_offset,
					(intmax_t)redo->redo_objid,
					(intmax_t)redo->redo_offset,
					(int)redo->redo_flags);
			}
			break;
		}
		if (bootverbose) {
			hkprintf("run    record %016jx objid %016jx "
				"offset %016jx flags %08x\n",
				(intmax_t)scan_offset,
				(intmax_t)redo->redo_objid,
				(intmax_t)redo->redo_offset,
				(int)redo->redo_flags);
		}

		/*
		 * Redo stage2 can access a live filesystem, acquire the
		 * vnode.
		 */
		hammer_recover_redo_exec(hmp, redo);
		break;
	case HAMMER_REDO_TERM_WRITE:
	case HAMMER_REDO_TERM_TRUNC:
		/*
		 * As we encounter TERMs in the forward scan we remove
		 * them.  Once the forward scan hits the nominal undo range
		 * there will be no more recorded TERMs.
		 */
		bzero(&rtval, sizeof(rtval));
		rtval.redo_objid = redo->redo_objid;
		rtval.redo_localization = redo->redo_localization;
		rtval.redo_flags = redo->redo_flags;
		rtval.redo_offset = redo->redo_offset;

		rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval);
		if (rterm) {
			if ((rte = rterm->term_list) != NULL) {
				KKASSERT(rte->fifo_offset == scan_offset);
				rterm->term_list = rte->next;
				kfree(rte, hmp->m_misc);
			}
		}
		break;
	}
	return(0);
}

static void
hammer_recover_redo_exec(hammer_mount_t hmp, hammer_fifo_redo_t redo)
{
	struct hammer_transaction trans;
	struct vattr va;
	struct hammer_inode *ip;
	struct vnode *vp = NULL;
	int error;

	hammer_start_transaction(&trans, hmp);

	ip = hammer_get_inode(&trans, NULL, redo->redo_objid,
			      HAMMER_MAX_TID, redo->redo_localization,
			      0, &error);
	if (ip == NULL) {
		hkprintf("unable to find objid %016jx:%08x\n",
			(intmax_t)redo->redo_objid, redo->redo_localization);
		goto done2;
	}
	error = hammer_get_vnode(ip, &vp);
	if (error) {
		hkprintf("unable to acquire vnode for %016jx:%08x\n",
			(intmax_t)redo->redo_objid, redo->redo_localization);
		goto done1;
	}

	switch(redo->redo_flags) {
	case HAMMER_REDO_WRITE:
		error = VOP_OPEN(vp, FREAD|FWRITE, proc0.p_ucred, NULL);
		if (error) {
			hkprintf("vn_rdwr open %016jx:%08x returned %d\n",
				(intmax_t)redo->redo_objid,
				redo->redo_localization, error);
			break;
		}
		vn_unlock(vp);
		error = vn_rdwr(UIO_WRITE, vp, (void *)(redo + 1),
				redo->redo_data_bytes,
				redo->redo_offset, UIO_SYSSPACE,
				0, proc0.p_ucred, NULL);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		if (error) {
			hkprintf("write %016jx:%08x returned %d\n",
				(intmax_t)redo->redo_objid,
				redo->redo_localization, error);
		}
		VOP_CLOSE(vp, FREAD|FWRITE, NULL);
		break;
	case HAMMER_REDO_TRUNC:
		VATTR_NULL(&va);
		va.va_size = redo->redo_offset;
		error = VOP_SETATTR(vp, &va, proc0.p_ucred);
		if (error) {
			hkprintf("setattr offset %016jx error %d\n",
				(intmax_t)redo->redo_offset, error);
		}
		break;
	}
	vput(vp);
done1:
	hammer_rel_inode(ip, 0);
done2:
	hammer_done_transaction(&trans);
}

/*
 * RB tree compare function.  Note that REDO_TERM_TRUNC ops ignore
 * the offset.
 *
 * WRITE@0 TERM@0 WRITE@0 .... (no TERM@0) etc.
 */
static int
hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2)
{
	if (rt1->redo_objid < rt2->redo_objid)
		return(-1);
	if (rt1->redo_objid > rt2->redo_objid)
		return(1);
	if (rt1->redo_localization < rt2->redo_localization)
		return(-1);
	if (rt1->redo_localization > rt2->redo_localization)
		return(1);
	if (rt1->redo_flags < rt2->redo_flags)
		return(-1);
	if (rt1->redo_flags > rt2->redo_flags)
		return(1);
	if (rt1->redo_flags != HAMMER_REDO_TERM_TRUNC) {
		if (rt1->redo_offset < rt2->redo_offset)
			return(-1);
		if (rt1->redo_offset > rt2->redo_offset)
			return(1);
	}
	return(0);
}

#if 0

static void
hammer_recover_debug_dump(int w, char *buf, int bytes)
{
	int i;

	for (i = 0; i < bytes; ++i) {
		if (i && (i & 15) == 0)
			kprintf("\n%*.*s", w, w, "");
		kprintf(" %02x", (unsigned char)buf[i]);
	}
	kprintf("\n");
}

#endif

/*
 * Flush recovered buffers from recovery operations.  The call to this
 * routine may be delayed if a read-only mount was made and then later
 * upgraded to read-write.  This routine is also called when unmounting
 * a read-only mount to clean out recovered (dirty) buffers which we
 * couldn't flush (because the mount is read-only).
 *
 * The volume header is always written last.  The UNDO FIFO will be forced
 * to zero-length by setting next_offset to first_offset.  This leaves the
 * (now stale) UNDO information used to recover the disk available for
 * forensic analysis.
 *
 * final is typically 0 or 1.  The volume header is only written if final
 * is 1.  If final is -1 the recovered buffers are discarded instead of
 * written and root_volume can also be passed as NULL in that case.
 */
static int hammer_recover_flush_volume_callback(hammer_volume_t, void *);
static int hammer_recover_flush_buffer_callback(hammer_buffer_t, void *);

void
hammer_recover_flush_buffers(hammer_mount_t hmp, hammer_volume_t root_volume,
			     int final)
{
	/*
	 * Flush the buffers out asynchronously, wait for all the I/O to
	 * complete, then do it again to destroy the buffer cache buffer
	 * so it doesn't alias something later on.
	 */
	RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
		hammer_recover_flush_buffer_callback, &final);
	hammer_io_wait_all(hmp, "hmrrcw", 1);
	RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
		hammer_recover_flush_buffer_callback, &final);

	/*
	 * Flush all volume headers except the root volume.  If final < 0
	 * we discard all volume headers including the root volume.
	 */
	if (final >= 0) {
		RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
			hammer_recover_flush_volume_callback, root_volume);
	} else {
		RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
			hammer_recover_flush_volume_callback, NULL);
	}

	/*
	 * Finalize the root volume header.
	 *
	 * No interlock is needed, volume buffers are not
	 * messed with by bioops.
	 */
	if (root_volume && root_volume->io.recovered && final > 0) {
		hammer_io_wait_all(hmp, "hmrflx", 1);
		root_volume->io.recovered = 0;
		hammer_io_flush(&root_volume->io, 0);
		hammer_rel_volume(root_volume, 0);
		hammer_io_wait_all(hmp, "hmrfly", 1);
	}
}

/*
 * Callback to flush volume headers.  If discarding, data will be NULL and
 * all volume headers (including the root volume) will be discarded.
 * Otherwise data is the root_volume and we flush all volume headers
 * EXCEPT the root_volume.
 *
 * Clear any I/O error or modified condition when discarding buffers to
 * clean up the reference count, otherwise the buffer may have extra refs
 * on it.
 */
static
int
hammer_recover_flush_volume_callback(hammer_volume_t volume, void *data)
{
	hammer_volume_t root_volume = data;

	if (volume->io.recovered && volume != root_volume) {
		volume->io.recovered = 0;
		if (root_volume != NULL) {
			/*
			 * No interlock is needed, volume buffers are not
			 * messed with by bioops.
			 */
			hammer_io_flush(&volume->io, 0);
		} else {
			hammer_io_clear_error(&volume->io);
			hammer_io_clear_modify(&volume->io, 1);
		}
		hammer_rel_volume(volume, 0);
	}
	return(0);
}

/*
 * Flush or discard recovered I/O buffers.
 *
 * Clear any I/O error or modified condition when discarding buffers to
 * clean up the reference count, otherwise the buffer may have extra refs
 * on it.
 */
static
int
hammer_recover_flush_buffer_callback(hammer_buffer_t buffer, void *data)
{
	int final = *(int *)data;
	int flush;

	if (buffer->io.recovered) {
		buffer->io.recovered = 0;
		buffer->io.reclaim = 1;
		if (final < 0) {
			hammer_io_clear_error(&buffer->io);
			hammer_io_clear_modify(&buffer->io, 1);
		} else {
			hammer_io_write_interlock(&buffer->io);
			hammer_io_flush(&buffer->io, 0);
			hammer_io_done_interlock(&buffer->io);
		}
		hammer_rel_buffer(buffer, 0);
	} else {
		flush = hammer_ref_interlock(&buffer->io.lock);
		if (flush)
			atomic_add_int(&hammer_count_refedbufs, 1);

		if (final < 0) {
			hammer_io_clear_error(&buffer->io);
			hammer_io_clear_modify(&buffer->io, 1);
		}
		KKASSERT(hammer_oneref(&buffer->io.lock));
		buffer->io.reclaim = 1;
		hammer_rel_buffer(buffer, flush);
	}
	return(0);
}
1545