xref: /dragonfly/sys/vfs/hammer/hammer_recover.c (revision 52f9f0d9)
1 /*
2  * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 /*
36  * UNDO ALGORITHM:
37  *
38  *	The UNDO algorithm is trivial.  The nominal UNDO range in the
39  *	FIFO is determined by taking the first/next offset stored in
40  *	the volume header.  The next offset may not be correct since
41  *	UNDO flushes are not required to flush the volume header, so
42  *	the code also scans forward until it finds a discontinuous
43  *	sequence number.
44  *
45  *	The UNDOs are then scanned and executed in reverse order.  These
46  *	UNDOs are effectively just data restorations based on HAMMER offsets.
47  *
48  * REDO ALGORITHM:
49  *
50  *	REDO records are laid down in the UNDO/REDO FIFO for nominal
51  *	writes, truncations, and file extension ops.  On a per-inode
52  *	basis two types of REDO records are generated, REDO_WRITE
53  *	and REDO_TRUNC.
54  *
55  *	Essentially the recovery block will contain UNDO records backing
56  *	out partial operations and REDO records to regenerate, during
57  *	recovery, those operations whose completion the filesystem guaranteed.
58  *
59  *	REDO generation is optional, and can also be started and then
60  *	later stopped due to excessive write()s in between fsyncs, or not
61  *	started at all.  Because of this the recovery code must determine
62  *	when REDOs are valid and when they are not.  Additional records are
63  *	generated to help figure it out.
64  *
65  *	The REDO_TERM_WRITE and REDO_TERM_TRUNC records are generated
66  *	during a flush cycle indicating which records the flush cycle
67  *	has synched meta-data for, and HAMMER_REDO_SYNC is generated in
68  *	each flush cycle to indicate how far back in the UNDO/REDO FIFO
69  *	the recovery code must go to find the earliest applicable REDO
70  *	record.  Applicable REDO records can be far outside the nominal
71  *	UNDO recovery range, for example if a write() lays down a REDO but
72  *	the related file is not flushed for several cycles.
73  *
74  *	The SYNC reference is to a point prior to the nominal UNDO FIFO
75  *	range, creating an extended REDO range which must be scanned.
76  *
77  *	Any REDO_WRITE/REDO_TRUNC encountered within the extended range
78  *	which have no matching REDO_TERM_WRITE/REDO_TERM_TRUNC records
79  *	prior to the start of the nominal UNDO range are applicable.
80  *	That is, any REDO_TERM_* records in the extended range but not in
81  *	the nominal undo range will mask any redo operations for prior REDO
82  *	records.  This is necessary because once the TERM is laid down
83  *	follow-up operations may make additional changes to the related
84  *	records but not necessarily record them as REDOs (because REDOs are
85  *	optional).
86  *
87  *	REDO_TERM_WRITE/REDO_TERM_TRUNC records in the nominal UNDO range
88  *	must be ignored since they represent meta-data flushes which are
89  *	undone by the UNDOs in that nominal UNDO range by the recovery
90  *	code.  Only REDO_TERM_* records in the extended range but not
91  *	in the nominal undo range are applicable.
92  *
93  *	The REDO_SYNC record itself always exists in the nominal UNDO range
94  *	(this is how the extended range is determined).  For recovery
95  *	purposes the most recent REDO_SYNC record is always used if several
96  *	are found.
97  *
98  * CRASHES DURING UNDO/REDO
99  *
100  *	A crash during the UNDO phase requires no additional effort.  The
101  *	UNDOs will simply be re-run again.  The state of the UNDO/REDO fifo
102  *	remains unchanged and has no re-crash issues.
103  *
104  *	A crash during the REDO phase is more complex because the REDOs
105  *	run normal filesystem ops and generate additional UNDO/REDO records.
106  *	REDO is disabled during REDO recovery and any SYNC records generated
107  *	by flushes during REDO recovery must continue to reference the
108  *	original extended range.
109  *
110  *	If multiple crashes occur and the UNDO/REDO FIFO wraps, REDO recovery
111  *	may become impossible.  This is detected when the start of the
112  *	extended range fails to have monotonically increasing sequence
113  *	numbers leading into the nominal undo range.
114  */
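/*
 * Illustration of the ranges described above (one possible state; the
 * FIFO is a ring, so either range may wrap past the end of the zone):
 *
 *	[ ...ext_offset...........first_offset...........last_offset... ]
 *	       |<--- TERM collection --->|<--- nominal UNDO range --->|
 *	       |<--------------- extended REDO scan ----------------->|
 *
 * The nominal UNDO range is [first_offset, last_offset).  The most
 * recent REDO_SYNC record, which lives inside the nominal range,
 * supplies ext_offset, the point the stage 2 scans start from.
 */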
115 
116 #include "hammer.h"
117 
118 /*
119  * Each rterm entry has a list of fifo offsets indicating termination
120  * points.  These are stripped as the scan progresses.
121  */
122 typedef struct hammer_rterm_entry {
123 	struct hammer_rterm_entry *next;
124 	hammer_off_t		fifo_offset;
125 } *hammer_rterm_entry_t;
126 
127 /*
128  * rterm entries sorted in RB tree are indexed by objid, flags, and offset.
129  * TRUNC entries ignore the offset.
130  */
131 typedef struct hammer_rterm {
132 	RB_ENTRY(hammer_rterm)	rb_node;
133 	int64_t			redo_objid;
134 	u_int32_t		redo_localization;
135 	u_int32_t		redo_flags;
136 	hammer_off_t		redo_offset;
137 	hammer_rterm_entry_t	term_list;
138 } *hammer_rterm_t;
139 
140 static int hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2);
141 struct hammer_rterm_rb_tree;
142 RB_HEAD(hammer_rterm_rb_tree, hammer_rterm);
143 RB_PROTOTYPE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp);
144 
145 static int hammer_check_tail_signature(hammer_fifo_tail_t tail,
146 			hammer_off_t end_off);
147 static int hammer_check_head_signature(hammer_fifo_head_t head,
148 			hammer_off_t beg_off);
149 static void hammer_recover_copy_undo(hammer_off_t undo_offset,
150 			char *src, char *dst, int bytes);
151 static hammer_fifo_any_t hammer_recover_scan_fwd(hammer_mount_t hmp,
152 			hammer_volume_t root_volume,
153 			hammer_off_t *scan_offsetp,
154 			int *errorp, struct hammer_buffer **bufferp);
155 static hammer_fifo_any_t hammer_recover_scan_rev(hammer_mount_t hmp,
156 			hammer_volume_t root_volume,
157 			hammer_off_t *scan_offsetp,
158 			int *errorp, struct hammer_buffer **bufferp);
159 #if 0
160 static void hammer_recover_debug_dump(int w, char *buf, int bytes);
161 #endif
162 static int hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
163 			hammer_fifo_undo_t undo);
164 static int hammer_recover_redo_rec(hammer_mount_t hmp,
165 			struct hammer_rterm_rb_tree *root,
166 			hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo);
167 static int hammer_recover_redo_run(hammer_mount_t hmp,
168 			struct hammer_rterm_rb_tree *root,
169 			hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo);
170 static void hammer_recover_redo_exec(hammer_mount_t hmp,
171 			hammer_fifo_redo_t redo);
172 
173 RB_GENERATE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp);
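/*
 * Illustrative sketch (compiled out): how the rterm tree is consulted.
 * hammer_recover_redo_rec() records a TERM seen during the backwards
 * scan; hammer_recover_redo_run() later builds a key like this during
 * the forward scan and strips one fifo offset per match.  The function
 * name is hypothetical; the real lookups are in the functions below.
 */
#if 0
static int
hammer_rterm_lookup_example(struct hammer_rterm_rb_tree *root,
			    hammer_fifo_redo_t redo)
{
	struct hammer_rterm key;
	hammer_rterm_t rterm;

	bzero(&key, sizeof(key));
	key.redo_objid = redo->redo_objid;
	key.redo_localization = redo->redo_localization;
	key.redo_flags = HAMMER_REDO_TERM_WRITE;	/* or _TRUNC */
	key.redo_offset = redo->redo_offset;		/* ignored for TRUNC */

	rterm = RB_FIND(hammer_rterm_rb_tree, root, &key);
	return(rterm != NULL);	/* non-zero: the REDO was terminated */
}
#endif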
174 
175 /*
176  * Recover filesystem meta-data on mount.  This procedure figures out the
177  * UNDO FIFO range and runs the UNDOs backwards.  The FIFO pointers are not
178  * resynchronized by this procedure.
179  *
180  * This procedure is run near the beginning of the mount sequence, before
181  * any B-Tree or high-level accesses are enabled, and is responsible for
182  * restoring the meta-data to a consistent state.  High level HAMMER data
183  * structures (such as the B-Tree) cannot be accessed here.
184  *
185  * NOTE: No information from the root volume has been cached in the
186  *	 hammer_mount structure yet, so we need to access the root volume's
187  *	 buffer directly.
190  */
191 int
192 hammer_recover_stage1(hammer_mount_t hmp, hammer_volume_t root_volume)
193 {
194 	hammer_blockmap_t rootmap;
195 	hammer_buffer_t buffer;
196 	hammer_off_t scan_offset;
197 	hammer_off_t scan_offset_save;
198 	hammer_off_t bytes;
199 	hammer_fifo_any_t head;
200 	hammer_off_t first_offset;
201 	hammer_off_t last_offset;
202 	u_int32_t seqno;
203 	int error;
204 	int degenerate_case = 0;
205 
206 	/*
207 	 * Examine the UNDO FIFO indices in the volume header.
208 	 */
209 	rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
210 	first_offset = rootmap->first_offset;
211 	last_offset  = rootmap->next_offset;
212 	buffer = NULL;
213 	error = 0;
214 
215 	hmp->recover_stage2_offset = 0;
216 
217 	if (first_offset > rootmap->alloc_offset ||
218 	    last_offset > rootmap->alloc_offset) {
219 		kprintf("HAMMER(%s) Illegal UNDO FIFO index range "
220 			"%016jx, %016jx limit %016jx\n",
221 			root_volume->ondisk->vol_name,
222 			(intmax_t)first_offset,
223 			(intmax_t)last_offset,
224 			(intmax_t)rootmap->alloc_offset);
225 		error = EIO;
226 		goto done;
227 	}
228 
229 	/*
230 	 * In HAMMER version 4+ filesystems the volume header does NOT
231 	 * contain definitive UNDO FIFO state.  In particular, the
232 	 * rootmap->next_offset may not be indexed completely to the
233 	 * end of the active UNDO FIFO.
234 	 */
235 	if (hmp->version >= HAMMER_VOL_VERSION_FOUR) {
236 		/*
237 		 * To find the definitive range we must first scan backwards
238 		 * from first_offset to locate the first real record and
239 		 * extract the sequence number from it.  This record is not
240 		 * part of the active undo space.
241 		 */
242 		scan_offset = first_offset;
243 		seqno = 0;
244 
245 		for (;;) {
246 			head = hammer_recover_scan_rev(hmp, root_volume,
247 						       &scan_offset,
248 						       &error, &buffer);
249 			if (error)
250 				break;
251 			if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
252 				seqno = head->head.hdr_seq;
253 				break;
254 			}
255 		}
256 		if (error) {
257 			kprintf("HAMMER(%s) recovery failure "
258 				"during seqno backscan\n",
259 				root_volume->ondisk->vol_name);
260 			goto done;
261 		}
262 
263 		/*
264 		 * Scan forwards from first_offset and (seqno+1) looking
265 		 * for a sequence space discontinuity.  This denotes the
266 		 * end of the active FIFO area.
267 		 *
268 		 * NOTE: For the case where the FIFO is empty the very first
269 		 *	 record we find will be discontinuous.
270 		 *
271 		 * NOTE: Do not include trailing PADs in the scan range,
272 		 *	 and remember that the scan_offset returned by a
273 		 *	 fwd iteration points to the end of the returned
274 		 *	 record.
275 		 */
276 		kprintf("HAMMER(%s) recovery check seqno=%08x\n",
277 			root_volume->ondisk->vol_name,
278 			seqno);
279 
280 		scan_offset = first_offset;
281 		scan_offset_save = scan_offset;
282 		++seqno;
283 		hmp->recover_stage2_seqno = seqno;
284 
285 		for (;;) {
286 			head = hammer_recover_scan_fwd(hmp, root_volume,
287 						       &scan_offset,
288 						       &error, &buffer);
289 			if (error)
290 				break;
291 			if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
292 				if (seqno != head->head.hdr_seq) {
293 					scan_offset = scan_offset_save;
294 					break;
295 				}
296 				scan_offset_save = scan_offset;
297 				++seqno;
298 			}
299 
300 #if 0
301 			/*
302 			 * If the forward scan is grossly ahead of last_offset
303 			 * then something is wrong.  last_offset is supposed
304 			 * to be flushed out
305 			 */
306 			if (last_offset >= scan_offset) {
307 				bytes = last_offset - scan_offset;
308 			} else {
309 				bytes = rootmap->alloc_offset - scan_offset +
310 					(last_offset & HAMMER_OFF_LONG_MASK);
311 			}
312 			if (bytes >
313 			    (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK) *
314 			    4 / 5) {
315 				kprintf("HAMMER(%s) recovery forward scan is "
316 					"grossly beyond the last_offset in "
317 					"the volume header, this can't be "
318 					"right.\n",
319 					root_volume->ondisk->vol_name);
320 				error = EIO;
321 				break;
322 			}
323 #endif
324 		}
325 
326 		/*
327 		 * Store the seqno.  This will be the next seqno we lay down
328 		 * when generating new UNDOs.
329 		 */
330 		hmp->undo_seqno = seqno;
331 		if (error) {
332 			kprintf("HAMMER(%s) recovery failure "
333 				"during seqno fwdscan\n",
334 				root_volume->ondisk->vol_name);
335 			goto done;
336 		}
337 		last_offset = scan_offset;
338 		kprintf("HAMMER(%s) recovery range %016jx-%016jx\n"
339 			"HAMMER(%s) recovery nexto %016jx endseqno=%08x\n",
340 			root_volume->ondisk->vol_name,
341 			(intmax_t)first_offset,
342 			(intmax_t)last_offset,
343 			root_volume->ondisk->vol_name,
344 			(intmax_t)rootmap->next_offset,
345 			seqno);
346 	}
347 
348 	/*
349 	 * Calculate the size of the active portion of the FIFO.  If the
350 	 * FIFO is empty the filesystem is clean and no further action is
351 	 * needed.
352 	 */
353 	if (last_offset >= first_offset) {
354 		bytes = last_offset - first_offset;
355 	} else {
356 		bytes = rootmap->alloc_offset - first_offset +
357 			(last_offset & HAMMER_OFF_LONG_MASK);
358 	}
359 	if (bytes == 0) {
360 		degenerate_case = 1;
361 		error = 0;
362 		goto done;
363 	}
364 
365 	kprintf("HAMMER(%s) recovery undo  %016jx-%016jx (%jd bytes)%s\n",
366 		root_volume->ondisk->vol_name,
367 		(intmax_t)first_offset,
368 		(intmax_t)last_offset,
369 		(intmax_t)bytes,
370 		(hmp->ronly ? " (RO)" : " (RW)"));
371 	if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
372 		kprintf("Undo size is absurd, unable to mount\n");
373 		error = EIO;
374 		goto done;
375 	}
376 
377 	/*
378 	 * Scan the UNDOs backwards.
379 	 */
380 	scan_offset = last_offset;
381 
382 	while ((int64_t)bytes > 0) {
383 		KKASSERT(scan_offset != first_offset);
384 		head = hammer_recover_scan_rev(hmp, root_volume,
385 					       &scan_offset, &error, &buffer);
386 		if (error)
387 			break;
388 
389 		/*
390 		 * Normal UNDO
391 		 */
392 		error = hammer_recover_undo(hmp, root_volume, &head->undo);
393 		if (error) {
394 			kprintf("HAMMER(%s) UNDO record at %016jx failed\n",
395 				root_volume->ondisk->vol_name,
396 				(intmax_t)scan_offset - head->head.hdr_size);
397 			break;
398 		}
399 
400 		/*
401 		 * The first REDO_SYNC record encountered (scanning backwards)
402 		 * enables REDO processing.
403 		 */
404 		if (head->head.hdr_type == HAMMER_HEAD_TYPE_REDO &&
405 		    head->redo.redo_flags == HAMMER_REDO_SYNC) {
406 			if (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) {
407 				kprintf("HAMMER(%s) Ignoring extra REDO_SYNC "
408 					"records in UNDO/REDO FIFO.\n",
409 					root_volume->ondisk->vol_name
410 				);
411 			} else {
412 				hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_REQ;
413 				hmp->recover_stage2_offset =
414 					head->redo.redo_offset;
415 				kprintf("HAMMER(%s) Found REDO_SYNC %016jx\n",
416 					root_volume->ondisk->vol_name,
417 					(intmax_t)head->redo.redo_offset);
418 			}
419 		}
420 
421 		bytes -= head->head.hdr_size;
422 
423 		/*
424 		 * If too many dirty buffers have built up we have to flush'm
425 		 * out.  As long as we do not flush out the volume header
426 		 * a crash here should not cause any problems.
427 		 *
428 		 * buffer must be released so the flush can assert that
429 		 * all buffers are idle.
430 		 */
431 		if (hammer_flusher_meta_limit(hmp)) {
432 			if (buffer) {
433 				hammer_rel_buffer(buffer, 0);
434 				buffer = NULL;
435 			}
436 			if (hmp->ronly == 0) {
437 				hammer_recover_flush_buffers(hmp, root_volume,
438 							     0);
439 				kprintf("HAMMER(%s) Continuing recovery\n",
440 					root_volume->ondisk->vol_name);
441 			} else {
442 				kprintf("HAMMER(%s) Recovery failure: Insufficient buffer cache to hold dirty buffers on read-only mount!\n",
443 					root_volume->ondisk->vol_name);
444 				error = EIO;
445 				break;
446 			}
447 		}
448 	}
449 	KKASSERT(error || bytes == 0);
450 done:
451 	if (buffer) {
452 		hammer_rel_buffer(buffer, 0);
453 		buffer = NULL;
454 	}
455 
456 	/*
457 	 * After completely flushing all the recovered buffers the volume
458 	 * header will also be flushed.
459 	 */
460 	if (root_volume->io.recovered == 0) {
461 		hammer_ref_volume(root_volume);
462 		root_volume->io.recovered = 1;
463 	}
464 
465 	/*
466 	 * Finish up flushing (or discarding) recovered buffers.  FIFO
467 	 * indices in the volume header are updated to the actual undo
468 	 * range but will not be collapsed until stage 2.
469 	 */
470 	if (error == 0) {
471 		hammer_modify_volume(NULL, root_volume, NULL, 0);
472 		rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
473 		rootmap->first_offset = first_offset;
474 		rootmap->next_offset = last_offset;
475 		hammer_modify_volume_done(root_volume);
476 		if (hmp->ronly == 0)
477 			hammer_recover_flush_buffers(hmp, root_volume, 1);
478 	} else {
479 		hammer_recover_flush_buffers(hmp, root_volume, -1);
480 	}
481 	if (degenerate_case == 0) {
482 		kprintf("HAMMER(%s) recovery complete\n",
483 			root_volume->ondisk->vol_name);
484 	} else {
485 		kprintf("HAMMER(%s) mounted clean, no recovery needed\n",
486 			root_volume->ondisk->vol_name);
487 	}
488 	return (error);
489 }
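/*
 * Illustrative sketch (compiled out): the active-FIFO span arithmetic
 * used by both recovery stages, shown standalone.  The UNDO zone is a
 * ring, so when last_offset has wrapped below first_offset the span
 * runs through alloc_offset and back around to last_offset.  The
 * function name is hypothetical.
 */
#if 0
static hammer_off_t
hammer_recover_fifo_span(hammer_blockmap_t rootmap,
			 hammer_off_t first_offset,
			 hammer_off_t last_offset)
{
	if (last_offset >= first_offset)
		return(last_offset - first_offset);
	return(rootmap->alloc_offset - first_offset +
	       (last_offset & HAMMER_OFF_LONG_MASK));
}
#endif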
490 
491 /*
492  * Execute redo operations
493  *
494  * This procedure is run at the end of the mount sequence, after the hammer
495  * mount structure has been completely initialized but before the filesystem
496  * goes live.  It can access standard cursors, the B-Tree, flush the
497  * filesystem, and so forth.
498  *
499  * This code may only be called for read-write mounts or when a mount
500  * switches from read-only to read-write.  vnodes may or may not be present.
501  *
502  * The stage1 code will have already calculated the correct FIFO range
503  * for the nominal UNDO FIFO and stored it in the rootmap.  The extended
504  * range for REDO is stored in hmp->recover_stage2_offset.
505  */
506 int
507 hammer_recover_stage2(hammer_mount_t hmp, hammer_volume_t root_volume)
508 {
509 	hammer_blockmap_t rootmap;
510 	hammer_buffer_t buffer;
511 	hammer_off_t scan_offset;
512 	hammer_off_t oscan_offset;
513 	hammer_off_t bytes;
514 	hammer_off_t ext_bytes;
515 	hammer_fifo_any_t head;
516 	hammer_off_t first_offset;
517 	hammer_off_t last_offset;
518 	hammer_off_t ext_offset;
519 	struct hammer_rterm_rb_tree rterm_root;
520 	u_int32_t seqno;
521 	int error;
522 	int verbose = 0;
523 	int dorscan;
524 
525 	/*
526 	 * Stage 2 can only be run on a RW mount, or when the mount is
527 	 * switched from RO to RW.
528 	 */
529 	KKASSERT(hmp->ronly == 0);
530 	RB_INIT(&rterm_root);
531 
532 	/*
533 	 * Examine the UNDO FIFO.  If it is empty the filesystem is clean
534 	 * and no action need be taken.
535 	 */
536 	rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
537 	first_offset = rootmap->first_offset;
538 	last_offset  = rootmap->next_offset;
539 	if (first_offset == last_offset) {
540 		KKASSERT((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0);
541 		return(0);
542 	}
543 
544 	/*
545 	 * Stage2 must only be run once, and will not be run at all
546 	 * if Stage1 did not find a REDO_SYNC record.
547 	 */
548 	error = 0;
549 	buffer = NULL;
550 
551 	if ((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0)
552 		goto done;
553 	hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_REQ;
554 	hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_RUN;
555 	ext_offset = hmp->recover_stage2_offset;
556 	if (ext_offset == 0) {
557 		kprintf("HAMMER(%s) REDO stage specified but no REDO_SYNC "
558 			"offset, ignoring\n",
559 			root_volume->ondisk->vol_name);
560 		goto done;
561 	}
562 
563 	/*
564 	 * Calculate nominal UNDO range (this is not yet the extended
565 	 * range).
566 	 */
567 	if (last_offset >= first_offset) {
568 		bytes = last_offset - first_offset;
569 	} else {
570 		bytes = rootmap->alloc_offset - first_offset +
571 			(last_offset & HAMMER_OFF_LONG_MASK);
572 	}
573 	kprintf("HAMMER(%s) recovery redo  %016jx-%016jx (%jd bytes)%s\n",
574 		root_volume->ondisk->vol_name,
575 		(intmax_t)first_offset,
576 		(intmax_t)last_offset,
577 		(intmax_t)bytes,
578 		(hmp->ronly ? " (RO)" : " (RW)"));
579 	verbose = 1;
580 	if (bytes > (rootmap->alloc_offset & HAMMER_OFF_LONG_MASK)) {
581 		kprintf("Undo size is absurd, unable to mount\n");
582 		error = EIO;
583 		goto fatal;
584 	}
585 
586 	/*
587 	 * Scan the REDOs backwards collecting REDO_TERM_* information.
588 	 * This information is only collected for the extended range,
589 	 * non-inclusive of any TERMs in the nominal UNDO range.
590 	 *
591 	 * If the stage2 extended range is inside the nominal undo range
592 	 * we have nothing to scan.
593 	 *
594 	 * This must fit in memory!
595 	 */
596 	if (first_offset < last_offset) {
597 		/*
598 		 * [      first_offset........last_offset      ]
599 		 */
600 		if (ext_offset < first_offset) {
601 			dorscan = 1;
602 			ext_bytes = first_offset - ext_offset;
603 		} else if (ext_offset > last_offset) {
604 			dorscan = 1;
605 			ext_bytes = (rootmap->alloc_offset - ext_offset) +
606 				    (first_offset & HAMMER_OFF_LONG_MASK);
607 		} else {
608 			ext_bytes = -(ext_offset - first_offset);
609 			dorscan = 0;
610 		}
611 	} else {
612 		/*
613 		 * [......last_offset         first_offset.....]
614 		 */
615 		if (ext_offset < last_offset) {
616 			ext_bytes = -((rootmap->alloc_offset - first_offset) +
617 				    (ext_offset & HAMMER_OFF_LONG_MASK));
618 			dorscan = 0;
619 		} else if (ext_offset > first_offset) {
620 			ext_bytes = -(ext_offset - first_offset);
621 			dorscan = 0;
622 		} else {
623 			ext_bytes = first_offset - ext_offset;
624 			dorscan = 1;
625 		}
626 	}
627 
628 	if (dorscan) {
629 		scan_offset = first_offset;
630 		kprintf("HAMMER(%s) Find extended redo  %016jx, %jd extbytes\n",
631 			root_volume->ondisk->vol_name,
632 			(intmax_t)ext_offset,
633 			(intmax_t)ext_bytes);
634 		seqno = hmp->recover_stage2_seqno - 1;
635 		for (;;) {
636 			head = hammer_recover_scan_rev(hmp, root_volume,
637 						       &scan_offset,
638 						       &error, &buffer);
639 			if (error)
640 				break;
641 			if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) {
642 				if (head->head.hdr_seq != seqno) {
643 					error = ERANGE;
644 					break;
645 				}
646 				error = hammer_recover_redo_rec(
647 						hmp, &rterm_root,
648 						scan_offset, &head->redo);
649 				--seqno;
650 			}
651 			if (scan_offset == ext_offset)
652 				break;
653 		}
654 		if (error) {
655 			kprintf("HAMMER(%s) Find extended redo failed %d, "
656 				"unable to run REDO\n",
657 				root_volume->ondisk->vol_name,
658 				error);
659 			goto done;
660 		}
661 	} else {
662 		kprintf("HAMMER(%s) Embedded extended redo %016jx, "
663 			"%jd extbytes\n",
664 			root_volume->ondisk->vol_name,
665 			(intmax_t)ext_offset,
666 			(intmax_t)ext_bytes);
667 	}
668 
669 	/*
670 	 * Scan the REDO forwards through the entire extended range.
671 	 * Anything with a previously recorded matching TERM is discarded.
672 	 */
673 	scan_offset = ext_offset;
674 	bytes += ext_bytes;
675 
676 	/*
677 	 * NOTE: when doing a forward scan the returned scan_offset is
678 	 *	 for the record following the returned record, so we
679 	 *	 have to remember the pre-scan offset (oscan_offset).
680 	 */
681 	while ((int64_t)bytes > 0) {
682 		KKASSERT(scan_offset != last_offset);
683 
684 		oscan_offset = scan_offset;
685 		head = hammer_recover_scan_fwd(hmp, root_volume,
686 					       &scan_offset, &error, &buffer);
687 		if (error)
688 			break;
689 
690 		error = hammer_recover_redo_run(hmp, &rterm_root,
691 						oscan_offset, &head->redo);
692 		if (error) {
693 			kprintf("HAMMER(%s) UNDO record at %016jx failed\n",
694 				root_volume->ondisk->vol_name,
695 				(intmax_t)scan_offset - head->head.hdr_size);
696 			break;
697 		}
698 		bytes -= head->head.hdr_size;
699 	}
700 	KKASSERT(error || bytes == 0);
701 
702 done:
703 	if (buffer) {
704 		hammer_rel_buffer(buffer, 0);
705 		buffer = NULL;
706 	}
707 
708 	/*
709 	 * Cleanup rterm tree
710 	 */
711 	{
712 		hammer_rterm_t rterm;
713 		hammer_rterm_entry_t rte;
714 
715 		while ((rterm = RB_ROOT(&rterm_root)) != NULL) {
716 			RB_REMOVE(hammer_rterm_rb_tree, &rterm_root, rterm);
717 			while ((rte = rterm->term_list) != NULL) {
718 				rterm->term_list = rte->next;
719 				kfree(rte, hmp->m_misc);
720 			}
721 			kfree(rterm, hmp->m_misc);
722 		}
723 	}
724 
725 	/*
726 	 * Finish up flushing (or discarding) recovered buffers by executing
727 	 * a normal flush cycle.  Setting HMNT_UNDO_DIRTY bypasses degenerate
728 	 * case tests and forces the flush in order to update the FIFO indices.
729 	 *
730 	 * If a crash occurs during the flush the entire undo/redo will be
731 	 * re-run during recovery on the next mount.
732 	 */
733 	if (error == 0) {
734 		if (rootmap->first_offset != rootmap->next_offset)
735 			hmp->hflags |= HMNT_UNDO_DIRTY;
736 		hammer_flusher_sync(hmp);
737 	}
738 fatal:
739 	hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_RUN;
740 	if (verbose) {
741 		kprintf("HAMMER(%s) End redo recovery\n",
742 			root_volume->ondisk->vol_name);
743 	}
744 	return (error);
745 }
746 
747 /*
748  * Scan backwards from *scan_offsetp, return the FIFO record prior to the
749  * record at *scan_offsetp or NULL if an error occurred.
750  *
751  * On return *scan_offsetp will be the offset of the returned record.
752  */
753 hammer_fifo_any_t
754 hammer_recover_scan_rev(hammer_mount_t hmp, hammer_volume_t root_volume,
755 			hammer_off_t *scan_offsetp,
756 			int *errorp, struct hammer_buffer **bufferp)
757 {
758 	hammer_off_t scan_offset;
759 	hammer_blockmap_t rootmap;
760 	hammer_fifo_any_t head;
761 	hammer_fifo_tail_t tail;
762 
763 	rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
764 	scan_offset = *scan_offsetp;
765 
766 	if (hammer_debug_general & 0x0080)
767 		kprintf("rev scan_offset %016jx\n", (intmax_t)scan_offset);
768 	if (scan_offset == HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0))
769 		scan_offset = rootmap->alloc_offset;
770 	if (scan_offset - sizeof(*tail) <
771 	    HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0)) {
772 		kprintf("HAMMER(%s) UNDO record at %016jx FIFO underflow\n",
773 			root_volume->ondisk->vol_name,
774 			(intmax_t)scan_offset);
775 		*errorp = EIO;
776 		return (NULL);
777 	}
778 	tail = hammer_bread(hmp, scan_offset - sizeof(*tail),
779 			    errorp, bufferp);
780 	if (*errorp) {
781 		kprintf("HAMMER(%s) Unable to read UNDO TAIL "
782 			"at %016jx\n",
783 			root_volume->ondisk->vol_name,
784 			(intmax_t)scan_offset - sizeof(*tail));
785 		return (NULL);
786 	}
787 
788 	if (hammer_check_tail_signature(tail, scan_offset) != 0) {
789 		kprintf("HAMMER(%s) Illegal UNDO TAIL signature "
790 			"at %016jx\n",
791 			root_volume->ondisk->vol_name,
792 			(intmax_t)scan_offset - sizeof(*tail));
793 		*errorp = EIO;
794 		return (NULL);
795 	}
796 	head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
797 	*scan_offsetp = scan_offset - head->head.hdr_size;
798 
799 	return (head);
800 }
801 
802 /*
803  * Scan forwards from *scan_offsetp, return the FIFO record or NULL if
804  * an error occurred.
805  *
806  * On return *scan_offsetp will be the offset of the record following
807  * the returned record.
808  */
809 hammer_fifo_any_t
810 hammer_recover_scan_fwd(hammer_mount_t hmp, hammer_volume_t root_volume,
811 			hammer_off_t *scan_offsetp,
812 			int *errorp, struct hammer_buffer **bufferp)
813 {
814 	hammer_off_t scan_offset;
815 	hammer_blockmap_t rootmap;
816 	hammer_fifo_any_t head;
817 
818 	rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
819 	scan_offset = *scan_offsetp;
820 
821 	if (hammer_debug_general & 0x0080)
822 		kprintf("fwd scan_offset %016jx\n", (intmax_t)scan_offset);
823 	if (scan_offset == rootmap->alloc_offset)
824 		scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
825 
826 	head = hammer_bread(hmp, scan_offset, errorp, bufferp);
827 	if (*errorp) {
828 		kprintf("HAMMER(%s) Unable to read UNDO HEAD at %016jx\n",
829 			root_volume->ondisk->vol_name,
830 			(intmax_t)scan_offset);
831 		return (NULL);
832 	}
833 
834 	if (hammer_check_head_signature(&head->head, scan_offset) != 0) {
835 		kprintf("HAMMER(%s) Illegal UNDO HEAD signature "
836 			"at %016jx\n",
837 			root_volume->ondisk->vol_name,
838 			(intmax_t)scan_offset);
839 		*errorp = EIO;
840 		return (NULL);
841 	}
842 	scan_offset += head->head.hdr_size;
843 	if (scan_offset == rootmap->alloc_offset)
844 		scan_offset = HAMMER_ZONE_ENCODE(HAMMER_ZONE_UNDO_INDEX, 0);
845 	*scan_offsetp = scan_offset;
846 
847 	return (head);
848 }
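/*
 * Illustrative sketch (compiled out): how the two scan helpers pair up.
 * Note the asymmetry documented above: a reverse scan leaves
 * *scan_offsetp at the start of the returned record, while a forward
 * scan leaves it just past the returned record.  The function name is
 * hypothetical and wraparound is ignored for brevity; the real loops
 * are in hammer_recover_stage1() and hammer_recover_stage2().
 */
#if 0
static void
hammer_recover_scan_example(hammer_mount_t hmp, hammer_volume_t root_volume,
			    hammer_off_t first_offset,
			    hammer_off_t last_offset)
{
	hammer_buffer_t buffer = NULL;
	hammer_fifo_any_t head;
	hammer_off_t scan_offset;
	int error = 0;

	/*
	 * Forward: scan_offset advances past each returned record until
	 * it reaches last_offset.
	 */
	scan_offset = first_offset;
	while (error == 0 && scan_offset != last_offset) {
		head = hammer_recover_scan_fwd(hmp, root_volume,
					       &scan_offset, &error, &buffer);
	}

	/*
	 * Reverse: scan_offset steps back to the start of each returned
	 * record until it reaches first_offset.
	 */
	scan_offset = last_offset;
	while (error == 0 && scan_offset != first_offset) {
		head = hammer_recover_scan_rev(hmp, root_volume,
					       &scan_offset, &error, &buffer);
	}
	if (buffer)
		hammer_rel_buffer(buffer, 0);
}
#endif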
849 
850 /*
851  * Helper function for hammer_check_{head,tail}_signature().  Check stuff
852  * once the head and tail have been established.
853  *
854  * This function validates the entire FIFO record wrapper.
855  */
856 static __inline
857 int
858 _hammer_check_signature(hammer_fifo_head_t head, hammer_fifo_tail_t tail,
859 			hammer_off_t beg_off)
860 {
861 	hammer_off_t end_off;
862 	u_int32_t crc;
863 	int bytes;
864 
865 	/*
866 	 * Check signatures.  The tail signature is allowed to be the
867 	 * head signature only for 8-byte PADs.
868 	 */
869 	if (head->hdr_signature != HAMMER_HEAD_SIGNATURE) {
870 		kprintf("HAMMER: FIFO record bad head signature "
871 			"%04x at %016jx\n",
872 			head->hdr_signature,
873 			(intmax_t)beg_off);
874 		return(2);
875 	}
876 	if (head->hdr_size < HAMMER_HEAD_ALIGN ||
877 	    (head->hdr_size & HAMMER_HEAD_ALIGN_MASK)) {
878 		kprintf("HAMMER: FIFO record unaligned or bad size "
879 			"%04x at %016jx\n",
880 			head->hdr_size,
881 			(intmax_t)beg_off);
882 		return(2);
883 	}
884 	end_off = beg_off + head->hdr_size;
885 
886 	if (head->hdr_type != HAMMER_HEAD_TYPE_PAD ||
887 	    (size_t)(end_off - beg_off) != sizeof(*tail)) {
888 		if (head->hdr_type != tail->tail_type) {
889 			kprintf("HAMMER: FIFO record head/tail type mismatch "
890 				"%04x %04x at %016jx\n",
891 				head->hdr_type, tail->tail_type,
892 				(intmax_t)beg_off);
893 			return(2);
894 		}
895 		if (head->hdr_size != tail->tail_size) {
896 			kprintf("HAMMER: FIFO record head/tail size mismatch "
897 				"%04x %04x at %016jx\n",
898 				head->hdr_size, tail->tail_size,
899 				(intmax_t)beg_off);
900 			return(2);
901 		}
902 		if (tail->tail_signature != HAMMER_TAIL_SIGNATURE) {
903 			kprintf("HAMMER: FIFO record bad tail signature "
904 				"%04x at %016jx\n",
905 				tail->tail_signature,
906 				(intmax_t)beg_off);
907 			return(3);
908 		}
909 	}
910 
911 	/*
912 	 * Non-PAD records must have a CRC and must be sized at
913 	 * least large enough to fit the head and tail.
914 	 */
915 	if (head->hdr_type != HAMMER_HEAD_TYPE_PAD) {
916 		crc = crc32(head, HAMMER_FIFO_HEAD_CRCOFF) ^
917 		      crc32(head + 1, head->hdr_size - sizeof(*head));
918 		if (head->hdr_crc != crc) {
919 			kprintf("HAMMER: FIFO record CRC failed %08x %08x "
920 				"at %016jx\n",
921 				head->hdr_crc, crc,
922 				(intmax_t)beg_off);
923 			return(EIO);
924 		}
925 		if (head->hdr_size < sizeof(*head) + sizeof(*tail)) {
926 			kprintf("HAMMER: FIFO record too small "
927 				"%04x at %016jx\n",
928 				head->hdr_size,
929 				(intmax_t)beg_off);
930 			return(EIO);
931 		}
932 	}
933 
934 	/*
935 	 * Check the tail
936 	 */
937 	bytes = head->hdr_size;
938 	tail = (void *)((char *)head + bytes - sizeof(*tail));
939 	if (tail->tail_size != head->hdr_size) {
940 		kprintf("HAMMER: Bad tail size %04x vs %04x at %016jx\n",
941 			tail->tail_size, head->hdr_size,
942 			(intmax_t)beg_off);
943 		return(EIO);
944 	}
945 	if (tail->tail_type != head->hdr_type) {
946 		kprintf("HAMMER: Bad tail type %04x vs %04x at %016jx\n",
947 			tail->tail_type, head->hdr_type,
948 			(intmax_t)beg_off);
949 		return(EIO);
950 	}
951 
952 	return(0);
953 }
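/*
 * On-media FIFO record layout validated above:
 *
 *	beg_off -> +--------------------+
 *	           | hammer_fifo_head   |  hdr_signature/type/size/seq/crc
 *	           +--------------------+
 *	           | payload            |  hdr_size minus head and tail
 *	           +--------------------+
 *	           | hammer_fifo_tail   |  tail_signature/type/size
 *	end_off -> +--------------------+  end_off = beg_off + hdr_size
 *
 * The tail duplicates the type and size so a record can be located from
 * either end:  head = (char *)(tail + 1) - tail->tail_size.  The one
 * exception is the minimal 8-byte PAD, where the head and tail overlap.
 */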
954 
955 /*
956  * Check that the FIFO record is in-bounds given the head and the
957  * hammer offset.
958  *
959  * Also checks that the head and tail structures agree with each other,
960  * but does not check beyond the signature, type, and size.
961  */
962 static int
963 hammer_check_head_signature(hammer_fifo_head_t head, hammer_off_t beg_off)
964 {
965 	hammer_fifo_tail_t tail;
966 	hammer_off_t end_off;
967 
968 	/*
969 	 * head overlaps buffer boundary.  This could be a PAD so only
970 	 * check the minimum PAD size here.
971 	 */
972 	if (((beg_off + sizeof(*tail) - 1) ^ (beg_off)) & ~HAMMER_BUFMASK64)
973 		return(1);
974 
975 	/*
976 	 * Calculate the ending offset and make sure the record does
977 	 * not cross a buffer boundary.
978 	 */
979 	end_off = beg_off + head->hdr_size;
980 	if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
981 		return(1);
982 	tail = (void *)((char *)head + head->hdr_size - sizeof(*tail));
983 	return (_hammer_check_signature(head, tail, beg_off));
984 }
985 
986 /*
987  * Check that the FIFO record is in-bounds given the tail and the
988  * hammer offset.  The offset is pointing at the ending boundary of the
989  * record.
990  *
991  * Also checks that the head and tail structures agree with each other,
992  * but does not check beyond the signature, type, and size.
993  */
994 static int
995 hammer_check_tail_signature(hammer_fifo_tail_t tail, hammer_off_t end_off)
996 {
997 	hammer_fifo_head_t head;
998 	hammer_off_t beg_off;
999 
1000 	/*
1001 	 * tail overlaps buffer boundary
1002 	 */
1003 	if (((end_off - sizeof(*tail)) ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
1004 		return(1);
1005 
1006 	/*
1007 	 * Calculate the beginning offset and make sure the record does
1008 	 * not cross a buffer boundary.
1009 	 */
1010 	beg_off = end_off - tail->tail_size;
1011 	if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64)
1012 		return(1);
1013 	head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
1014 	return (_hammer_check_signature(head, tail, beg_off));
1015 }
1016 
1017 static int
1018 hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume,
1019 		    hammer_fifo_undo_t undo)
1020 {
1021 	hammer_volume_t volume;
1022 	hammer_buffer_t buffer;
1023 	hammer_off_t buf_offset;
1024 	int zone;
1025 	int error;
1026 	int vol_no;
1027 	int bytes;
1028 	u_int32_t offset;
1029 
1030 	/*
1031 	 * Only process UNDO records.  Flag if we find other records to
1032 	 * optimize stage2 recovery.
1033 	 */
1034 	if (undo->head.hdr_type != HAMMER_HEAD_TYPE_UNDO)
1035 		return(0);
1036 
1037 	/*
1038 	 * Validate the UNDO record.
1039 	 */
1040 	bytes = undo->head.hdr_size - sizeof(*undo) -
1041 		sizeof(struct hammer_fifo_tail);
1042 	if (bytes < 0 || undo->undo_data_bytes < 0 ||
1043 	    undo->undo_data_bytes > bytes) {
1044 		kprintf("HAMMER: Corrupt UNDO record, undo_data_bytes %d/%d\n",
1045 			undo->undo_data_bytes, bytes);
1046 		return(EIO);
1047 	}
1048 
1049 	bytes = undo->undo_data_bytes;
1050 
1051 	/*
1052 	 * The undo offset may only be a zone-1 or zone-2 offset.
1053 	 *
1054 	 * Currently we only support a zone-1 offset representing the
1055 	 * volume header.
1056 	 */
1057 	zone = HAMMER_ZONE_DECODE(undo->undo_offset);
1058 	offset = undo->undo_offset & HAMMER_BUFMASK;
1059 
1060 	if (offset + bytes > HAMMER_BUFSIZE) {
1061 		kprintf("HAMMER: Corrupt UNDO record, bad offset\n");
1062 		return (EIO);
1063 	}
1064 
1065 	switch(zone) {
1066 	case HAMMER_ZONE_RAW_VOLUME_INDEX:
1067 		vol_no = HAMMER_VOL_DECODE(undo->undo_offset);
1068 		volume = hammer_get_volume(hmp, vol_no, &error);
1069 		if (volume == NULL) {
1070 			kprintf("HAMMER: UNDO record, "
1071 				"cannot access volume %d\n", vol_no);
1072 			break;
1073 		}
1074 		hammer_modify_volume(NULL, volume, NULL, 0);
1075 		hammer_recover_copy_undo(undo->undo_offset,
1076 					 (char *)(undo + 1),
1077 					 (char *)volume->ondisk + offset,
1078 					 bytes);
1079 		hammer_modify_volume_done(volume);
1080 
1081 		/*
1082 		 * Multiple modifications may be made to the same buffer.
1083 		 * Also, the volume header cannot be written out until
1084 		 * everything else has been flushed.  This also
1085 		 * covers the read-only case by preventing the kernel from
1086 		 * flushing the buffer.
1087 		 */
1088 		if (volume->io.recovered == 0)
1089 			volume->io.recovered = 1;
1090 		else
1091 			hammer_rel_volume(volume, 0);
1092 		break;
1093 	case HAMMER_ZONE_RAW_BUFFER_INDEX:
1094 		buf_offset = undo->undo_offset & ~HAMMER_BUFMASK64;
1095 		buffer = hammer_get_buffer(hmp, buf_offset, HAMMER_BUFSIZE,
1096 					   0, &error);
1097 		if (buffer == NULL) {
1098 			kprintf("HAMMER: UNDO record, "
1099 				"cannot access buffer %016jx\n",
1100 				(intmax_t)undo->undo_offset);
1101 			break;
1102 		}
1103 		hammer_modify_buffer(NULL, buffer, NULL, 0);
1104 		hammer_recover_copy_undo(undo->undo_offset,
1105 					 (char *)(undo + 1),
1106 					 (char *)buffer->ondisk + offset,
1107 					 bytes);
1108 		hammer_modify_buffer_done(buffer);
1109 
1110 		/*
1111 		 * Multiple modifications may be made to the same buffer,
1112 		 * improve performance by delaying the flush.  This also
1113 		 * covers the read-only case by preventing the kernel from
1114 		 * flushing the buffer.
1115 		 */
1116 		if (buffer->io.recovered == 0)
1117 			buffer->io.recovered = 1;
1118 		else
1119 			hammer_rel_buffer(buffer, 0);
1120 		break;
1121 	default:
1122 		kprintf("HAMMER: Corrupt UNDO record\n");
1123 		error = EIO;
1124 	}
1125 	return (error);
1126 }
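/*
 * Logical layout of an UNDO record as consumed above (undo_offset is a
 * zone-1 or zone-2 HAMMER offset naming the media being restored):
 *
 *	hammer_fifo_undo:  head, undo_offset, undo_data_bytes
 *	payload:           undo_data_bytes of saved pre-image data
 *	hammer_fifo_tail
 *
 * Recovery simply copies the pre-image back over the volume header or
 * raw buffer at undo_offset.
 */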
1127 
1128 static void
1129 hammer_recover_copy_undo(hammer_off_t undo_offset,
1130 			 char *src, char *dst, int bytes)
1131 {
1132 	if (hammer_debug_general & 0x0080) {
1133 		kprintf("UNDO %016jx: %d\n",
1134 			(intmax_t)undo_offset, bytes);
1135 	}
1136 #if 0
1137 	kprintf("UNDO %016jx:", (intmax_t)undo_offset);
1138 	hammer_recover_debug_dump(22, dst, bytes);
1139 	kprintf("%22s", "to:");
1140 	hammer_recover_debug_dump(22, src, bytes);
1141 #endif
1142 	bcopy(src, dst, bytes);
1143 }
1144 
1145 /*
1146  * Record HAMMER_REDO_TERM_WRITE and HAMMER_REDO_TERM_TRUNC operations
1147  * during the backwards scan of the extended UNDO/REDO FIFO.  This scan
1148  * does not include the nominal UNDO range, just the extended range.
1149  */
1150 int
1151 hammer_recover_redo_rec(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root,
1152 			hammer_off_t scan_offset, hammer_fifo_redo_t redo)
1153 {
1154 	hammer_rterm_t rterm;
1155 	hammer_rterm_t nrterm;
1156 	hammer_rterm_entry_t rte;
1157 
1158 	if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO)
1159 		return(0);
1160 	if (redo->redo_flags != HAMMER_REDO_TERM_WRITE &&
1161 	    redo->redo_flags != HAMMER_REDO_TERM_TRUNC) {
1162 		return(0);
1163 	}
1164 
1165 	nrterm = kmalloc(sizeof(*nrterm), hmp->m_misc, M_WAITOK|M_ZERO);
1166 	nrterm->redo_objid = redo->redo_objid;
1167 	nrterm->redo_localization = redo->redo_localization;
1168 	nrterm->redo_flags = redo->redo_flags;
1169 	nrterm->redo_offset = redo->redo_offset;
1170 
1171 	rterm = RB_INSERT(hammer_rterm_rb_tree, root, nrterm);
1172 	if (rterm)
1173 		kfree(nrterm, hmp->m_misc);
1174 	else
1175 		rterm = nrterm;
1176 
1177 	if (bootverbose) {
1178 		kprintf("record record %016jx objid %016jx "
1179 			"offset %016jx flags %08x\n",
1180 			(intmax_t)scan_offset,
1181 			(intmax_t)redo->redo_objid,
1182 			(intmax_t)redo->redo_offset,
1183 			(int)redo->redo_flags);
1184 	}
1185 
1186 	/*
1187 	 * Scan in reverse order, rte prepended, so the rte list will be
1188 	 * in forward order.
1189 	 */
1190 	rte = kmalloc(sizeof(*rte), hmp->m_misc, M_WAITOK|M_ZERO);
1191 	rte->fifo_offset = scan_offset;
1192 	rte->next = rterm->term_list;
1193 	rterm->term_list = rte;
1194 
1195 	return(0);
1196 }
1197 
1198 /*
1199  * Execute HAMMER_REDO_WRITE and HAMMER_REDO_TRUNC operations during
1200  * the forwards scan of the entire extended UNDO/REDO FIFO range.
1201  *
1202  * Records matching previously recorded TERMs have already been committed
1203  * and are ignored.
1204  */
1205 int
1206 hammer_recover_redo_run(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root,
1207 			hammer_off_t scan_offset, hammer_fifo_redo_t redo)
1208 {
1209 	struct hammer_rterm rtval;
1210 	hammer_rterm_t rterm;
1211 	hammer_rterm_entry_t rte;
1212 
1213 	if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO)
1214 		return(0);
1215 
1216 	switch(redo->redo_flags) {
1217 	case HAMMER_REDO_WRITE:
1218 	case HAMMER_REDO_TRUNC:
1219 		/*
1220 		 * We hit a REDO request.  The REDO request is only executed
1221 		 * if there is no matching TERM.
1222 		 */
1223 		bzero(&rtval, sizeof(rtval));
1224 		rtval.redo_objid = redo->redo_objid;
1225 		rtval.redo_localization = redo->redo_localization;
1226 		rtval.redo_offset = redo->redo_offset;
1227 		rtval.redo_flags = (redo->redo_flags == HAMMER_REDO_WRITE) ?
1228 				   HAMMER_REDO_TERM_WRITE :
1229 				   HAMMER_REDO_TERM_TRUNC;
1230 
1231 		rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval);
1232 		if (rterm) {
1233 			if (bootverbose) {
1234 				kprintf("ignore record %016jx objid %016jx "
1235 					"offset %016jx flags %08x\n",
1236 					(intmax_t)scan_offset,
1237 					(intmax_t)redo->redo_objid,
1238 					(intmax_t)redo->redo_offset,
1239 					(int)redo->redo_flags);
1240 			}
1241 			break;
1242 		}
1243 		if (bootverbose) {
1244 			kprintf("run    record %016jx objid %016jx "
1245 				"offset %016jx flags %08x\n",
1246 				(intmax_t)scan_offset,
1247 				(intmax_t)redo->redo_objid,
1248 				(intmax_t)redo->redo_offset,
1249 				(int)redo->redo_flags);
1250 		}
1251 
1252 		/*
1253 		 * Redo stage2 can access a live filesystem, acquire the
1254 		 * vnode.
1255 		 */
1256 		hammer_recover_redo_exec(hmp, redo);
1257 		break;
1258 	case HAMMER_REDO_TERM_WRITE:
1259 	case HAMMER_REDO_TERM_TRUNC:
1260 		/*
1261 		 * As we encounter TERMs in the forward scan we remove
1262 		 * them.  Once the forward scan hits the nominal undo range
1263 		 * there will be no more recorded TERMs.
1264 		 */
1265 		bzero(&rtval, sizeof(rtval));
1266 		rtval.redo_objid = redo->redo_objid;
1267 		rtval.redo_localization = redo->redo_localization;
1268 		rtval.redo_flags = redo->redo_flags;
1269 		rtval.redo_offset = redo->redo_offset;
1270 
1271 		rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval);
1272 		if (rterm) {
1273 			if ((rte = rterm->term_list) != NULL) {
1274 				KKASSERT(rte->fifo_offset == scan_offset);
1275 				rterm->term_list = rte->next;
1276 				kfree(rte, hmp->m_misc);
1277 			}
1278 		}
1279 		break;
1280 	}
1281 	return(0);
1282 }
1283 
1284 static void
1285 hammer_recover_redo_exec(hammer_mount_t hmp, hammer_fifo_redo_t redo)
1286 {
1287 	struct hammer_transaction trans;
1288 	struct vattr va;
1289 	struct hammer_inode *ip;
1290 	struct vnode *vp = NULL;
1291 	int error;
1292 
1293 	hammer_start_transaction(&trans, hmp);
1294 
1295 	ip = hammer_get_inode(&trans, NULL, redo->redo_objid,
1296 			      HAMMER_MAX_TID, redo->redo_localization,
1297 			      0, &error);
1298 	if (ip == NULL) {
1299 		kprintf("unable to find objid %016jx:%08x\n",
1300 			(intmax_t)redo->redo_objid, redo->redo_localization);
1301 		goto done2;
1302 	}
1303 	error = hammer_get_vnode(ip, &vp);
1304 	if (error) {
1305 		kprintf("unable to acquire vnode for %016jx:%08x\n",
1306 			(intmax_t)redo->redo_objid, redo->redo_localization);
1307 		goto done1;
1308 	}
1309 
1310 	switch(redo->redo_flags) {
1311 	case HAMMER_REDO_WRITE:
1312 		error = VOP_OPEN(vp, FREAD|FWRITE, proc0.p_ucred, NULL);
1313 		if (error) {
1314 			kprintf("vn_rdwr open %016jx:%08x returned %d\n",
1315 				(intmax_t)redo->redo_objid,
1316 				redo->redo_localization, error);
1317 			break;
1318 		}
1319 		vn_unlock(vp);
1320 		error = vn_rdwr(UIO_WRITE, vp, (void *)(redo + 1),
1321 				redo->redo_data_bytes,
1322 				redo->redo_offset, UIO_SYSSPACE,
1323 				0, proc0.p_ucred, NULL);
1324 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1325 		if (error) {
1326 			kprintf("write %016jx:%08x returned %d\n",
1327 				(intmax_t)redo->redo_objid,
1328 				redo->redo_localization, error);
1329 		}
1330 		VOP_CLOSE(vp, FREAD|FWRITE);
1331 		break;
1332 	case HAMMER_REDO_TRUNC:
1333 		VATTR_NULL(&va);
1334 		va.va_size = redo->redo_offset;
1335 		error = VOP_SETATTR(vp, &va, proc0.p_ucred);
1336 		if (error) {
1337 			kprintf("setattr offset %016jx error %d\n",
1338 				(intmax_t)redo->redo_offset, error);
1339 		}
1340 		break;
1341 	}
1342 	vput(vp);
1343 done1:
1344 	hammer_rel_inode(ip, 0);
1345 done2:
1346 	hammer_done_transaction(&trans);
1347 }
1348 
1349 /*
1350  * RB tree compare function.  Note that REDO_TERM_TRUNC ops ignore
1351  * the offset.
1352  *
1353  * WRITE@0 TERM@0 WRITE@0 .... (no TERM@0) etc.
1354  */
1355 static int
1356 hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2)
1357 {
1358 	if (rt1->redo_objid < rt2->redo_objid)
1359 		return(-1);
1360 	if (rt1->redo_objid > rt2->redo_objid)
1361 		return(1);
1362 	if (rt1->redo_localization < rt2->redo_localization)
1363 		return(-1);
1364 	if (rt1->redo_localization > rt2->redo_localization)
1365 		return(1);
1366 	if (rt1->redo_flags < rt2->redo_flags)
1367 		return(-1);
1368 	if (rt1->redo_flags > rt2->redo_flags)
1369 		return(1);
1370 	if (rt1->redo_flags != HAMMER_REDO_TERM_TRUNC) {
1371 		if (rt1->redo_offset < rt2->redo_offset)
1372 			return(-1);
1373 		if (rt1->redo_offset > rt2->redo_offset)
1374 			return(1);
1375 	}
1376 	return(0);
1377 }
1378 
1379 #if 0
1380 
1381 static void
1382 hammer_recover_debug_dump(int w, char *buf, int bytes)
1383 {
1384 	int i;
1385 
1386 	for (i = 0; i < bytes; ++i) {
1387 		if (i && (i & 15) == 0)
1388 			kprintf("\n%*.*s", w, w, "");
1389 		kprintf(" %02x", (unsigned char)buf[i]);
1390 	}
1391 	kprintf("\n");
1392 }
1393 
1394 #endif
1395 
1396 /*
1397  * Flush recovered buffers from recovery operations.  The call to this
1398  * routine may be delayed if a read-only mount was made and then later
1399  * upgraded to read-write.  This routine is also called when unmounting
1400  * a read-only mount to clean out recovered (dirty) buffers which we
1401  * couldn't flush (because the mount is read-only).
1402  *
1403  * The volume header is always written last.  The UNDO FIFO will be forced
1404  * to zero-length by setting next_offset to first_offset.  This leaves the
1405  * (now stale) UNDO information used to recover the disk available for
1406  * forensic analysis.
1407  *
1408  * final is typically 0 or 1.  The volume header is only written if final
1409  * is 1.  If final is -1 the recovered buffers are discarded instead of
1410  * written and root_volume can also be passed as NULL in that case.
1411  */
1412 static int hammer_recover_flush_volume_callback(hammer_volume_t, void *);
1413 static int hammer_recover_flush_buffer_callback(hammer_buffer_t, void *);
1414 
1415 void
1416 hammer_recover_flush_buffers(hammer_mount_t hmp, hammer_volume_t root_volume,
1417 			     int final)
1418 {
1419 	/*
1420 	 * Flush the buffers out asynchronously, wait for all the I/O to
1421 	 * complete, then do it again to destroy the buffer cache buffer
1422 	 * so it doesn't alias something later on.
1423 	 */
1424 	RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
1425 		hammer_recover_flush_buffer_callback, &final);
1426 	hammer_io_wait_all(hmp, "hmrrcw", 1);
1427 	RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL,
1428 		hammer_recover_flush_buffer_callback, &final);
1429 
1430 	/*
1431 	 * Flush all volume headers except the root volume.  If final < 0
1432 	 * we discard all volume headers including the root volume.
1433 	 */
1434 	if (final >= 0) {
1435 		RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
1436 			hammer_recover_flush_volume_callback, root_volume);
1437 	} else {
1438 		RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
1439 			hammer_recover_flush_volume_callback, NULL);
1440 	}
1441 
1442 	/*
1443 	 * Finalize the root volume header.
1444 	 *
1445 	 * No interlock is needed, volume buffers are not
1446 	 * messed with by bioops.
1447 	 */
1448 	if (root_volume && root_volume->io.recovered && final > 0) {
1449 		hammer_io_wait_all(hmp, "hmrflx", 1);
1450 		root_volume->io.recovered = 0;
1451 		hammer_io_flush(&root_volume->io, 0);
1452 		hammer_rel_volume(root_volume, 0);
1453 		hammer_io_wait_all(hmp, "hmrfly", 1);
1454 	}
1455 }
1456 
1457 /*
1458  * Callback to flush volume headers.  If discarding, data will be NULL and
1459  * all volume headers (including the root volume) will be discarded.
1460  * Otherwise data is the root_volume and we flush all volume headers
1461  * EXCEPT the root_volume.
1462  *
1463  * Clear any I/O error or modified condition when discarding buffers to
1464  * clean up the reference count, otherwise the buffer may have extra refs
1465  * on it.
1466  */
1467 static
1468 int
1469 hammer_recover_flush_volume_callback(hammer_volume_t volume, void *data)
1470 {
1471 	hammer_volume_t root_volume = data;
1472 
1473 	if (volume->io.recovered && volume != root_volume) {
1474 		volume->io.recovered = 0;
1475 		if (root_volume != NULL) {
1476 			/*
1477 			 * No interlock is needed, volume buffers are not
1478 			 * messed with by bioops.
1479 			 */
1480 			hammer_io_flush(&volume->io, 0);
1481 		} else {
1482 			hammer_io_clear_error(&volume->io);
1483 			hammer_io_clear_modify(&volume->io, 1);
1484 		}
1485 		hammer_rel_volume(volume, 0);
1486 	}
1487 	return(0);
1488 }
1489 
1490 /*
1491  * Flush or discard recovered I/O buffers.
1492  *
1493  * Clear any I/O error or modified condition when discarding buffers to
1494  * clean up the reference count, otherwise the buffer may have extra refs
1495  * on it.
1496  */
1497 static
1498 int
1499 hammer_recover_flush_buffer_callback(hammer_buffer_t buffer, void *data)
1500 {
1501 	int final = *(int *)data;
1502 	int flush;
1503 
1504 	if (buffer->io.recovered) {
1505 		buffer->io.recovered = 0;
1506 		buffer->io.reclaim = 1;
1507 		if (final < 0) {
1508 			hammer_io_clear_error(&buffer->io);
1509 			hammer_io_clear_modify(&buffer->io, 1);
1510 		} else {
1511 			hammer_io_write_interlock(&buffer->io);
1512 			hammer_io_flush(&buffer->io, 0);
1513 			hammer_io_done_interlock(&buffer->io);
1514 		}
1515 		hammer_rel_buffer(buffer, 0);
1516 	} else {
1517 		flush = hammer_ref_interlock(&buffer->io.lock);
1518 		if (flush)
1519 			atomic_add_int(&hammer_count_refedbufs, 1);
1520 
1521 		if (final < 0) {
1522 			hammer_io_clear_error(&buffer->io);
1523 			hammer_io_clear_modify(&buffer->io, 1);
1524 		}
1525 		KKASSERT(hammer_oneref(&buffer->io.lock));
1526 		buffer->io.reclaim = 1;
1527 		hammer_rel_buffer(buffer, flush);
1528 	}
1529 	return(0);
1530 }
1531 
1532