xref: /dragonfly/sys/vfs/hammer/hammer_ondisk.c (revision 6b5c5d0d)
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.28 2008/02/08 08:30:59 dillon Exp $
35  */
36 /*
37  * Manage HAMMER's on-disk structures.  These routines are primarily
38  * responsible for interfacing with the kernel's I/O subsystem and for
39  * managing in-memory structures.
40  */
41 
42 #include "hammer.h"
43 #include <sys/fcntl.h>
44 #include <sys/nlookup.h>
45 #include <sys/buf.h>
46 #include <sys/buf2.h>
47 
48 static void hammer_free_volume(hammer_volume_t volume);
49 static int hammer_load_volume(hammer_volume_t volume);
50 static int hammer_load_buffer(hammer_buffer_t buffer, int isnew);
51 static int hammer_load_node(hammer_node_t node);
52 static hammer_off_t hammer_advance_fifo(hammer_volume_t volume,
53 		hammer_off_t off, int32_t bytes);
54 
55 static hammer_off_t hammer_alloc_fifo(hammer_mount_t hmp, int32_t rec_len,
56 		int32_t data_len, struct hammer_buffer **rec_bufferp,
57 		u_int16_t hdr_type, int can_cross,
58 		struct hammer_buffer **data2_bufferp, int *errorp);
59 
60 /*
61  * Red-Black tree support for various structures
62  */
63 static int
64 hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
65 {
66 	if (ip1->obj_id < ip2->obj_id)
67 		return(-1);
68 	if (ip1->obj_id > ip2->obj_id)
69 		return(1);
70 	if (ip1->obj_asof < ip2->obj_asof)
71 		return(-1);
72 	if (ip1->obj_asof > ip2->obj_asof)
73 		return(1);
74 	return(0);
75 }
76 
77 static int
78 hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
79 {
80 	if (info->obj_id < ip->obj_id)
81 		return(-1);
82 	if (info->obj_id > ip->obj_id)
83 		return(1);
84 	if (info->obj_asof < ip->obj_asof)
85 		return(-1);
86 	if (info->obj_asof > ip->obj_asof)
87 		return(1);
88 	return(0);
89 }
90 
91 static int
92 hammer_vol_rb_compare(hammer_volume_t vol1, hammer_volume_t vol2)
93 {
94 	if (vol1->vol_no < vol2->vol_no)
95 		return(-1);
96 	if (vol1->vol_no > vol2->vol_no)
97 		return(1);
98 	return(0);
99 }
100 
101 static int
102 hammer_buf_rb_compare(hammer_buffer_t buf1, hammer_buffer_t buf2)
103 {
104 	if (buf1->buf_offset < buf2->buf_offset)
105 		return(-1);
106 	if (buf1->buf_offset > buf2->buf_offset)
107 		return(1);
108 	return(0);
109 }
110 
111 static int
112 hammer_nod_rb_compare(hammer_node_t node1, hammer_node_t node2)
113 {
114 	if (node1->node_offset < node2->node_offset)
115 		return(-1);
116 	if (node1->node_offset > node2->node_offset)
117 		return(1);
118 	return(0);
119 }
120 
121 /*
122  * Note: The lookup function for hammer_ino_rb_tree winds up being named
123  * hammer_ino_rb_tree_RB_LOOKUP_INFO(root, info).  The other lookup
124  * functions are normal, e.g. hammer_buf_rb_tree_RB_LOOKUP(root, buf_offset).
125  */
126 RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
127 RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
128 		hammer_inode_info_cmp, hammer_inode_info_t);
129 RB_GENERATE2(hammer_vol_rb_tree, hammer_volume, rb_node,
130 	     hammer_vol_rb_compare, int32_t, vol_no);
131 RB_GENERATE2(hammer_buf_rb_tree, hammer_buffer, rb_node,
132 	     hammer_buf_rb_compare, hammer_off_t, buf_offset);
133 RB_GENERATE2(hammer_nod_rb_tree, hammer_node, rb_node,
134 	     hammer_nod_rb_compare, hammer_off_t, node_offset);
135 
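/*
 * Illustrative sketch: how the lookup functions generated above are
 * typically invoked.  The inode lookup uses the INFO variant keyed by a
 * hammer_inode_info structure, while the volume lookup uses the plain
 * key-based variant.  hmp->rb_inos_root is assumed from hammer.h; the
 * example functions themselves are hypothetical.
 */
#if 0
static hammer_inode_t
example_ino_lookup(hammer_mount_t hmp, hammer_inode_info_t info)
{
	return(hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, info));
}

static hammer_volume_t
example_vol_lookup(hammer_mount_t hmp, int32_t vol_no)
{
	return(RB_LOOKUP(hammer_vol_rb_tree, &hmp->rb_vols_root, vol_no));
}
#endif
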
136 /************************************************************************
137  *				VOLUMES					*
138  ************************************************************************
139  *
140  * Load a HAMMER volume by name.  Returns 0 on success or a positive error
141  * code on failure.  Volumes must be loaded at mount time, get_volume() will
142  * not load a new volume.
143  *
144  * Calls made to hammer_load_volume() are single-threaded.
145  */
146 int
147 hammer_install_volume(struct hammer_mount *hmp, const char *volname)
148 {
149 	struct mount *mp;
150 	hammer_volume_t volume;
151 	struct hammer_volume_ondisk *ondisk;
152 	struct nlookupdata nd;
153 	struct buf *bp = NULL;
154 	int error;
155 	int ronly;
156 
157 	mp = hmp->mp;
158 	ronly = ((mp->mnt_flag & MNT_RDONLY) ? 1 : 0);
159 
160 	/*
161 	 * Allocate a volume structure
162 	 */
163 	++hammer_count_volumes;
164 	volume = kmalloc(sizeof(*volume), M_HAMMER, M_WAITOK|M_ZERO);
165 	volume->vol_name = kstrdup(volname, M_HAMMER);
166 	volume->hmp = hmp;
167 	hammer_io_init(&volume->io, HAMMER_STRUCTURE_VOLUME);
168 	volume->io.offset = 0LL;
169 
170 	/*
171 	 * Get the device vnode
172 	 */
173 	error = nlookup_init(&nd, volume->vol_name, UIO_SYSSPACE, NLC_FOLLOW);
174 	if (error == 0)
175 		error = nlookup(&nd);
176 	if (error == 0)
177 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &volume->devvp);
178 	nlookup_done(&nd);
179 	if (error == 0) {
180 		if (vn_isdisk(volume->devvp, &error)) {
181 			error = vfs_mountedon(volume->devvp);
182 		}
183 	}
184 	if (error == 0 &&
185 	    count_udev(volume->devvp->v_umajor, volume->devvp->v_uminor) > 0) {
186 		error = EBUSY;
187 	}
188 	if (error == 0) {
189 		vn_lock(volume->devvp, LK_EXCLUSIVE | LK_RETRY);
190 		error = vinvalbuf(volume->devvp, V_SAVE, 0, 0);
191 		if (error == 0) {
192 			error = VOP_OPEN(volume->devvp,
193 					 (ronly ? FREAD : FREAD|FWRITE),
194 					 FSCRED, NULL);
195 		}
196 		vn_unlock(volume->devvp);
197 	}
198 	if (error) {
199 		hammer_free_volume(volume);
200 		return(error);
201 	}
202 	volume->devvp->v_rdev->si_mountpoint = mp;
203 
204 	/*
205 	 * Extract the volume number from the volume header and do various
206 	 * sanity checks.
207 	 */
208 	error = bread(volume->devvp, 0LL, HAMMER_BUFSIZE, &bp);
209 	if (error)
210 		goto late_failure;
211 	ondisk = (void *)bp->b_data;
212 	if (ondisk->vol_signature != HAMMER_FSBUF_VOLUME) {
213 		kprintf("hammer_mount: volume %s has an invalid header\n",
214 			volume->vol_name);
215 		error = EFTYPE;
216 		goto late_failure;
217 	}
218 	volume->vol_no = ondisk->vol_no;
219 	volume->buffer_base = ondisk->vol_buf_beg;
220 	volume->vol_flags = ondisk->vol_flags;
221 	volume->nblocks = ondisk->vol_nblocks;
222 	volume->maxbuf_off = HAMMER_ENCODE_RAW_BUFFER(volume->vol_no,
223 				    ondisk->vol_buf_end - ondisk->vol_buf_beg);
224 	RB_INIT(&volume->rb_bufs_root);
225 
226 	hmp->mp->mnt_stat.f_blocks += volume->nblocks;
227 
228 	if (RB_EMPTY(&hmp->rb_vols_root)) {
229 		hmp->fsid = ondisk->vol_fsid;
230 	} else if (bcmp(&hmp->fsid, &ondisk->vol_fsid, sizeof(uuid_t))) {
231 		kprintf("hammer_mount: volume %s's fsid does not match "
232 			"other volumes\n", volume->vol_name);
233 		error = EFTYPE;
234 		goto late_failure;
235 	}
236 
237 	/*
238 	 * Insert the volume structure into the red-black tree.
239 	 */
240 	if (RB_INSERT(hammer_vol_rb_tree, &hmp->rb_vols_root, volume)) {
241 		kprintf("hammer_mount: volume %s has a duplicate vol_no %d\n",
242 			volume->vol_name, volume->vol_no);
243 		error = EEXIST;
244 	}
245 
246 	/*
247 	 * Set the root volume.  HAMMER special cases the rootvol structure.
248 	 * We do not hold a ref because this would prevent related I/O
249 	 * from being flushed.
250 	 */
251 	if (error == 0 && ondisk->vol_rootvol == ondisk->vol_no) {
252 		hmp->rootvol = volume;
253 		if (bp) {
254 			brelse(bp);
255 			bp = NULL;
256 		}
257 		hmp->fsid_udev = dev2udev(vn_todev(volume->devvp));
258 	}
259 late_failure:
260 	if (bp)
261 		brelse(bp);
262 	if (error) {
263 		/*vinvalbuf(volume->devvp, V_SAVE, 0, 0);*/
264 		VOP_CLOSE(volume->devvp, ronly ? FREAD : FREAD|FWRITE);
265 		hammer_free_volume(volume);
266 	}
267 	return (error);
268 }
269 
270 /*
271  * Unload and free a HAMMER volume.  Must return >= 0 for the scan to
272  * continue, so -1 is returned on failure (aborting the scan).
273  */
274 int
275 hammer_unload_volume(hammer_volume_t volume, void *data __unused)
276 {
277 	struct hammer_mount *hmp = volume->hmp;
278 	int ronly = ((hmp->mp->mnt_flag & MNT_RDONLY) ? 1 : 0);
279 
280 	/*
281 	 * Sync clusters, sync volume
282 	 */
283 
284 	hmp->mp->mnt_stat.f_blocks -= volume->nblocks;
285 
286 	/*
287 	 * Clean up the root volume pointer, which is held unlocked in hmp.
288 	 */
289 	if (hmp->rootvol == volume)
290 		hmp->rootvol = NULL;
291 
292 	/*
293 	 * Unload buffers.  The scan visits every buffer still associated
294 	 * with this volume and releases it along with any passively
295 	 * cached B-Tree nodes.
296 	 */
297 	RB_SCAN(hammer_buf_rb_tree, &volume->rb_bufs_root, NULL,
298 			hammer_unload_buffer, NULL);
299 	hammer_io_waitdep(&volume->io);
300 
301 	/*
302 	 * Release our buffer and flush anything left in the buffer cache.
303 	 */
304 	hammer_io_release(&volume->io, 2);
305 
306 	/*
307 	 * There should be no references on the volume and no buffers
308 	 * remaining.
309 	 */
310 	KKASSERT(volume->io.lock.refs == 0);
311 	KKASSERT(RB_EMPTY(&volume->rb_bufs_root));
312 
313 	volume->ondisk = NULL;
314 	if (volume->devvp) {
315 		if (ronly) {
316 			vinvalbuf(volume->devvp, 0, 0, 0);
317 			VOP_CLOSE(volume->devvp, FREAD);
318 		} else {
319 			vinvalbuf(volume->devvp, V_SAVE, 0, 0);
320 			VOP_CLOSE(volume->devvp, FREAD|FWRITE);
321 		}
322 	}
323 
324 	/*
325 	 * Destroy the structure
326 	 */
327 	RB_REMOVE(hammer_vol_rb_tree, &hmp->rb_vols_root, volume);
328 	hammer_free_volume(volume);
329 	return(0);
330 }
331 
332 static
333 void
334 hammer_free_volume(hammer_volume_t volume)
335 {
336 	if (volume->vol_name) {
337 		kfree(volume->vol_name, M_HAMMER);
338 		volume->vol_name = NULL;
339 	}
340 	if (volume->devvp) {
341 		if (vn_isdisk(volume->devvp, NULL) &&
342 		    volume->devvp->v_rdev &&
343 		    volume->devvp->v_rdev->si_mountpoint == volume->hmp->mp
344 		) {
345 			volume->devvp->v_rdev->si_mountpoint = NULL;
346 		}
347 		vrele(volume->devvp);
348 		volume->devvp = NULL;
349 	}
350 	--hammer_count_volumes;
351 	kfree(volume, M_HAMMER);
352 }
353 
354 /*
355  * Get a HAMMER volume.  The volume must already exist.
356  */
357 hammer_volume_t
358 hammer_get_volume(struct hammer_mount *hmp, int32_t vol_no, int *errorp)
359 {
360 	struct hammer_volume *volume;
361 
362 	/*
363 	 * Locate the volume structure
364 	 */
365 	volume = RB_LOOKUP(hammer_vol_rb_tree, &hmp->rb_vols_root, vol_no);
366 	if (volume == NULL) {
367 		*errorp = ENOENT;
368 		return(NULL);
369 	}
370 	hammer_ref(&volume->io.lock);
371 
372 	/*
373 	 * Deal with on-disk info
374 	 */
375 	if (volume->ondisk == NULL || volume->io.loading) {
376 		*errorp = hammer_load_volume(volume);
377 		if (*errorp) {
378 			hammer_rel_volume(volume, 1);
379 			volume = NULL;
380 		}
381 	} else {
382 		*errorp = 0;
383 	}
384 	return(volume);
385 }
386 
387 int
388 hammer_ref_volume(hammer_volume_t volume)
389 {
390 	int error;
391 
392 	hammer_ref(&volume->io.lock);
393 
394 	/*
395 	 * Deal with on-disk info
396 	 */
397 	if (volume->ondisk == NULL || volume->io.loading) {
398 		error = hammer_load_volume(volume);
399 		if (error)
400 			hammer_rel_volume(volume, 1);
401 	} else {
402 		error = 0;
403 	}
404 	return (error);
405 }
406 
407 hammer_volume_t
408 hammer_get_root_volume(struct hammer_mount *hmp, int *errorp)
409 {
410 	hammer_volume_t volume;
411 
412 	volume = hmp->rootvol;
413 	KKASSERT(volume != NULL);
414 	hammer_ref(&volume->io.lock);
415 
416 	/*
417 	 * Deal with on-disk info
418 	 */
419 	if (volume->ondisk == NULL || volume->io.loading) {
420 		*errorp = hammer_load_volume(volume);
421 		if (*errorp) {
422 			hammer_rel_volume(volume, 1);
423 			volume = NULL;
424 		}
425 	} else {
426 		*errorp = 0;
427 	}
428 	return (volume);
429 }
430 
431 /*
432  * Load a volume's on-disk information.  The volume must be referenced and
433  * not locked.  We temporarily acquire an exclusive lock to interlock
434  * against releases or multiple gets.
435  */
436 static int
437 hammer_load_volume(hammer_volume_t volume)
438 {
439 	struct hammer_volume_ondisk *ondisk;
440 	int error;
441 
442 	hammer_lock_ex(&volume->io.lock);
443 	KKASSERT(volume->io.loading == 0);
444 	volume->io.loading = 1;
445 
446 	if (volume->ondisk == NULL) {
447 		error = hammer_io_read(volume->devvp, &volume->io);
448 		if (error) {
449 			volume->io.loading = 0;
450 			hammer_unlock(&volume->io.lock);
451 			return (error);
452 		}
453 		volume->ondisk = ondisk = (void *)volume->io.bp->b_data;
454 	} else {
455 		error = 0;
456 	}
457 	volume->io.loading = 0;
458 	hammer_unlock(&volume->io.lock);
459 	return(0);
460 }
461 
462 /*
463  * Release a volume.  Call hammer_io_release on the last reference.  We have
464  * to acquire an exclusive lock to interlock against volume->ondisk tests
465  * in hammer_load_volume(), and hammer_io_release() also expects an exclusive
466  * lock to be held.
467  *
468  * Volumes are not unloaded from memory during normal operation.
469  */
470 void
471 hammer_rel_volume(hammer_volume_t volume, int flush)
472 {
473 	if (volume->io.lock.refs == 1) {
474 		hammer_lock_ex(&volume->io.lock);
475 		if (volume->io.lock.refs == 1) {
476 			volume->ondisk = NULL;
477 			hammer_io_release(&volume->io, flush);
478 		} else if (flush) {
479 			hammer_io_flush(&volume->io);
480 		}
481 		hammer_unlock(&volume->io.lock);
482 	}
483 	hammer_unref(&volume->io.lock);
484 }
485 
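/*
 * Illustrative sketch: the typical reference/release pairing for volume
 * access.  The example function and the volume number passed to
 * hammer_get_volume() are hypothetical.
 */
#if 0
static void
example_volume_access(hammer_mount_t hmp)
{
	hammer_volume_t volume;
	int error;

	volume = hammer_get_volume(hmp, 0, &error);
	if (volume) {
		/* volume->ondisk remains valid while the ref is held */
		hammer_rel_volume(volume, 0);
	}
}
#endif
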
486 /************************************************************************
487  *				BUFFERS					*
488  ************************************************************************
489  *
490  * Manage buffers.  Note that a buffer holds a reference to its associated
491  * volume.
492  */
493 hammer_buffer_t
494 hammer_get_buffer(hammer_mount_t hmp, hammer_off_t buf_offset,
495 		  int isnew, int *errorp)
496 {
497 	hammer_buffer_t buffer;
498 	hammer_volume_t volume;
499 	int vol_no;
500 
501 	buf_offset &= ~HAMMER_BUFMASK64;
502 	KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_RAW_BUFFER);
503 	vol_no = HAMMER_VOL_DECODE(buf_offset);
504 	volume = hammer_get_volume(hmp, vol_no, errorp);
505 	if (volume == NULL)
506 		return(NULL);
507 	/*
508 	 * NOTE: buf_offset and maxbuf_off are both full offset
509 	 * specifications.
510 	 */
511 	KKASSERT(buf_offset < volume->maxbuf_off);
512 
513 	/*
514 	 * Locate and lock the buffer structure, creating one if necessary.
515 	 */
516 again:
517 	buffer = RB_LOOKUP(hammer_buf_rb_tree, &volume->rb_bufs_root,
518 			   buf_offset);
519 	if (buffer == NULL) {
520 		++hammer_count_buffers;
521 		buffer = kmalloc(sizeof(*buffer), M_HAMMER, M_WAITOK|M_ZERO);
522 		buffer->buf_offset = buf_offset;
523 		buffer->volume = volume;
524 		hammer_io_init(&buffer->io, HAMMER_STRUCTURE_BUFFER);
525 		buffer->io.offset = volume->ondisk->vol_buf_beg +
526 				    (buf_offset & HAMMER_OFF_SHORT_MASK);
527 		TAILQ_INIT(&buffer->clist);
528 		hammer_ref(&buffer->io.lock);
529 
530 		/*
531 		 * Insert the buffer into the RB tree and handle late
532 		 * collisions.
533 		 */
534 		if (RB_INSERT(hammer_buf_rb_tree, &volume->rb_bufs_root, buffer)) {
535 			hammer_unref(&buffer->io.lock);
536 			--hammer_count_buffers;
537 			kfree(buffer, M_HAMMER);
538 			goto again;
539 		}
540 		hammer_ref(&volume->io.lock);
541 	} else {
542 		hammer_ref(&buffer->io.lock);
543 	}
544 
545 	/*
546 	 * Deal with on-disk info
547 	 */
548 	if (buffer->ondisk == NULL || buffer->io.loading) {
549 		*errorp = hammer_load_buffer(buffer, isnew);
550 		if (*errorp) {
551 			hammer_rel_buffer(buffer, 1);
552 			buffer = NULL;
553 		}
554 	} else {
555 		*errorp = 0;
556 	}
557 	hammer_rel_volume(volume, 0);
558 	return(buffer);
559 }
560 
561 static int
562 hammer_load_buffer(hammer_buffer_t buffer, int isnew)
563 {
564 	hammer_volume_t volume;
565 	void *ondisk;
566 	int error;
567 
568 	/*
569 	 * Load the buffer's on-disk info
570 	 */
571 	volume = buffer->volume;
572 	hammer_lock_ex(&buffer->io.lock);
573 	KKASSERT(buffer->io.loading == 0);
574 	buffer->io.loading = 1;
575 
576 	if (buffer->ondisk == NULL) {
577 		if (isnew) {
578 			error = hammer_io_new(volume->devvp, &buffer->io);
579 		} else {
580 			error = hammer_io_read(volume->devvp, &buffer->io);
581 		}
582 		if (error) {
583 			buffer->io.loading = 0;
584 			hammer_unlock(&buffer->io.lock);
585 			return (error);
586 		}
587 		buffer->ondisk = ondisk = (void *)buffer->io.bp->b_data;
588 	} else if (isnew) {
589 		error = hammer_io_new(volume->devvp, &buffer->io);
590 	} else {
591 		error = 0;
592 	}
593 	if (error == 0 && isnew) {
594 		hammer_modify_buffer(buffer, NULL, 0);
595 		/* additional initialization goes here */
596 	}
597 	buffer->io.loading = 0;
598 	hammer_unlock(&buffer->io.lock);
599 	return (error);
600 }
601 
602 /*
603  * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
604  */
605 int
606 hammer_unload_buffer(hammer_buffer_t buffer, void *data __unused)
607 {
608 	hammer_ref(&buffer->io.lock);
609 	hammer_flush_buffer_nodes(buffer);
610 	KKASSERT(buffer->io.lock.refs == 1);
611 	hammer_rel_buffer(buffer, 2);
612 	return(0);
613 }
614 
615 /*
616  * Reference a buffer that is either already referenced or is referenced
617  * via a specially handled pointer (aka cursor->buffer).
618  */
619 int
620 hammer_ref_buffer(hammer_buffer_t buffer)
621 {
622 	int error;
623 
624 	hammer_ref(&buffer->io.lock);
625 	if (buffer->ondisk == NULL || buffer->io.loading) {
626 		error = hammer_load_buffer(buffer, 0);
627 		if (error) {
628 			hammer_rel_buffer(buffer, 1);
629 			/*
630 			 * NOTE: buffer pointer can become stale after
631 			 * the above release.
632 			 */
633 		}
634 	} else {
635 		error = 0;
636 	}
637 	return(error);
638 }
639 
640 /*
641  * Release a buffer.  We have to deal with several places where
642  * another thread can ref the buffer.
643  *
644  * Only destroy the structure itself if the related buffer cache buffer
645  * was disassociated from it.  This ties the management of the structure
646  * to the buffer cache subsystem.  buffer->ondisk determines whether the
647  * embedded io is referenced or not.
648  */
649 void
650 hammer_rel_buffer(hammer_buffer_t buffer, int flush)
651 {
652 	hammer_volume_t volume;
653 
654 	if (buffer->io.lock.refs == 1) {
655 		hammer_lock_ex(&buffer->io.lock);
656 		if (buffer->io.lock.refs == 1) {
657 			hammer_io_release(&buffer->io, flush);
658 
659 			if (buffer->io.bp == NULL &&
660 			    buffer->io.lock.refs == 1) {
661 				hammer_flush_buffer_nodes(buffer);
662 				KKASSERT(TAILQ_EMPTY(&buffer->clist));
663 				volume = buffer->volume;
664 				RB_REMOVE(hammer_buf_rb_tree,
665 					  &volume->rb_bufs_root, buffer);
666 				buffer->volume = NULL; /* sanity */
667 				--hammer_count_buffers;
668 				kfree(buffer, M_HAMMER);
669 				hammer_rel_volume(volume, 0);
670 				return;
671 			}
672 		} else if (flush) {
673 			hammer_io_flush(&buffer->io);
674 		}
675 		hammer_unlock(&buffer->io.lock);
676 	}
677 	hammer_unref(&buffer->io.lock);
678 }
679 
680 /*
681  * Access the filesystem buffer containing the specified hammer offset.
682  * buf_offset is a conglomeration of the volume number and vol_buf_beg
683  * relative buffer offset.  It must also have bit 55 set to be valid.
684  * (see hammer_off_t in hammer_disk.h).
685  *
686  * Any prior buffer in *bufferp will be released and replaced by the
687  * requested buffer.
688  */
689 void *
690 hammer_bread(hammer_mount_t hmp, hammer_off_t buf_offset, int *errorp,
691 	     struct hammer_buffer **bufferp)
692 {
693 	hammer_buffer_t buffer;
694 	int32_t xoff = (int32_t)buf_offset & HAMMER_BUFMASK;
695 
696 	buf_offset &= ~HAMMER_BUFMASK64;
697 
698 	buffer = *bufferp;
699 	if (buffer == NULL || buffer->buf_offset != buf_offset) {
700 		if (buffer)
701 			hammer_rel_buffer(buffer, 0);
702 		buffer = hammer_get_buffer(hmp, buf_offset, 0, errorp);
703 		*bufferp = buffer;
704 	} else {
705 		*errorp = 0;
706 	}
707 
708 	/*
709 	 * Return a pointer to the buffer data.
710 	 */
711 	if (buffer == NULL)
712 		return(NULL);
713 	else
714 		return((char *)buffer->ondisk + xoff);
715 }
716 
717 /*
718  * Access the filesystem buffer containing the specified hammer offset.
719  * No disk read operation occurs.  The result buffer may contain garbage.
720  *
721  * Any prior buffer in *bufferp will be released and replaced by the
722  * requested buffer.
723  */
724 void *
725 hammer_bnew(hammer_mount_t hmp, hammer_off_t buf_offset, int *errorp,
726 	     struct hammer_buffer **bufferp)
727 {
728 	hammer_buffer_t buffer;
729 	int32_t xoff = (int32_t)buf_offset & HAMMER_BUFMASK;
730 
731 	buf_offset &= ~HAMMER_BUFMASK64;
732 
733 	buffer = *bufferp;
734 	if (buffer == NULL || buffer->buf_offset != buf_offset) {
735 		if (buffer)
736 			hammer_rel_buffer(buffer, 0);
737 		buffer = hammer_get_buffer(hmp, buf_offset, 1, errorp);
738 		*bufferp = buffer;
739 	} else {
740 		*errorp = 0;
741 	}
742 
743 	/*
744 	 * Return a pointer to the buffer data.
745 	 */
746 	if (buffer == NULL)
747 		return(NULL);
748 	else
749 		return((char *)buffer->ondisk + xoff);
750 }
751 
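/*
 * Illustrative sketch: the typical caller pattern for hammer_bread() and
 * hammer_bnew().  A single hammer_buffer pointer is passed by reference
 * and reused across calls, so consecutive accesses that fall within the
 * same large buffer do not re-reference it; the final reference is dropped
 * explicitly.  The example function and offsets are hypothetical.
 */
#if 0
static void
example_bread_usage(hammer_mount_t hmp, hammer_off_t off1, hammer_off_t off2)
{
	struct hammer_buffer *buffer = NULL;
	void *ptr;
	int error;

	ptr = hammer_bread(hmp, off1, &error, &buffer);
	if (ptr) {
		/* examine on-media bytes at off1 */
	}
	ptr = hammer_bread(hmp, off2, &error, &buffer);
	if (ptr) {
		/* examine on-media bytes at off2 */
	}
	if (buffer)
		hammer_rel_buffer(buffer, 0);
}
#endif
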
752 /************************************************************************
753  *				NODES					*
754  ************************************************************************
755  *
756  * Manage B-Tree nodes.  B-Tree nodes represent the primary indexing
757  * method used by the HAMMER filesystem.
758  *
759  * Unlike other HAMMER structures, a hammer_node can be PASSIVELY
760  * associated with its buffer, and will only referenced the buffer while
761  * associated with its buffer, and will only reference the buffer while
762  *
763  * A hammer_node can also be passively associated with other HAMMER
764  * structures, such as inodes, while retaining 0 references.  These
765  * associations can be cleared backwards using a pointer-to-pointer in
766  * the hammer_node.
767  *
768  * This allows the HAMMER implementation to cache hammer_nodes long-term
769  * and short-cut a great deal of the infrastructure's complexity.  In
770  * most cases a cached node can be reacquired without having to dip into
771  * either the buffer or cluster management code.
772  *
773  * The caller must pass a referenced cluster on call and will retain
774  * ownership of the reference on return.  The node will acquire its own
775  * additional references, if necessary.
776  */
777 hammer_node_t
778 hammer_get_node(hammer_mount_t hmp, hammer_off_t node_offset, int *errorp)
779 {
780 	hammer_volume_t volume;
781 	hammer_node_t node;
782 	int32_t vol_no;
783 
784 	KKASSERT((node_offset & HAMMER_OFF_ZONE_MASK) ==
785 		 HAMMER_ZONE_RAW_BUFFER);
786 	vol_no = HAMMER_VOL_DECODE(node_offset);
787 	volume = hammer_get_volume(hmp, vol_no, errorp);
788 	if (volume == NULL)
789 		return(NULL);
790 
791 	/*
792 	 * Locate the structure, allocating one if necessary.
793 	 */
794 again:
795 	node = RB_LOOKUP(hammer_nod_rb_tree, &volume->rb_nods_root,
796 			 node_offset);
797 	if (node == NULL) {
798 		++hammer_count_nodes;
799 		node = kmalloc(sizeof(*node), M_HAMMER, M_WAITOK|M_ZERO);
800 		node->node_offset = node_offset;
801 		node->volume = volume;	/* not directly referenced */
802 		if (RB_INSERT(hammer_nod_rb_tree, &volume->rb_nods_root,
803 			      node)) {
804 			--hammer_count_nodes;
805 			kfree(node, M_HAMMER);
806 			goto again;
807 		}
808 	}
809 	hammer_ref(&node->lock);
810 	*errorp = hammer_load_node(node);
811 	if (*errorp) {
812 		hammer_rel_node(node);
813 		node = NULL;
814 	}
815 	hammer_rel_volume(volume, 0);
816 	return(node);
817 }
818 
819 /*
820  * Reference an already-referenced node.
821  */
822 int
823 hammer_ref_node(hammer_node_t node)
824 {
825 	int error;
826 
827 	KKASSERT(node->lock.refs > 0);
828 	hammer_ref(&node->lock);
829 	if ((error = hammer_load_node(node)) != 0)
830 		hammer_rel_node(node);
831 	return(error);
832 }
833 
834 /*
835  * Load a node's on-disk data reference.
836  */
837 static int
838 hammer_load_node(hammer_node_t node)
839 {
840 	hammer_buffer_t buffer;
841 	int error;
842 
843 	if (node->ondisk)
844 		return(0);
845 	error = 0;
846 	hammer_lock_ex(&node->lock);
847 	if (node->ondisk == NULL) {
848 		/*
849 		 * This is a little confusing but the gist is that
850 		 * node->buffer determines whether the node is on
851 		 * the buffer's clist and node->ondisk determines
852 		 * whether the buffer is referenced.
853 		 */
854 		if ((buffer = node->buffer) != NULL) {
855 			error = hammer_ref_buffer(buffer);
856 		} else {
857 			buffer = hammer_get_buffer(node->volume->hmp,
858 						   node->node_offset, 0,
859 						   &error);
860 			if (buffer) {
861 				KKASSERT(error == 0);
862 				TAILQ_INSERT_TAIL(&buffer->clist,
863 						  node, entry);
864 				node->buffer = buffer;
865 			}
866 		}
867 		if (error == 0) {
868 			node->ondisk = (void *)((char *)buffer->ondisk +
869 			       (node->node_offset & HAMMER_BUFMASK));
870 		}
871 	}
872 	hammer_unlock(&node->lock);
873 	return (error);
874 }
875 
876 /*
877  * Safely reference a node, interlock against flushes via the IO subsystem.
878  */
879 hammer_node_t
880 hammer_ref_node_safe(struct hammer_mount *hmp, struct hammer_node **cache,
881 		     int *errorp)
882 {
883 	hammer_node_t node;
884 
885 	if ((node = *cache) != NULL)
886 		hammer_ref(&node->lock);
887 	if (node) {
888 		*errorp = hammer_load_node(node);
889 		if (*errorp) {
890 			hammer_rel_node(node);
891 			node = NULL;
892 		}
893 	} else {
894 		*errorp = ENOENT;
895 	}
896 	return(node);
897 }
898 
899 /*
900  * Release a hammer_node.  On the last release the node dereferences
901  * its underlying buffer and may or may not be destroyed.
902  */
903 void
904 hammer_rel_node(hammer_node_t node)
905 {
906 	hammer_buffer_t buffer;
907 
908 	/*
909 	 * If this isn't the last ref just decrement the ref count and
910 	 * return.
911 	 */
912 	if (node->lock.refs > 1) {
913 		hammer_unref(&node->lock);
914 		return;
915 	}
916 
917 	/*
918 	 * If there is no ondisk info or no buffer, the node failed to load;
919 	 * remove the last reference and destroy the node.
920 	 */
921 	if (node->ondisk == NULL) {
922 		hammer_unref(&node->lock);
923 		hammer_flush_node(node);
924 		/* node is stale now */
925 		return;
926 	}
927 
928 	/*
929 	 * Do final cleanups and then either destroy the node or leave it
930 	 * passively cached.  The buffer reference is removed regardless.
931 	 */
932 	buffer = node->buffer;
933 	node->ondisk = NULL;
934 
935 	if ((node->flags & (HAMMER_NODE_DELETED|HAMMER_NODE_FLUSH)) == 0) {
936 		hammer_unref(&node->lock);
937 		hammer_rel_buffer(buffer, 0);
938 		return;
939 	}
940 
941 	/*
942 	 * Destroy the node if it has been marked for deletion.  We mark
943 	 * it as being free.  Note that the disk space is physically
944 	 * freed when the fifo cycles back through the node.
945 	 */
946 	if (node->flags & HAMMER_NODE_DELETED)
947 		hammer_free_fifo(node->volume->hmp, node->node_offset);
948 
949 	/*
950 	 * Destroy the node.  Record pertinent data because the node
951 	 * becomes stale the instant we flush it.
952 	 */
953 	hammer_unref(&node->lock);
954 	hammer_flush_node(node);
955 	/* node is stale */
956 	hammer_rel_buffer(buffer, 0);
957 }
958 
959 /*
960  * Passively cache a referenced hammer_node in *cache.  The caller may
961  * release the node on return.
962  */
963 void
964 hammer_cache_node(hammer_node_t node, struct hammer_node **cache)
965 {
966 	hammer_node_t old;
967 
968 	/*
969 	 * If the node is being deleted, don't cache it!
970 	 */
971 	if (node->flags & HAMMER_NODE_DELETED)
972 		return;
973 
974 	/*
975 	 * Cache the node.  If we previously cached a different node we
976 	 * have to give HAMMER a chance to destroy it.
977 	 */
978 again:
979 	if (node->cache1 != cache) {
980 		if (node->cache2 != cache) {
981 			if ((old = *cache) != NULL) {
982 				KKASSERT(node->lock.refs != 0);
983 				hammer_uncache_node(cache);
984 				goto again;
985 			}
986 			if (node->cache2)
987 				*node->cache2 = NULL;
988 			node->cache2 = node->cache1;
989 			node->cache1 = cache;
990 			*cache = node;
991 		} else {
992 			struct hammer_node **tmp;
993 			tmp = node->cache1;
994 			node->cache1 = node->cache2;
995 			node->cache2 = tmp;
996 		}
997 	}
998 }
999 
1000 void
1001 hammer_uncache_node(struct hammer_node **cache)
1002 {
1003 	hammer_node_t node;
1004 
1005 	if ((node = *cache) != NULL) {
1006 		*cache = NULL;
1007 		if (node->cache1 == cache) {
1008 			node->cache1 = node->cache2;
1009 			node->cache2 = NULL;
1010 		} else if (node->cache2 == cache) {
1011 			node->cache2 = NULL;
1012 		} else {
1013 			panic("hammer_uncache_node: missing cache linkage");
1014 		}
1015 		if (node->cache1 == NULL && node->cache2 == NULL)
1016 			hammer_flush_node(node);
1017 	}
1018 }
1019 
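/*
 * Illustrative sketch: the passive cache pattern described above.  A
 * caller caches a node pointer, drops its own reference, and later
 * reacquires the node via hammer_ref_node_safe().  The example function
 * is hypothetical; the cache storage would normally be embedded in
 * another structure such as an inode.
 */
#if 0
static void
example_node_cache(hammer_mount_t hmp, hammer_node_t node,
		   struct hammer_node **cache)
{
	int error;

	hammer_cache_node(node, cache);
	hammer_rel_node(node);

	node = hammer_ref_node_safe(hmp, cache, &error);
	if (node)
		hammer_rel_node(node);
}
#endif
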
1020 /*
1021  * Remove a node's cache references and destroy the node if it has no
1022  * other references or backing store.
1023  */
1024 void
1025 hammer_flush_node(hammer_node_t node)
1026 {
1027 	hammer_buffer_t buffer;
1028 
1029 	if (node->cache1)
1030 		*node->cache1 = NULL;
1031 	if (node->cache2)
1032 		*node->cache2 = NULL;
1033 	if (node->lock.refs == 0 && node->ondisk == NULL) {
1034 		RB_REMOVE(hammer_nod_rb_tree, &node->volume->rb_nods_root,
1035 			  node);
1036 		if ((buffer = node->buffer) != NULL) {
1037 			node->buffer = NULL;
1038 			TAILQ_REMOVE(&buffer->clist, node, entry);
1039 			/* buffer is unreferenced because ondisk is NULL */
1040 		}
1041 		--hammer_count_nodes;
1042 		kfree(node, M_HAMMER);
1043 	}
1044 }
1045 
1046 /*
1047  * Flush passively cached B-Tree nodes associated with this buffer.
1048  * This is only called when the buffer is about to be destroyed, so
1049  * none of the nodes should have any references.
1050  */
1051 void
1052 hammer_flush_buffer_nodes(hammer_buffer_t buffer)
1053 {
1054 	hammer_node_t node;
1055 
1056 	while ((node = TAILQ_FIRST(&buffer->clist)) != NULL) {
1057 		KKASSERT(node->lock.refs == 0 && node->ondisk == NULL);
1058 		hammer_ref(&node->lock);
1059 		node->flags |= HAMMER_NODE_FLUSH;
1060 		hammer_rel_node(node);
1061 	}
1062 }
1063 
1064 
1065 /************************************************************************
1066  *				ALLOCATORS				*
1067  ************************************************************************/
1068 
1069 /*
1070  * Allocate a B-Tree node.
1071  */
1072 hammer_node_t
1073 hammer_alloc_btree(hammer_mount_t hmp, int *errorp)
1074 {
1075 	hammer_buffer_t buffer = NULL;
1076 	hammer_node_t node = NULL;
1077 	hammer_off_t node_offset;
1078 
1079 	node_offset = hammer_alloc_fifo(hmp, sizeof(struct hammer_node_ondisk),
1080 				        0, &buffer, HAMMER_HEAD_TYPE_BTREE,
1081 					0, NULL,
1082 					errorp);
1083 	if (*errorp == 0)
1084 		node = hammer_get_node(hmp, node_offset, errorp);
1085 	if (buffer)
1086 		hammer_rel_buffer(buffer, 0);
1087 	return(node);
1088 }
1089 
1090 /*
1091  * The returned buffers are already appropriately marked as being modified.
1092  * If the caller marks them again, unnecessary undo records may be generated.
1093  *
1094  * The core record (rec_len) cannot cross a buffer boundary.  The record + data
1095  * is only allowed to cross a buffer boundary for HAMMER_RECTYPE_DATA records.
1096  */
1097 void *
1098 hammer_alloc_record(hammer_mount_t hmp,
1099                         hammer_off_t *rec_offp, u_int8_t rec_type,
1100                         int32_t rec_len, struct hammer_buffer **rec_bufferp,
1101                         hammer_off_t *data_offp, int32_t data_len,
1102                         void **data1p, void **data2p, int32_t *data2_index,
1103                         struct hammer_buffer **data2_bufferp,
1104                         int *errorp)
1105 {
1106 	int32_t aligned_rec_len, n;
1107 	hammer_off_t rec_offset;
1108 	hammer_record_ondisk_t rec;
1109 	int can_cross;
1110 
1111 	aligned_rec_len = (rec_len + HAMMER_HEAD_ALIGN_MASK) &
1112 			  ~HAMMER_HEAD_ALIGN_MASK;
1113 	can_cross = (rec_type == HAMMER_RECTYPE_DATA);
1114 
1115 	rec_offset = hammer_alloc_fifo(hmp, aligned_rec_len, data_len,
1116 				       rec_bufferp, HAMMER_HEAD_TYPE_RECORD,
1117 				       can_cross, data2_bufferp, errorp);
1118 	if (*errorp)
1119 		return(NULL);
1120 
1121 	/*
1122 	 * Basic return values.
1123 	 */
1124 	*rec_offp = rec_offset;
1125 	if (data_offp)
1126 		*data_offp = rec_offset + aligned_rec_len;
1127 	rec = (void *)((char *)(*rec_bufferp)->ondisk +
1128 		       ((int32_t)rec_offset & HAMMER_BUFMASK));
1129 	if (data_len)
1130 		rec->base.data_off = rec_offset + aligned_rec_len;
1131 	rec->base.data_len = data_len;
1132 	if (data1p)
1133 		*data1p = (void *)((char *)rec + aligned_rec_len);
1134 	if (data2_index) {
1135 		n = ((int32_t)rec_offset & HAMMER_BUFMASK) +
1136 		     aligned_rec_len + data_len;
1137 		if (n > HAMMER_BUFSIZE) {
1138 			*data2_index = data_len - (n - HAMMER_BUFSIZE);
1139 			KKASSERT(can_cross != 0);
1140 			*data2p = (*data2_bufferp)->ondisk;
1141 		} else {
1142 			*data2_index = data_len;
1143 			*data2p = NULL;
1144 		}
1145 	} else {
1146 		KKASSERT(data2p == NULL);
1147 	}
1148 	return(rec);
1149 }
1150 
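/*
 * Illustrative sketch: allocating a data record whose payload may cross a
 * buffer boundary.  The example function, record type, and lengths are
 * hypothetical; data2_index reports how many data bytes are reachable
 * through data1, with any remainder reachable through data2.
 */
#if 0
static void
example_alloc_record(hammer_mount_t hmp, int32_t data_len)
{
	struct hammer_buffer *rec_buffer = NULL;
	struct hammer_buffer *data2_buffer = NULL;
	hammer_record_ondisk_t rec;
	hammer_off_t rec_off;
	hammer_off_t data_off;
	void *data1;
	void *data2;
	int32_t data2_index;
	int error;

	rec = hammer_alloc_record(hmp, &rec_off, HAMMER_RECTYPE_DATA,
				  sizeof(*rec), &rec_buffer,
				  &data_off, data_len, &data1,
				  &data2, &data2_index, &data2_buffer,
				  &error);
	if (rec) {
		/* fill in rec, data1 and (if non-NULL) data2 here */
	}
	if (rec_buffer)
		hammer_rel_buffer(rec_buffer, 0);
	if (data2_buffer)
		hammer_rel_buffer(data2_buffer, 0);
}
#endif
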
1151 /*
1152  * Generate an undo fifo entry and return the buffer to the caller (XXX).
1153  * The caller must create a dependency to ensure that the undo record is
1154  * flushed before the modified buffer is flushed.
1155  */
1156 int
1157 hammer_generate_undo(hammer_mount_t hmp, hammer_off_t off, void *base, int len)
1158 {
1159 	hammer_off_t rec_offset;
1160 	hammer_fifo_undo_t undo;
1161 	hammer_buffer_t buffer = NULL;
1162 	int error;
1163 
1164 	rec_offset = hammer_alloc_fifo(hmp, sizeof(*undo), len,
1165 				       &buffer, HAMMER_HEAD_TYPE_UNDO,
1166 				       0, NULL, &error);
1167 	if (error == 0) {
1168 		undo = (void *)((char *)buffer->ondisk +
1169 				((int32_t)rec_offset & HAMMER_BUFMASK));
1170 		undo->undo_offset = off;
1171 		bcopy(base, undo + 1, len);
1172 	}
1173 	if (buffer)
1174 		hammer_rel_buffer(buffer, 0);
1175 	return(error);
1176 }
1177 
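/*
 * Illustrative sketch: recording the before-image of an on-media structure
 * prior to overwriting it.  The example function and the choice of a fifo
 * header as the covered region are hypothetical; the flush dependency
 * mentioned above remains the caller's responsibility.
 */
#if 0
static int
example_generate_undo(hammer_mount_t hmp, hammer_off_t fifo_offset,
		      hammer_fifo_head_t head)
{
	/* the undo record covers sizeof(*head) bytes at fifo_offset */
	return(hammer_generate_undo(hmp, fifo_offset, head, sizeof(*head)));
}
#endif
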
1178 /*
1179  * Allocate space from the FIFO.  The first rec_len bytes will be zero'd.
1180  * The entire space is marked modified (the caller should not remark it as
1181  * that will cause unnecessary undo records to be added).
1182  */
1183 static
1184 hammer_off_t
1185 hammer_alloc_fifo(hammer_mount_t hmp, int32_t rec_len, int32_t data_len,
1186 		  struct hammer_buffer **rec_bufferp, u_int16_t hdr_type,
1187 		  int can_cross,
1188 		  struct hammer_buffer **data2_bufferp, int *errorp)
1189 {
1190 	hammer_volume_t root_volume;
1191 	hammer_volume_t end_volume;
1192 	hammer_volume_ondisk_t ondisk;
1193 	hammer_fifo_head_t head;
1194 	hammer_off_t end_off = 0;
1195 	hammer_off_t tmp_off = 0;
1196 	int32_t end_vol_no;
1197 	int32_t tmp_vol_no;
1198 	int32_t xoff;
1199 	int32_t aligned_bytes;
1200 	int must_pad;
1201 
1202 	aligned_bytes = (rec_len + data_len + HAMMER_HEAD_ALIGN_MASK) &
1203 			~HAMMER_HEAD_ALIGN_MASK;
1204 
1205 	root_volume = hammer_get_root_volume(hmp, errorp);
1206 	while (root_volume) {
1207 		hammer_modify_volume(root_volume, NULL, 0);
1208 		ondisk = root_volume->ondisk;
1209 
1210 		end_off = ondisk->vol0_fifo_end;
1211 		end_vol_no = HAMMER_VOL_DECODE(end_off);
1212 
1213 		end_volume = hammer_get_volume(hmp, end_vol_no, errorp);
1214 		if (*errorp)
1215 			goto done;
1216 
1217 		/*
1218 		 * Check to see if we ran out of space.  Include some extra
1219 		 * room.
1220 		 *
1221 		 * vol0_fifo_end cannot be advanced into the same buffer
1222 		 * that vol0_fifo_beg resides in.  This allows us to
1223 		 * instantiate a new buffer without reading it in.
1224 		 *
1225 		 * XXX messy.
1226 		 */
1227 		tmp_off = ondisk->vol0_fifo_beg & ~HAMMER_BUFMASK64;
1228 		tmp_vol_no = HAMMER_VOL_DECODE(tmp_off);
1229 		if ((tmp_off & HAMMER_OFF_SHORT_MASK) == 0) {
1230 			if (end_vol_no + 1 == tmp_vol_no) {
1231 				tmp_vol_no = end_vol_no;
1232 				tmp_off = end_volume->maxbuf_off;
1233 			} else if (end_vol_no + 1 == hmp->nvolumes &&
1234 				   tmp_vol_no == 0) {
1235 				tmp_vol_no = end_vol_no;
1236 				tmp_off = end_volume->maxbuf_off;
1237 			}
1238 		}
1239 		hammer_rel_volume(end_volume, 0);
1240 
1241 		/*
1242 		 * XXX dummy head at end of fifo
1243 		 */
1244 		if (end_vol_no == tmp_vol_no &&
1245 		    end_off < tmp_off &&
1246 		    end_off + aligned_bytes + sizeof(*head) >= tmp_off) {
1247 			*errorp = ENOSPC;
1248 			goto done;
1249 		}
1250 
1251 		if ((int32_t)end_off & HAMMER_BUFMASK)
1252 			head = hammer_bread(hmp, end_off, errorp, rec_bufferp);
1253 		else
1254 			head = hammer_bnew(hmp, end_off, errorp, rec_bufferp);
1255 		if (*errorp)
1256 			goto done;
1257 
1258 		/*
1259 		 * Load the buffer, retry if someone else squeaked in
1260 		 * while we were blocked.
1261 		 */
1262 
1263 		if (ondisk->vol0_fifo_end != end_off)
1264 			continue;
1265 
1266 		/*
1267 		 * Ok, we're gonna do something.  Modify the buffer
1268 		 */
1269 		hammer_modify_buffer(*rec_bufferp, NULL, 0);
1270 		if (ondisk->vol0_fifo_end != end_off)
1271 			continue;
1272 		xoff = (int32_t)end_off & HAMMER_BUFMASK;
1273 
1274 		/*
1275 		 * The non-data portion of the fifo record cannot cross
1276 		 * a buffer boundary.
1277 		 *
1278 		 * The entire record cannot cross a buffer boundary if
1279 		 * can_cross is 0.
1280 		 *
1281 		 * It is illegal for a record to cross a volume boundary.
1282 		 *
1283 		 * It is illegal for a record to cross a recovery boundary
1284 		 * (this is so recovery code is guaranteed a record rather
1285 		 * than data at certain points).
1286 		 *
1287 		 * Add a pad record and loop if it does.
1288 		 */
1289 		must_pad = 0;
1290 		if (xoff + rec_len > HAMMER_BUFSIZE)
1291 			must_pad = 1;
1292 		if (can_cross == 0) {
1293 			if (xoff + aligned_bytes > HAMMER_BUFSIZE)
1294 				must_pad = 1;
1295 		} else {
1296 			if (xoff + aligned_bytes > HAMMER_BUFSIZE &&
1297 			    (end_off + aligned_bytes) >=
1298 			    (*rec_bufferp)->volume->maxbuf_off) {
1299 				must_pad = 1;
1300 			}
1301 			if ((end_off ^ (end_off + aligned_bytes)) &
1302 			    HAMMER_OFF_SHORT_REC_MASK) {
1303 				must_pad = 1;
1304 			}
1305 		}
1306 		if (must_pad) {
1307 			must_pad = HAMMER_BUFSIZE - xoff;
1308 			head->hdr_signature = HAMMER_HEAD_SIGNATURE;
1309 			head->hdr_type = HAMMER_HEAD_TYPE_PAD;
1310 			head->hdr_fwd_link = must_pad;
1311 			head->hdr_seq = 0; /* XXX seq */
1312 			KKASSERT((must_pad & 7) == 0);
1313 			ondisk->vol0_fifo_end =
1314 				hammer_advance_fifo((*rec_bufferp)->volume,
1315 						    end_off, must_pad);
1316 			/* XXX rev_link */
1317 			continue;
1318 		}
1319 
1320 		if (xoff + aligned_bytes > HAMMER_BUFSIZE) {
1321 			KKASSERT(xoff + aligned_bytes <= HAMMER_BUFSIZE * 2);
1322 			hammer_bnew(hmp, end_off + (HAMMER_BUFSIZE - xoff),
1323 				    errorp, data2_bufferp);
1324 			hammer_modify_buffer(*data2_bufferp, NULL, 0);
1325 			if (*errorp)
1326 				goto done;
1327 		}
1328 
1329 		head->hdr_signature = HAMMER_HEAD_SIGNATURE;
1330 		head->hdr_type = hdr_type;
1331 		head->hdr_fwd_link = aligned_bytes / 64;
1332 		head->hdr_rev_link = -1; /* XXX */
1333 		head->hdr_crc = 0;
1334 		head->hdr_seq = 0;	/* XXX */
1335 		ondisk->vol0_fifo_end =
1336 			hammer_advance_fifo((*rec_bufferp)->volume,
1337 					    end_off, aligned_bytes);
1338 done:
1339 		hammer_rel_volume(root_volume, 0);
1340 		break;
1341 	}
1342 	if (*errorp)
1343 		end_off = 0;
1344 	return(end_off);
1345 }
1346 
1347 /*
1348  * Mark a fifo record as having been freed.  XXX needs undo.
1349  */
1350 void
1351 hammer_free_fifo(hammer_mount_t hmp, hammer_off_t fifo_offset)
1352 {
1353 	hammer_buffer_t buffer = NULL;
1354 	hammer_fifo_head_t head;
1355 	int error;
1356 
1357 	head = hammer_bread(hmp, fifo_offset, &error, &buffer);
1358 	if (head) {
1359 		hammer_modify_buffer(buffer, &head->hdr_type,
1360 				     sizeof(head->hdr_type));
1361 		head->hdr_type |= HAMMER_HEAD_TYPEF_FREED;
1362 	}
1363 	if (buffer)
1364 		hammer_rel_buffer(buffer, 0);
1365 }
1366 
1367 /*
1368  * Attempt to rewind the FIFO
1369  *
1370  * This routine is allowed to do nothing.
1371  */
1372 void
1373 hammer_unwind_fifo(hammer_mount_t hmp, hammer_off_t rec_offset)
1374 {
1375 }
1376 
1377 /*
1378  * Advance the FIFO a certain number of bytes.
1379  */
1380 static
1381 hammer_off_t
1382 hammer_advance_fifo(hammer_volume_t volume, hammer_off_t off, int32_t bytes)
1383 {
1384 	int32_t vol_no;
1385 
1386 	off += bytes;
1387 	KKASSERT(off <= volume->maxbuf_off);
1388 	KKASSERT((off & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_RAW_BUFFER);
1389 	if (off == volume->maxbuf_off) {
1390 		vol_no = volume->vol_no + 1;
1391 		if (vol_no == volume->hmp->nvolumes)
1392 			vol_no = 0;
1393 		off = HAMMER_ENCODE_RAW_BUFFER(vol_no, 0);
1394 	}
1395 	return(off);
1396 }
1397 
1398 /*
1399  * Sync dirty buffers to the media
1400  */
1401 
1402 static int hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data);
1403 static int hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data);
1404 
1405 int
1406 hammer_sync_hmp(hammer_mount_t hmp, int waitfor)
1407 {
1408 	struct hammer_sync_info info;
1409 
1410 	info.error = 0;
1411 	info.waitfor = waitfor;
1412 
1413 	vmntvnodescan(hmp->mp, VMSC_GETVP|VMSC_NOWAIT,
1414 		      hammer_sync_scan1, hammer_sync_scan2, &info);
1415 
1416 	RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
1417 		hammer_sync_volume, &info);
1418 	return(info.error);
1419 }
1420 
1421 static int
1422 hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data)
1423 {
1424 	struct hammer_inode *ip;
1425 
1426 	ip = VTOI(vp);
1427 	if (vp->v_type == VNON || ip == NULL ||
1428 	    ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
1429 	     RB_EMPTY(&vp->v_rbdirty_tree))) {
1430 		return(-1);
1431 	}
1432 	return(0);
1433 }
1434 
1435 static int
1436 hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
1437 {
1438 	struct hammer_sync_info *info = data;
1439 	struct hammer_inode *ip;
1440 	int error;
1441 
1442 	ip = VTOI(vp);
1443 	if (vp->v_type == VNON || vp->v_type == VBAD ||
1444 	    ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
1445 	     RB_EMPTY(&vp->v_rbdirty_tree))) {
1446 		return(0);
1447 	}
1448 	error = VOP_FSYNC(vp, info->waitfor);
1449 	if (error)
1450 		info->error = error;
1451 	return(0);
1452 }
1453 
1454 int
1455 hammer_sync_volume(hammer_volume_t volume, void *data)
1456 {
1457 	struct hammer_sync_info *info = data;
1458 
1459 	hammer_ref(&volume->io.lock);
1460 	RB_SCAN(hammer_buf_rb_tree, &volume->rb_bufs_root, NULL,
1461 		hammer_sync_buffer, info);
1462 	hammer_rel_volume(volume, 1);
1463 	return(0);
1464 }
1465 
1466 int
1467 hammer_sync_buffer(hammer_buffer_t buffer, void *data __unused)
1468 {
1469 	hammer_ref(&buffer->io.lock);
1470 	hammer_rel_buffer(buffer, 1);
1471 	return(0);
1472 }
1473 
1474 /*
1475  * Generic buffer initialization.  Initialize the A-list into an all-allocated
1476  * state with the free block limit properly set.
1477  *
1478  * Note that alloc_new_buffer() will free the appropriate block range via
1479  * the appropriate cluster alist, so the free count is properly propagated.
1480  */
1481 void
1482 hammer_init_fifo(hammer_fifo_head_t head, u_int16_t type)
1483 {
1484 	head->hdr_signature = HAMMER_HEAD_SIGNATURE;
1485 	head->hdr_type = type;
1486 	head->hdr_rev_link = 0;
1487 	head->hdr_fwd_link = 0;
1488 	head->hdr_crc = 0;
1489 	head->hdr_seq = 0;
1490 }
1491 
1492