xref: /dragonfly/sys/vfs/hammer/hammer_ondisk.c (revision 3641b7ca)
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.49 2008/06/03 18:47:25 dillon Exp $
35  */
36 /*
37  * Manage HAMMER's on-disk structures.  These routines are primarily
38  * responsible for interfacing with the kernel's I/O subsystem and for
39  * managing in-memory structures.
40  */
41 
42 #include "hammer.h"
43 #include <sys/fcntl.h>
44 #include <sys/nlookup.h>
45 #include <sys/buf.h>
46 #include <sys/buf2.h>
47 
48 static void hammer_free_volume(hammer_volume_t volume);
49 static int hammer_load_volume(hammer_volume_t volume);
50 static int hammer_load_buffer(hammer_buffer_t buffer, int isnew);
51 static int hammer_load_node(hammer_node_t node, int isnew);
52 
53 /*
54  * Red-Black tree support for various structures
55  */
56 static int
57 hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
58 {
59 	if (ip1->obj_id < ip2->obj_id)
60 		return(-1);
61 	if (ip1->obj_id > ip2->obj_id)
62 		return(1);
63 	if (ip1->obj_asof < ip2->obj_asof)
64 		return(-1);
65 	if (ip1->obj_asof > ip2->obj_asof)
66 		return(1);
67 	return(0);
68 }
69 
70 static int
71 hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
72 {
73 	if (info->obj_id < ip->obj_id)
74 		return(-1);
75 	if (info->obj_id > ip->obj_id)
76 		return(1);
77 	if (info->obj_asof < ip->obj_asof)
78 		return(-1);
79 	if (info->obj_asof > ip->obj_asof)
80 		return(1);
81 	return(0);
82 }
83 
84 static int
85 hammer_vol_rb_compare(hammer_volume_t vol1, hammer_volume_t vol2)
86 {
87 	if (vol1->vol_no < vol2->vol_no)
88 		return(-1);
89 	if (vol1->vol_no > vol2->vol_no)
90 		return(1);
91 	return(0);
92 }
93 
94 static int
95 hammer_buf_rb_compare(hammer_buffer_t buf1, hammer_buffer_t buf2)
96 {
97 	if (buf1->zone2_offset < buf2->zone2_offset)
98 		return(-1);
99 	if (buf1->zone2_offset > buf2->zone2_offset)
100 		return(1);
101 	return(0);
102 }
103 
104 static int
105 hammer_nod_rb_compare(hammer_node_t node1, hammer_node_t node2)
106 {
107 	if (node1->node_offset < node2->node_offset)
108 		return(-1);
109 	if (node1->node_offset > node2->node_offset)
110 		return(1);
111 	return(0);
112 }
113 
114 /*
115  * Note: The lookup function for hammer_ino_rb_tree winds up being named
116  * hammer_ino_rb_tree_RB_LOOKUP_INFO(root, info).  The other lookup
117  * functions are normal, e.g. hammer_buf_rb_tree_RB_LOOKUP(root, zone2_offset).
118  */
119 RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
120 RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
121 		hammer_inode_info_cmp, hammer_inode_info_t);
122 RB_GENERATE2(hammer_vol_rb_tree, hammer_volume, rb_node,
123 	     hammer_vol_rb_compare, int32_t, vol_no);
124 RB_GENERATE2(hammer_buf_rb_tree, hammer_buffer, rb_node,
125 	     hammer_buf_rb_compare, hammer_off_t, zone2_offset);
126 RB_GENERATE2(hammer_nod_rb_tree, hammer_node, rb_node,
127 	     hammer_nod_rb_compare, hammer_off_t, node_offset);
128 
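#if 0
/*
 * Example (illustrative, not compiled): invoking the generated lookup
 * functions.  The INFO variant keys on a hammer_inode_info structure
 * while the RB_GENERATE2() trees are keyed directly on a field.  The
 * tree field names below follow hammer.h.
 */
static hammer_inode_t
example_lookups(hammer_mount_t hmp, hammer_volume_t volume,
		hammer_inode_info_t info, hammer_off_t zone2_offset)
{
	hammer_buffer_t buffer;

	buffer = hammer_buf_rb_tree_RB_LOOKUP(&volume->rb_bufs_root,
					      zone2_offset);
	(void)buffer;
	return(hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, info));
}
#endif
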
129 /************************************************************************
130  *				VOLUMES					*
131  ************************************************************************
132  *
133  * Load a HAMMER volume by name.  Returns 0 on success or a positive error
134  * code on failure.  Volumes must be loaded at mount time, get_volume() will
135  * not load a new volume.
136  *
137  * Calls made to hammer_load_volume() are single-threaded during mount.
138  */
139 int
140 hammer_install_volume(struct hammer_mount *hmp, const char *volname)
141 {
142 	struct mount *mp;
143 	hammer_volume_t volume;
144 	struct hammer_volume_ondisk *ondisk;
145 	struct nlookupdata nd;
146 	struct buf *bp = NULL;
147 	int error;
148 	int ronly;
149 	int setmp = 0;
150 
151 	mp = hmp->mp;
152 	ronly = ((mp->mnt_flag & MNT_RDONLY) ? 1 : 0);
153 
154 	/*
155 	 * Allocate a volume structure
156 	 */
157 	++hammer_count_volumes;
158 	volume = kmalloc(sizeof(*volume), M_HAMMER, M_WAITOK|M_ZERO);
159 	volume->vol_name = kstrdup(volname, M_HAMMER);
160 	hammer_io_init(&volume->io, hmp, HAMMER_STRUCTURE_VOLUME);
161 	volume->io.offset = 0LL;
162 
163 	/*
164 	 * Get the device vnode
165 	 */
166 	error = nlookup_init(&nd, volume->vol_name, UIO_SYSSPACE, NLC_FOLLOW);
167 	if (error == 0)
168 		error = nlookup(&nd);
169 	if (error == 0)
170 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &volume->devvp);
171 	nlookup_done(&nd);
172 	if (error == 0) {
173 		if (vn_isdisk(volume->devvp, &error)) {
174 			error = vfs_mountedon(volume->devvp);
175 		}
176 	}
177 	if (error == 0 &&
178 	    count_udev(volume->devvp->v_umajor, volume->devvp->v_uminor) > 0) {
179 		error = EBUSY;
180 	}
181 	if (error == 0) {
182 		vn_lock(volume->devvp, LK_EXCLUSIVE | LK_RETRY);
183 		error = vinvalbuf(volume->devvp, V_SAVE, 0, 0);
184 		if (error == 0) {
185 			error = VOP_OPEN(volume->devvp,
186 					 (ronly ? FREAD : FREAD|FWRITE),
187 					 FSCRED, NULL);
188 		}
189 		vn_unlock(volume->devvp);
190 	}
191 	if (error) {
192 		hammer_free_volume(volume);
193 		return(error);
194 	}
195 	volume->devvp->v_rdev->si_mountpoint = mp;
196 	setmp = 1;
197 
198 	/*
199 	 * Extract the volume number from the volume header and do various
200 	 * sanity checks.
201 	 */
202 	error = bread(volume->devvp, 0LL, HAMMER_BUFSIZE, &bp);
203 	if (error)
204 		goto late_failure;
205 	ondisk = (void *)bp->b_data;
206 	if (ondisk->vol_signature != HAMMER_FSBUF_VOLUME) {
207 		kprintf("hammer_mount: volume %s has an invalid header\n",
208 			volume->vol_name);
209 		error = EFTYPE;
210 		goto late_failure;
211 	}
212 	volume->vol_no = ondisk->vol_no;
213 	volume->buffer_base = ondisk->vol_buf_beg;
214 	volume->vol_flags = ondisk->vol_flags;
215 	volume->nblocks = ondisk->vol_nblocks;
216 	volume->maxbuf_off = HAMMER_ENCODE_RAW_BUFFER(volume->vol_no,
217 				    ondisk->vol_buf_end - ondisk->vol_buf_beg);
218 	volume->maxraw_off = ondisk->vol_buf_end;
219 	RB_INIT(&volume->rb_bufs_root);
220 
221 	if (RB_EMPTY(&hmp->rb_vols_root)) {
222 		hmp->fsid = ondisk->vol_fsid;
223 	} else if (bcmp(&hmp->fsid, &ondisk->vol_fsid, sizeof(uuid_t))) {
224 		kprintf("hammer_mount: volume %s's fsid does not match "
225 			"other volumes\n", volume->vol_name);
226 		error = EFTYPE;
227 		goto late_failure;
228 	}
229 
230 	/*
231 	 * Insert the volume structure into the red-black tree.
232 	 */
233 	if (RB_INSERT(hammer_vol_rb_tree, &hmp->rb_vols_root, volume)) {
234 		kprintf("hammer_mount: volume %s has a duplicate vol_no %d\n",
235 			volume->vol_name, volume->vol_no);
236 		error = EEXIST;
237 	}
238 
239 	/*
240 	 * Set the root volume.  HAMMER special-cases the root volume's
241 	 * structure.  We do not hold a ref because this would prevent
242 	 * related I/O from being flushed.
243 	 */
244 	if (error == 0 && ondisk->vol_rootvol == ondisk->vol_no) {
245 		hmp->rootvol = volume;
246 		if (bp) {
247 			brelse(bp);
248 			bp = NULL;
249 		}
250 		hmp->fsid_udev = dev2udev(vn_todev(volume->devvp));
251 		hmp->mp->mnt_stat.f_blocks += ondisk->vol0_stat_bigblocks *
252 			(HAMMER_LARGEBLOCK_SIZE / HAMMER_BUFSIZE);
253 		hmp->mp->mnt_vstat.f_blocks += ondisk->vol0_stat_bigblocks *
254 			(HAMMER_LARGEBLOCK_SIZE / HAMMER_BUFSIZE);
255 	}
256 late_failure:
257 	if (bp)
258 		brelse(bp);
259 	if (error) {
260 		/*vinvalbuf(volume->devvp, V_SAVE, 0, 0);*/
261 		if (setmp)
262 			volume->devvp->v_rdev->si_mountpoint = NULL;
263 		VOP_CLOSE(volume->devvp, ronly ? FREAD : FREAD|FWRITE);
264 		hammer_free_volume(volume);
265 	}
266 	return (error);
267 }
268 
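#if 0
/*
 * Illustrative only: the mount path installs each volume by name before
 * any hammer_get_volume() calls can succeed.  The name array, count and
 * loop variables below are hypothetical; error handling is abbreviated.
 */
for (i = 0; error == 0 && i < nvolumes; ++i)
	error = hammer_install_volume(hmp, volnames[i]);
#endif
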
269 /*
270  * This is called for each volume when updating the mount point from
271  * read-write to read-only or vice-versa.
272  */
273 int
274 hammer_adjust_volume_mode(hammer_volume_t volume, void *data __unused)
275 {
276 	if (volume->devvp) {
277 		vn_lock(volume->devvp, LK_EXCLUSIVE | LK_RETRY);
278 		if (volume->io.hmp->ronly) {
279 			/* do not call vinvalbuf */
280 			VOP_OPEN(volume->devvp, FREAD, FSCRED, NULL);
281 			VOP_CLOSE(volume->devvp, FREAD|FWRITE);
282 		} else {
283 			/* do not call vinvalbuf */
284 			VOP_OPEN(volume->devvp, FREAD|FWRITE, FSCRED, NULL);
285 			VOP_CLOSE(volume->devvp, FREAD);
286 		}
287 		vn_unlock(volume->devvp);
288 	}
289 	return(0);
290 }
291 
292 /*
293  * Unload and free a HAMMER volume.  Must return >= 0 to continue the
294  * scan, so this function always returns 0.
295  */
296 int
297 hammer_unload_volume(hammer_volume_t volume, void *data __unused)
298 {
299 	struct hammer_mount *hmp = volume->io.hmp;
300 	int ronly = ((hmp->mp->mnt_flag & MNT_RDONLY) ? 1 : 0);
301 
302 	/*
303 	 * Clean up the root volume pointer, which is held unlocked in hmp.
304 	 */
305 	if (hmp->rootvol == volume)
306 		hmp->rootvol = NULL;
307 
308 	/*
309 	 * Unload buffers.
310 	 */
311 	RB_SCAN(hammer_buf_rb_tree, &volume->rb_bufs_root, NULL,
312 			hammer_unload_buffer, NULL);
313 
314 	/*
315 	 * Release our buffer and flush anything left in the buffer cache.
316 	 */
317 	volume->io.waitdep = 1;
318 	hammer_io_release(&volume->io, 1);
319 
320 	/*
321 	 * There should be no references on the volume and all of its
322 	 * buffers should be gone.
323 	 */
324 	KKASSERT(volume->io.lock.refs == 0);
325 	KKASSERT(RB_EMPTY(&volume->rb_bufs_root));
326 
327 	volume->ondisk = NULL;
328 	if (volume->devvp) {
329 		if (volume->devvp->v_rdev &&
330 		    volume->devvp->v_rdev->si_mountpoint == hmp->mp
331 		) {
332 			volume->devvp->v_rdev->si_mountpoint = NULL;
333 		}
334 		if (ronly) {
335 			vinvalbuf(volume->devvp, 0, 0, 0);
336 			VOP_CLOSE(volume->devvp, FREAD);
337 		} else {
338 			vinvalbuf(volume->devvp, V_SAVE, 0, 0);
339 			VOP_CLOSE(volume->devvp, FREAD|FWRITE);
340 		}
341 	}
342 
343 	/*
344 	 * Destroy the structure
345 	 */
346 	RB_REMOVE(hammer_vol_rb_tree, &hmp->rb_vols_root, volume);
347 	hammer_free_volume(volume);
348 	return(0);
349 }
350 
351 static
352 void
353 hammer_free_volume(hammer_volume_t volume)
354 {
355 	if (volume->vol_name) {
356 		kfree(volume->vol_name, M_HAMMER);
357 		volume->vol_name = NULL;
358 	}
359 	if (volume->devvp) {
360 		vrele(volume->devvp);
361 		volume->devvp = NULL;
362 	}
363 	--hammer_count_volumes;
364 	kfree(volume, M_HAMMER);
365 }
366 
367 /*
368  * Get a HAMMER volume.  The volume must already exist.
369  */
370 hammer_volume_t
371 hammer_get_volume(struct hammer_mount *hmp, int32_t vol_no, int *errorp)
372 {
373 	struct hammer_volume *volume;
374 
375 	/*
376 	 * Locate the volume structure
377 	 */
378 	volume = RB_LOOKUP(hammer_vol_rb_tree, &hmp->rb_vols_root, vol_no);
379 	if (volume == NULL) {
380 		*errorp = ENOENT;
381 		return(NULL);
382 	}
383 	hammer_ref(&volume->io.lock);
384 
385 	/*
386 	 * Deal with on-disk info
387 	 */
388 	if (volume->ondisk == NULL || volume->io.loading) {
389 		*errorp = hammer_load_volume(volume);
390 		if (*errorp) {
391 			hammer_rel_volume(volume, 1);
392 			volume = NULL;
393 		}
394 	} else {
395 		*errorp = 0;
396 	}
397 	return(volume);
398 }
399 
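/*
 * Add a reference to an already-located volume, loading its on-disk
 * info if necessary.  On error the reference is dropped and the caller
 * must not use the volume.
 */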
400 int
401 hammer_ref_volume(hammer_volume_t volume)
402 {
403 	int error;
404 
405 	hammer_ref(&volume->io.lock);
406 
407 	/*
408 	 * Deal with on-disk info
409 	 */
410 	if (volume->ondisk == NULL || volume->io.loading) {
411 		error = hammer_load_volume(volume);
412 		if (error)
413 			hammer_rel_volume(volume, 1);
414 	} else {
415 		error = 0;
416 	}
417 	return (error);
418 }
419 
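/*
 * Acquire a reference to the mount's root volume.  The root volume is
 * installed at mount time and remains valid for the life of the mount,
 * so the lookup cannot fail.
 */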
420 hammer_volume_t
421 hammer_get_root_volume(struct hammer_mount *hmp, int *errorp)
422 {
423 	hammer_volume_t volume;
424 
425 	volume = hmp->rootvol;
426 	KKASSERT(volume != NULL);
427 	hammer_ref(&volume->io.lock);
428 
429 	/*
430 	 * Deal with on-disk info
431 	 */
432 	if (volume->ondisk == NULL || volume->io.loading) {
433 		*errorp = hammer_load_volume(volume);
434 		if (*errorp) {
435 			hammer_rel_volume(volume, 1);
436 			volume = NULL;
437 		}
438 	} else {
439 		*errorp = 0;
440 	}
441 	return (volume);
442 }
443 
444 /*
445  * Load a volume's on-disk information.  The volume must be referenced and
446  * not locked.  We temporarily acquire an exclusive lock to interlock
447  * against releases or multiple gets.
448  */
449 static int
450 hammer_load_volume(hammer_volume_t volume)
451 {
452 	int error;
453 
454 	++volume->io.loading;
455 	hammer_lock_ex(&volume->io.lock);
456 
457 	if (volume->ondisk == NULL) {
458 		error = hammer_io_read(volume->devvp, &volume->io,
459 				       volume->maxraw_off);
460 		if (error == 0)
461 			volume->ondisk = (void *)volume->io.bp->b_data;
462 	} else {
463 		error = 0;
464 	}
465 	--volume->io.loading;
466 	hammer_unlock(&volume->io.lock);
467 	return(error);
468 }
469 
470 /*
471  * Release a volume.  Call hammer_io_release on the last reference.  We have
472  * to acquire an exclusive lock to interlock against volume->ondisk tests
473  * in hammer_load_volume(), and hammer_io_release() also expects an exclusive
474  * lock to be held.
475  *
476  * Volumes are not unloaded from memory during normal operation.
477  */
478 void
479 hammer_rel_volume(hammer_volume_t volume, int flush)
480 {
481 	crit_enter();
482 	if (volume->io.lock.refs == 1) {
483 		++volume->io.loading;
484 		hammer_lock_ex(&volume->io.lock);
485 		if (volume->io.lock.refs == 1) {
486 			volume->ondisk = NULL;
487 			hammer_io_release(&volume->io, flush);
488 		}
489 		--volume->io.loading;
490 		hammer_unlock(&volume->io.lock);
491 	}
492 	hammer_unref(&volume->io.lock);
493 	crit_exit();
494 }
495 
496 /************************************************************************
497  *				BUFFERS					*
498  ************************************************************************
499  *
500  * Manage buffers.  Currently all blockmap-backed zones are translated
501  * to zone-2 buffer offsets.
502  */
503 hammer_buffer_t
504 hammer_get_buffer(hammer_mount_t hmp, hammer_off_t buf_offset,
505 		  int isnew, int *errorp)
506 {
507 	hammer_buffer_t buffer;
508 	hammer_volume_t volume;
509 	hammer_off_t	zoneX_offset;
510 	hammer_io_type_t iotype;
511 	int vol_no;
512 	int zone;
513 
514 	zoneX_offset = buf_offset;
515 	zone = HAMMER_ZONE_DECODE(buf_offset);
516 
517 	/*
518 	 * What is the buffer class?
519 	 */
520 	switch(zone) {
521 	case HAMMER_ZONE_LARGE_DATA_INDEX:
522 	case HAMMER_ZONE_SMALL_DATA_INDEX:
523 		iotype = HAMMER_STRUCTURE_DATA_BUFFER;
524 		break;
525 	case HAMMER_ZONE_UNDO_INDEX:
526 		iotype = HAMMER_STRUCTURE_UNDO_BUFFER;
527 		break;
528 	default:
529 		iotype = HAMMER_STRUCTURE_META_BUFFER;
530 		break;
531 	}
532 
533 	/*
534 	 * Handle blockmap offset translations
535 	 */
536 	if (zone >= HAMMER_ZONE_BTREE_INDEX) {
537 		buf_offset = hammer_blockmap_lookup(hmp, buf_offset, errorp);
538 		KKASSERT(*errorp == 0);
539 	} else if (zone == HAMMER_ZONE_UNDO_INDEX) {
540 		buf_offset = hammer_undo_lookup(hmp, buf_offset, errorp);
541 		KKASSERT(*errorp == 0);
542 	}
543 
544 	/*
545 	 * Locate the buffer given its zone-2 offset.
546 	 */
547 	buf_offset &= ~HAMMER_BUFMASK64;
548 	KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_RAW_BUFFER);
549 	vol_no = HAMMER_VOL_DECODE(buf_offset);
550 	volume = hammer_get_volume(hmp, vol_no, errorp);
551 	if (volume == NULL)
552 		return(NULL);
553 
554 	/*
555 	 * NOTE: buf_offset and maxbuf_off are both full zone-2 offset
556 	 * specifications.
557 	 */
558 	KKASSERT(buf_offset < volume->maxbuf_off);
559 
560 	/*
561 	 * Locate and lock the buffer structure, creating one if necessary.
562 	 */
563 again:
564 	buffer = RB_LOOKUP(hammer_buf_rb_tree, &volume->rb_bufs_root,
565 			   buf_offset);
566 	if (buffer == NULL) {
567 		++hammer_count_buffers;
568 		buffer = kmalloc(sizeof(*buffer), M_HAMMER, M_WAITOK|M_ZERO);
569 		buffer->zone2_offset = buf_offset;
570 		buffer->volume = volume;
571 
572 		hammer_io_init(&buffer->io, hmp, iotype);
573 		buffer->io.offset = volume->ondisk->vol_buf_beg +
574 				    (buf_offset & HAMMER_OFF_SHORT_MASK);
575 		TAILQ_INIT(&buffer->clist);
576 		hammer_ref(&buffer->io.lock);
577 
578 		/*
579 		 * Insert the buffer into the RB tree and handle late
580 		 * collisions.
581 		 */
582 		if (RB_INSERT(hammer_buf_rb_tree, &volume->rb_bufs_root, buffer)) {
583 			hammer_unref(&buffer->io.lock);
584 			--hammer_count_buffers;
585 			kfree(buffer, M_HAMMER);
586 			goto again;
587 		}
588 		hammer_ref(&volume->io.lock);
589 	} else {
590 		hammer_ref(&buffer->io.lock);
591 
592 		/*
593 		 * The buffer is no longer loose if it has a ref.
594 		 */
595 		if (buffer->io.mod_list == &hmp->lose_list) {
596 			TAILQ_REMOVE(buffer->io.mod_list, &buffer->io,
597 				     mod_entry);
598 			buffer->io.mod_list = NULL;
599 		}
600 		if (buffer->io.lock.refs == 1)
601 			hammer_io_reinit(&buffer->io, iotype);
602 		else
603 			KKASSERT(buffer->io.type == iotype);
604 	}
605 
606 	/*
607 	 * Cache the blockmap translation
608 	 */
609 	if ((zoneX_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_RAW_BUFFER)
610 		buffer->zoneX_offset = zoneX_offset;
611 
612 	/*
613 	 * Deal with on-disk info
614 	 */
615 	if (buffer->ondisk == NULL || buffer->io.loading) {
616 		*errorp = hammer_load_buffer(buffer, isnew);
617 		if (*errorp) {
618 			hammer_rel_buffer(buffer, 1);
619 			buffer = NULL;
620 		}
621 	} else {
622 		*errorp = 0;
623 	}
624 	hammer_rel_volume(volume, 0);
625 	return(buffer);
626 }
627 
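#if 0
/*
 * Illustrative usage (not compiled): the get/rel pairing for a
 * blockmap-backed offset.  buffer->ondisk is only valid while the
 * reference is held; all names below are hypothetical.
 */
buffer = hammer_get_buffer(hmp, buf_offset, 0, &error);
if (buffer) {
	/* ... read or modify buffer->ondisk ... */
	hammer_rel_buffer(buffer, 0);
}
#endif
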
628 /*
629  * Clear the cached zone-X translation for a buffer.
630  */
631 void
632 hammer_clrxlate_buffer(hammer_mount_t hmp, hammer_off_t buf_offset)
633 {
634 	hammer_buffer_t buffer;
635 	hammer_volume_t volume;
636 	int vol_no;
637 	int error;
638 
639 	buf_offset &= ~HAMMER_BUFMASK64;
640 	KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_RAW_BUFFER);
641 	vol_no = HAMMER_VOL_DECODE(buf_offset);
642 	volume = hammer_get_volume(hmp, vol_no, &error);
643 	if (volume == NULL)
644 		return;
645 	buffer = RB_LOOKUP(hammer_buf_rb_tree, &volume->rb_bufs_root,
646 			   buf_offset);
647 	if (buffer)
648 		buffer->zoneX_offset = 0;
649 	hammer_rel_volume(volume, 0);
650 }
651 
652 static int
653 hammer_load_buffer(hammer_buffer_t buffer, int isnew)
654 {
655 	hammer_volume_t volume;
656 	int error;
657 
658 	/*
659 	 * Load the buffer's on-disk info
660 	 */
661 	volume = buffer->volume;
662 	++buffer->io.loading;
663 	hammer_lock_ex(&buffer->io.lock);
664 
665 	if (hammer_debug_io & 0x0001) {
666 		kprintf("load_buffer %016llx %016llx\n",
667 			buffer->zoneX_offset, buffer->zone2_offset);
668 	}
669 
670 	if (buffer->ondisk == NULL) {
671 		if (isnew) {
672 			error = hammer_io_new(volume->devvp, &buffer->io);
673 		} else {
674 			error = hammer_io_read(volume->devvp, &buffer->io,
675 					       volume->maxraw_off);
676 		}
677 		if (error == 0)
678 			buffer->ondisk = (void *)buffer->io.bp->b_data;
679 	} else if (isnew) {
680 		error = hammer_io_new(volume->devvp, &buffer->io);
681 	} else {
682 		error = 0;
683 	}
684 	--buffer->io.loading;
685 	hammer_unlock(&buffer->io.lock);
686 	return (error);
687 }
688 
689 /*
690  * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
691  */
692 int
693 hammer_unload_buffer(hammer_buffer_t buffer, void *data __unused)
694 {
695 	hammer_ref(&buffer->io.lock);
696 	hammer_flush_buffer_nodes(buffer);
697 	KKASSERT(buffer->io.lock.refs == 1);
698 	hammer_rel_buffer(buffer, 2);
699 	return(0);
700 }
701 
702 /*
703  * Reference a buffer that is either already referenced or reachable via
704  * a specially handled pointer (aka cursor->buffer).
705  */
706 int
707 hammer_ref_buffer(hammer_buffer_t buffer)
708 {
709 	int error;
710 
711 	hammer_ref(&buffer->io.lock);
712 
713 	/*
714 	 * No longer loose
715 	 */
716 	if (buffer->io.mod_list == &buffer->io.hmp->lose_list) {
717 		TAILQ_REMOVE(buffer->io.mod_list, &buffer->io, mod_entry);
718 		buffer->io.mod_list = NULL;
719 	}
720 
721 	if (buffer->ondisk == NULL || buffer->io.loading) {
722 		error = hammer_load_buffer(buffer, 0);
723 		if (error) {
724 			hammer_rel_buffer(buffer, 1);
725 			/*
726 			 * NOTE: buffer pointer can become stale after
727 			 * the above release.
728 			 */
729 		}
730 	} else {
731 		error = 0;
732 	}
733 	return(error);
734 }
735 
736 /*
737  * Release a buffer.  We have to deal with several places where
738  * another thread can ref the buffer.
739  *
740  * Only destroy the structure itself if the related buffer cache buffer
741  * was disassociated from it.  This ties the management of the structure
742  * to the buffer cache subsystem.  buffer->ondisk determines whether the
743  * embedded io is referenced or not.
744  */
745 void
746 hammer_rel_buffer(hammer_buffer_t buffer, int flush)
747 {
748 	hammer_volume_t volume;
749 	int freeme = 0;
750 
751 	crit_enter();
752 	if (buffer->io.lock.refs == 1) {
753 		++buffer->io.loading;	/* force interlock check */
754 		hammer_lock_ex(&buffer->io.lock);
755 		if (buffer->io.lock.refs == 1) {
756 			hammer_io_release(&buffer->io, flush);
757 			hammer_flush_buffer_nodes(buffer);
758 			KKASSERT(TAILQ_EMPTY(&buffer->clist));
759 
760 			if (buffer->io.bp == NULL &&
761 			    buffer->io.lock.refs == 1) {
762 				/*
763 				 * Final cleanup
764 				 */
765 				volume = buffer->volume;
766 				RB_REMOVE(hammer_buf_rb_tree,
767 					  &volume->rb_bufs_root, buffer);
768 				buffer->volume = NULL; /* sanity */
769 				hammer_rel_volume(volume, 0);
770 				freeme = 1;
771 			}
772 		}
773 		--buffer->io.loading;
774 		hammer_unlock(&buffer->io.lock);
775 	}
776 	hammer_unref(&buffer->io.lock);
777 	crit_exit();
778 	if (freeme) {
779 		KKASSERT(buffer->io.mod_list == NULL);
780 		--hammer_count_buffers;
781 		kfree(buffer, M_HAMMER);
782 	}
783 }
784 
785 /*
786  * Remove the zoneX translation cache for a buffer given its zone-2 offset.
787  */
788 void
789 hammer_uncache_buffer(hammer_mount_t hmp, hammer_off_t buf_offset)
790 {
791 	hammer_volume_t volume;
792 	hammer_buffer_t buffer;
793 	int vol_no;
794 	int error;
795 
796 	buf_offset &= ~HAMMER_BUFMASK64;
797 	KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_RAW_BUFFER);
798 	vol_no = HAMMER_VOL_DECODE(buf_offset);
799 	volume = hammer_get_volume(hmp, vol_no, &error);
800 	KKASSERT(volume != NULL);
801 	KKASSERT(buf_offset < volume->maxbuf_off);
802 
803 	buffer = RB_LOOKUP(hammer_buf_rb_tree, &volume->rb_bufs_root,
804 			   buf_offset);
805 	if (buffer)
806 		buffer->zoneX_offset = 0;
807 	hammer_rel_volume(volume, 0);
808 }
809 
810 /*
811  * Access the filesystem buffer containing the specified hammer offset.
812  * buf_offset is a conglomeration of the volume number and vol_buf_beg
813  * relative buffer offset.  It must also have bit 55 set to be valid.
814  * (see hammer_off_t in hammer_disk.h).
815  *
816  * Any prior buffer in *bufferp will be released and replaced by the
817  * requested buffer.
818  */
819 void *
820 hammer_bread(hammer_mount_t hmp, hammer_off_t buf_offset, int *errorp,
821 	     struct hammer_buffer **bufferp)
822 {
823 	hammer_buffer_t buffer;
824 	int32_t xoff = (int32_t)buf_offset & HAMMER_BUFMASK;
825 
826 	buf_offset &= ~HAMMER_BUFMASK64;
827 	KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) != 0);
828 
829 	buffer = *bufferp;
830 	if (buffer == NULL || (buffer->zone2_offset != buf_offset &&
831 			       buffer->zoneX_offset != buf_offset)) {
832 		if (buffer)
833 			hammer_rel_buffer(buffer, 0);
834 		buffer = hammer_get_buffer(hmp, buf_offset, 0, errorp);
835 		*bufferp = buffer;
836 	} else {
837 		*errorp = 0;
838 	}
839 
840 	/*
841 	 * Return a pointer to the buffer data.
842 	 */
843 	if (buffer == NULL)
844 		return(NULL);
845 	else
846 		return((char *)buffer->ondisk + xoff);
847 }
848 
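#if 0
/*
 * Illustrative usage (not compiled): hammer_bread() is designed for
 * iteration.  The caller keeps one buffer pointer alive across calls
 * and releases it once at the end; successive offsets falling in the
 * same underlying buffer avoid a fresh lookup.  'rec_offset' is a
 * hypothetical zone-X offset.
 */
struct hammer_buffer *buffer = NULL;
void *data;
int error;

data = hammer_bread(hmp, rec_offset, &error, &buffer);
/* ... further hammer_bread() calls may reuse 'buffer' ... */
if (buffer)
	hammer_rel_buffer(buffer, 0);
#endif
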
849 /*
850  * Access the filesystem buffer containing the specified hammer offset.
851  * No disk read operation occurs.  The result buffer may contain garbage.
852  *
853  * Any prior buffer in *bufferp will be released and replaced by the
854  * requested buffer.
855  *
856  * This function marks the buffer dirty but does not increment its
857  * modify_refs count.
858  */
859 void *
860 hammer_bnew(hammer_mount_t hmp, hammer_off_t buf_offset, int *errorp,
861 	     struct hammer_buffer **bufferp)
862 {
863 	hammer_buffer_t buffer;
864 	int32_t xoff = (int32_t)buf_offset & HAMMER_BUFMASK;
865 
866 	buf_offset &= ~HAMMER_BUFMASK64;
867 
868 	buffer = *bufferp;
869 	if (buffer == NULL || (buffer->zone2_offset != buf_offset &&
870 			       buffer->zoneX_offset != buf_offset)) {
871 		if (buffer)
872 			hammer_rel_buffer(buffer, 0);
873 		buffer = hammer_get_buffer(hmp, buf_offset, 1, errorp);
874 		*bufferp = buffer;
875 	} else {
876 		*errorp = 0;
877 	}
878 
879 	/*
880 	 * Return a pointer to the buffer data.
881 	 */
882 	if (buffer == NULL)
883 		return(NULL);
884 	else
885 		return((char *)buffer->ondisk + xoff);
886 }
887 
888 /************************************************************************
889  *				NODES					*
890  ************************************************************************
891  *
892  * Manage B-Tree nodes.  B-Tree nodes represent the primary indexing
893  * method used by the HAMMER filesystem.
894  *
895  * Unlike other HAMMER structures, a hammer_node can be PASSIVELY
896  * associated with its buffer, and will only reference the buffer while
897  * the node itself is referenced.
898  *
899  * A hammer_node can also be passively associated with other HAMMER
900  * structures, such as inodes, while retaining 0 references.  These
901  * associations can be cleared backwards using a pointer-to-pointer in
902  * the hammer_node.
903  *
904  * This allows the HAMMER implementation to cache hammer_nodes long-term
905  * and short-cut a great deal of the infrastructure's complexity.  In
906  * most cases a cached node can be reacquired without having to dip into
907  * the buffer management code.
908  *
909  * hammer_get_node() returns a node referenced for the caller, who
910  * retains ownership of that reference.  The node acquires its own
911  * additional references (e.g. to its backing buffer) as necessary.
912  */
913 hammer_node_t
914 hammer_get_node(hammer_mount_t hmp, hammer_off_t node_offset,
915 		int isnew, int *errorp)
916 {
917 	hammer_node_t node;
918 
919 	KKASSERT((node_offset & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_BTREE);
920 
921 	/*
922 	 * Locate the structure, allocating one if necessary.
923 	 */
924 again:
925 	node = RB_LOOKUP(hammer_nod_rb_tree, &hmp->rb_nods_root, node_offset);
926 	if (node == NULL) {
927 		++hammer_count_nodes;
928 		node = kmalloc(sizeof(*node), M_HAMMER, M_WAITOK|M_ZERO);
929 		node->node_offset = node_offset;
930 		node->hmp = hmp;
931 		if (RB_INSERT(hammer_nod_rb_tree, &hmp->rb_nods_root, node)) {
932 			--hammer_count_nodes;
933 			kfree(node, M_HAMMER);
934 			goto again;
935 		}
936 	}
937 	hammer_ref(&node->lock);
938 	if (node->ondisk)
939 		*errorp = 0;
940 	else
941 		*errorp = hammer_load_node(node, isnew);
942 	if (*errorp) {
943 		hammer_rel_node(node);
944 		node = NULL;
945 	}
946 	return(node);
947 }
948 
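#if 0
/*
 * Illustrative usage (not compiled): resolve a node, record it in a
 * long-lived passive cache pointer, then drop the hard reference.
 * 'cache_ptr' is hypothetical; inodes keep similar pointers.
 */
node = hammer_get_node(hmp, node_offset, 0, &error);
if (node) {
	hammer_cache_node(node, &cache_ptr);
	hammer_rel_node(node);
	/* later: node = hammer_ref_node_safe(hmp, &cache_ptr, &error); */
}
#endif
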
949 /*
950  * Reference an already-referenced node.
951  */
952 void
953 hammer_ref_node(hammer_node_t node)
954 {
955 	KKASSERT(node->lock.refs > 0 && node->ondisk != NULL);
956 	hammer_ref(&node->lock);
957 }
958 
959 /*
960  * Load a node's on-disk data reference.
961  */
962 static int
963 hammer_load_node(hammer_node_t node, int isnew)
964 {
965 	hammer_buffer_t buffer;
966 	hammer_off_t buf_offset;
967 	int error;
968 
969 	error = 0;
970 	++node->loading;
971 	hammer_lock_ex(&node->lock);
972 	if (node->ondisk == NULL) {
973 		/*
974 		 * This is a little confusing but the gist is that
975 		 * node->buffer determines whether the node is on
976 		 * the buffer's clist and node->ondisk determines
977 		 * whether the buffer is referenced.
978 		 *
979 		 * We could be racing a buffer release, in which case
980 		 * node->buffer may become NULL while we are blocked
981 		 * referencing the buffer.
982 		 */
983 		if ((buffer = node->buffer) != NULL) {
984 			error = hammer_ref_buffer(buffer);
985 			if (error == 0 && node->buffer == NULL) {
986 				TAILQ_INSERT_TAIL(&buffer->clist,
987 						  node, entry);
988 				node->buffer = buffer;
989 			}
990 		} else {
991 			buf_offset = node->node_offset & ~HAMMER_BUFMASK64;
992 			buffer = hammer_get_buffer(node->hmp, buf_offset,
993 						   0, &error);
994 			if (buffer) {
995 				KKASSERT(error == 0);
996 				TAILQ_INSERT_TAIL(&buffer->clist,
997 						  node, entry);
998 				node->buffer = buffer;
999 			}
1000 		}
1001 		if (error == 0) {
1002 			node->ondisk = (void *)((char *)buffer->ondisk +
1003 			       (node->node_offset & HAMMER_BUFMASK));
1004 			if (isnew == 0 &&
1005 			    hammer_crc_test_btree(node->ondisk) == 0) {
1006 				Debugger("CRC FAILED: B-TREE NODE");
1007 			}
1008 		}
1009 	}
1010 	--node->loading;
1011 	hammer_unlock(&node->lock);
1012 	return (error);
1013 }
1014 
1015 /*
1016  * Safely reference a node, interlocking against flushes via the IO subsystem.
1017  */
1018 hammer_node_t
1019 hammer_ref_node_safe(struct hammer_mount *hmp, struct hammer_node **cache,
1020 		     int *errorp)
1021 {
1022 	hammer_node_t node;
1023 
1024 	node = *cache;
1025 	if (node != NULL) {
1026 		hammer_ref(&node->lock);
1027 		if (node->ondisk)
1028 			*errorp = 0;
1029 		else
1030 			*errorp = hammer_load_node(node, 0);
1031 		if (*errorp) {
1032 			hammer_rel_node(node);
1033 			node = NULL;
1034 		}
1035 	} else {
1036 		*errorp = ENOENT;
1037 	}
1038 	return(node);
1039 }
1040 
1041 /*
1042  * Release a hammer_node.  On the last release the node dereferences
1043  * its underlying buffer and may or may not be destroyed.
1044  */
1045 void
1046 hammer_rel_node(hammer_node_t node)
1047 {
1048 	hammer_buffer_t buffer;
1049 
1050 	/*
1051 	 * If this isn't the last ref just decrement the ref count and
1052 	 * return.
1053 	 */
1054 	if (node->lock.refs > 1) {
1055 		hammer_unref(&node->lock);
1056 		return;
1057 	}
1058 
1059 	/*
1060 	 * If there is no ondisk info the node failed to load (or was never
1061 	 * loaded); remove the last reference and destroy the node.
1062 	 */
1063 	if (node->ondisk == NULL) {
1064 		hammer_unref(&node->lock);
1065 		hammer_flush_node(node);
1066 		/* node is stale now */
1067 		return;
1068 	}
1069 
1070 	/*
1071 	 * Do final cleanups and then either destroy the node or leave it
1072 	 * passively cached.  The buffer reference is removed regardless.
1073 	 */
1074 	buffer = node->buffer;
1075 	node->ondisk = NULL;
1076 
1077 	if ((node->flags & HAMMER_NODE_FLUSH) == 0) {
1078 		hammer_unref(&node->lock);
1079 		hammer_rel_buffer(buffer, 0);
1080 		return;
1081 	}
1082 
1083 	/*
1084 	 * Destroy the node.
1085 	 */
1086 	hammer_unref(&node->lock);
1087 	hammer_flush_node(node);
1088 	/* node is stale */
1089 	hammer_rel_buffer(buffer, 0);
1090 }
1091 
1092 /*
1093  * Free space on-media associated with a B-Tree node.
1094  */
1095 void
1096 hammer_delete_node(hammer_transaction_t trans, hammer_node_t node)
1097 {
1098 	KKASSERT((node->flags & HAMMER_NODE_DELETED) == 0);
1099 	node->flags |= HAMMER_NODE_DELETED;
1100 	hammer_blockmap_free(trans, node->node_offset, sizeof(*node->ondisk));
1101 }
1102 
1103 /*
1104  * Passively cache a referenced hammer_node in *cache.  The caller may
1105  * release the node on return.
1106  */
1107 void
1108 hammer_cache_node(hammer_node_t node, struct hammer_node **cache)
1109 {
1110 	hammer_node_t old;
1111 
1112 	/*
1113 	 * If the node is being deleted, don't cache it!
1114 	 */
1115 	if (node->flags & HAMMER_NODE_DELETED)
1116 		return;
1117 
1118 	/*
1119 	 * Cache the node.  If we previously cached a different node we
1120 	 * have to give HAMMER a chance to destroy it.
1121 	 */
1122 again:
1123 	if (node->cache1 != cache) {
1124 		if (node->cache2 != cache) {
1125 			if ((old = *cache) != NULL) {
1126 				KKASSERT(node->lock.refs != 0);
1127 				hammer_uncache_node(cache);
1128 				goto again;
1129 			}
1130 			if (node->cache2)
1131 				*node->cache2 = NULL;
1132 			node->cache2 = node->cache1;
1133 			node->cache1 = cache;
1134 			*cache = node;
1135 		} else {
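			/*
			 * The cache pointer already occupies the second
			 * slot; swap the slots so the most recently used
			 * association winds up in cache1.
			 */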
1136 			struct hammer_node **tmp;
1137 			tmp = node->cache1;
1138 			node->cache1 = node->cache2;
1139 			node->cache2 = tmp;
1140 		}
1141 	}
1142 }
1143 
1144 void
1145 hammer_uncache_node(struct hammer_node **cache)
1146 {
1147 	hammer_node_t node;
1148 
1149 	if ((node = *cache) != NULL) {
1150 		*cache = NULL;
1151 		if (node->cache1 == cache) {
1152 			node->cache1 = node->cache2;
1153 			node->cache2 = NULL;
1154 		} else if (node->cache2 == cache) {
1155 			node->cache2 = NULL;
1156 		} else {
1157 			panic("hammer_uncache_node: missing cache linkage");
1158 		}
1159 		if (node->cache1 == NULL && node->cache2 == NULL)
1160 			hammer_flush_node(node);
1161 	}
1162 }
1163 
1164 /*
1165  * Remove a node's cache references and destroy the node if it has no
1166  * other references or backing store.
1167  */
1168 void
1169 hammer_flush_node(hammer_node_t node)
1170 {
1171 	hammer_buffer_t buffer;
1172 
1173 	if (node->cache1)
1174 		*node->cache1 = NULL;
1175 	if (node->cache2)
1176 		*node->cache2 = NULL;
1177 	if (node->lock.refs == 0 && node->ondisk == NULL) {
1178 		RB_REMOVE(hammer_nod_rb_tree, &node->hmp->rb_nods_root, node);
1179 		if ((buffer = node->buffer) != NULL) {
1180 			node->buffer = NULL;
1181 			TAILQ_REMOVE(&buffer->clist, node, entry);
1182 			/* buffer is unreferenced because ondisk is NULL */
1183 		}
1184 		--hammer_count_nodes;
1185 		kfree(node, M_HAMMER);
1186 	}
1187 }
1188 
1189 /*
1190  * Flush passively cached B-Tree nodes associated with this buffer.
1191  * This is only called when the buffer is about to be destroyed, so
1192  * none of the nodes should have any references.  The buffer is locked.
1193  *
1194  * We may be interlocked with the buffer.
1195  */
1196 void
1197 hammer_flush_buffer_nodes(hammer_buffer_t buffer)
1198 {
1199 	hammer_node_t node;
1200 
1201 	while ((node = TAILQ_FIRST(&buffer->clist)) != NULL) {
1202 		KKASSERT(node->ondisk == NULL);
1203 
1204 		if (node->lock.refs == 0) {
1205 			hammer_ref(&node->lock);
1206 			node->flags |= HAMMER_NODE_FLUSH;
1207 			hammer_rel_node(node);
1208 		} else {
1209 			KKASSERT(node->loading != 0);
1210 			KKASSERT(node->buffer != NULL);
1211 			buffer = node->buffer;
1212 			node->buffer = NULL;
1213 			TAILQ_REMOVE(&buffer->clist, node, entry);
1214 			/* buffer is unreferenced because ondisk is NULL */
1215 		}
1216 	}
1217 }
1218 
1219 
1220 /************************************************************************
1221  *				ALLOCATORS				*
1222  ************************************************************************/
1223 
1224 /*
1225  * Allocate a B-Tree node.
1226  */
1227 hammer_node_t
1228 hammer_alloc_btree(hammer_transaction_t trans, int *errorp)
1229 {
1231 	hammer_node_t node = NULL;
1232 	hammer_off_t node_offset;
1233 
1234 	node_offset = hammer_blockmap_alloc(trans, HAMMER_ZONE_BTREE_INDEX,
1235 					    sizeof(struct hammer_node_ondisk),
1236 					    errorp);
1237 	if (*errorp == 0) {
1238 		node = hammer_get_node(trans->hmp, node_offset, 1, errorp);
1239 		hammer_modify_node_noundo(trans, node);
1240 		bzero(node->ondisk, sizeof(*node->ondisk));
1241 		hammer_modify_node_done(node);
1242 	}
1245 	return(node);
1246 }
1247 
1248 #if 0
1249 
1250 /*
1251  * The returned buffers are already appropriately marked as being modified.
1252  * If the caller marks them again, unnecessary undo records may be generated.
1253  *
1254  * In-band data is indicated by data_bufferp == NULL.  Pass a data_len of 0
1255  * for zero-fill (the caller modifies data_len afterwards).
1256  *
1257  * The caller is responsible for calling hammer_modify_*() prior to making
1258  * any additional modifications to either the returned record buffer or the
1259  * returned data buffer.
1260  */
1261 void *
1262 hammer_alloc_record(hammer_transaction_t trans,
1263 		    hammer_off_t *rec_offp, u_int16_t rec_type,
1264 		    struct hammer_buffer **rec_bufferp,
1265 		    int32_t data_len, void **datap,
1266 		    hammer_off_t *data_offp,
1267 		    struct hammer_buffer **data_bufferp, int *errorp)
1268 {
1269 	hammer_record_ondisk_t rec;
1270 	hammer_off_t rec_offset;
1271 	hammer_off_t data_offset;
1272 	int32_t reclen;
1273 
1274 	if (datap)
1275 		*datap = NULL;
1276 
1277 	/*
1278 	 * Allocate the record
1279 	 */
1280 	rec_offset = hammer_blockmap_alloc(trans, HAMMER_ZONE_RECORD_INDEX,
1281 					   HAMMER_RECORD_SIZE, errorp);
1282 	if (*errorp)
1283 		return(NULL);
1284 	if (data_offp)
1285 		*data_offp = 0;
1286 
1287 	/*
1288 	 * Allocate data
1289 	 */
1290 	if (data_len) {
1291 		if (data_bufferp == NULL) {
1292 			switch(rec_type) {
1293 			case HAMMER_RECTYPE_DATA:
1294 				reclen = offsetof(struct hammer_data_record,
1295 						  data[0]);
1296 				break;
1297 			case HAMMER_RECTYPE_DIRENTRY:
1298 				reclen = offsetof(struct hammer_entry_record,
1299 						  name[0]);
1300 				break;
1301 			default:
1302 				panic("hammer_alloc_record: illegal "
1303 				      "in-band data");
1304 				/* NOT REACHED */
1305 				reclen = 0;
1306 				break;
1307 			}
1308 			KKASSERT(reclen + data_len <= HAMMER_RECORD_SIZE);
1309 			data_offset = rec_offset + reclen;
1310 		} else if (data_len < HAMMER_BUFSIZE) {
1311 			data_offset = hammer_blockmap_alloc(trans,
1312 						HAMMER_ZONE_SMALL_DATA_INDEX,
1313 						data_len, errorp);
1314 			*data_offp = data_offset;
1315 		} else {
1316 			data_offset = hammer_blockmap_alloc(trans,
1317 						HAMMER_ZONE_LARGE_DATA_INDEX,
1318 						data_len, errorp);
1319 			*data_offp = data_offset;
1320 		}
1321 	} else {
1322 		data_offset = 0;
1323 	}
1324 	if (*errorp) {
1325 		hammer_blockmap_free(trans, rec_offset, HAMMER_RECORD_SIZE);
1326 		return(NULL);
1327 	}
1328 
1329 	/*
1330 	 * Basic return values.
1331 	 *
1332 	 * Note that because this is a 'new' buffer, there is no need to
1333 	 * generate UNDO records for it.
1334 	 */
1335 	*rec_offp = rec_offset;
1336 	rec = hammer_bread(trans->hmp, rec_offset, errorp, rec_bufferp);
1337 	hammer_modify_buffer(trans, *rec_bufferp, NULL, 0);
1338 	bzero(rec, sizeof(*rec));
1339 	KKASSERT(*errorp == 0);
1340 	rec->base.data_off = data_offset;
1341 	rec->base.data_len = data_len;
1342 	hammer_modify_buffer_done(*rec_bufferp);
1343 
1344 	if (data_bufferp) {
1345 		if (data_len) {
1346 			*datap = hammer_bread(trans->hmp, data_offset, errorp,
1347 					      data_bufferp);
1348 			KKASSERT(*errorp == 0);
1349 		} else {
1350 			*datap = NULL;
1351 		}
1352 	} else if (data_len) {
1353 		KKASSERT(data_offset + data_len - rec_offset <=
1354 			 HAMMER_RECORD_SIZE);
1355 		if (datap) {
1356 			*datap = (void *)((char *)rec +
1357 					  (int32_t)(data_offset - rec_offset));
1358 		}
1359 	} else {
1360 		KKASSERT(datap == NULL);
1361 	}
1362 	KKASSERT(*errorp == 0);
1363 	return(rec);
1364 }
1365 
1366 #endif
1367 
1368 /*
1369  * Allocate data.  If the address of a data buffer is supplied then
1370  * any prior non-NULL *data_bufferp will be released and *data_bufferp
1371  * will be set to the related buffer.  The caller must release it when
1372  * finally done.  The initial *data_bufferp should be set to NULL by
1373  * the caller.
1374  *
1375  * The caller is responsible for making hammer_modify*() calls on the
1376  * *data_bufferp.
1377  */
1378 void *
1379 hammer_alloc_data(hammer_transaction_t trans, int32_t data_len,
1380 		  hammer_off_t *data_offsetp,
1381 		  struct hammer_buffer **data_bufferp, int *errorp)
1382 {
1383 	void *data;
1384 
1385 	/*
1386 	 * Allocate data
1387 	 */
1388 	if (data_len) {
1389 		if (data_len < HAMMER_BUFSIZE) {
1390 			*data_offsetp = hammer_blockmap_alloc(trans,
1391 						HAMMER_ZONE_SMALL_DATA_INDEX,
1392 						data_len, errorp);
1393 		} else {
1394 			*data_offsetp = hammer_blockmap_alloc(trans,
1395 						HAMMER_ZONE_LARGE_DATA_INDEX,
1396 						data_len, errorp);
1397 		}
1398 	} else {
1399 		*data_offsetp = 0;
1400 	}
1401 	if (*errorp == 0 && data_bufferp) {
1402 		if (data_len) {
1403 			data = hammer_bread(trans->hmp, *data_offsetp, errorp,
1404 					    data_bufferp);
1405 			KKASSERT(*errorp == 0);
1406 		} else {
1407 			data = NULL;
1408 		}
1409 	} else {
1410 		data = NULL;
1411 	}
1412 	KKASSERT(*errorp == 0);
1413 	return(data);
1414 }
1415 
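#if 0
/*
 * Illustrative usage (not compiled): allocate record data, then mark
 * the backing buffer modified before writing into it, as required
 * above.  'src' and the lengths are hypothetical.
 */
struct hammer_buffer *data_buffer = NULL;
hammer_off_t data_offset;
void *data;
int error;

data = hammer_alloc_data(trans, data_len, &data_offset,
			 &data_buffer, &error);
if (error == 0 && data) {
	hammer_modify_buffer(trans, data_buffer, NULL, 0);
	bcopy(src, data, data_len);
	hammer_modify_buffer_done(data_buffer);
}
if (data_buffer)
	hammer_rel_buffer(data_buffer, 0);
#endif
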
1416 /*
1417  * Sync dirty buffers to the media and clean up any loose ends.
1418  */
1419 static int hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data);
1420 static int hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data);
1421 
1422 int
1423 hammer_queue_inodes_flusher(hammer_mount_t hmp, int waitfor)
1424 {
1425 	struct hammer_sync_info info;
1426 
1427 	info.error = 0;
1428 	info.waitfor = waitfor;
1429 	if (waitfor == MNT_WAIT) {
1430 		vmntvnodescan(hmp->mp, VMSC_GETVP|VMSC_ONEPASS,
1431 			      hammer_sync_scan1, hammer_sync_scan2, &info);
1432 	} else {
1433 		vmntvnodescan(hmp->mp, VMSC_GETVP|VMSC_ONEPASS|VMSC_NOWAIT,
1434 			      hammer_sync_scan1, hammer_sync_scan2, &info);
1435 	}
1436 	return(info.error);
1437 }
1438 
1439 int
1440 hammer_sync_hmp(hammer_mount_t hmp, int waitfor)
1441 {
1442 	struct hammer_sync_info info;
1443 
1444 	info.error = 0;
1445 	info.waitfor = waitfor;
1446 
1447 	vmntvnodescan(hmp->mp, VMSC_GETVP|VMSC_NOWAIT,
1448 		      hammer_sync_scan1, hammer_sync_scan2, &info);
1449 	if (waitfor == MNT_WAIT)
1450 		hammer_flusher_sync(hmp);
1451 	else
1452 		hammer_flusher_async(hmp);
1453 
1454 	return(info.error);
1455 }
1456 
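/*
 * Fast pass: called without the vnode locked.  Reject vnodes that
 * clearly have nothing to sync so the locked pass can skip them.
 */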
1457 static int
1458 hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data)
1459 {
1460 	struct hammer_inode *ip;
1461 
1462 	ip = VTOI(vp);
1463 	if (vp->v_type == VNON || ip == NULL ||
1464 	    ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
1465 	     RB_EMPTY(&vp->v_rbdirty_tree))) {
1466 		return(-1);
1467 	}
1468 	return(0);
1469 }
1470 
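/*
 * Slow pass: called with the vnode held via VMSC_GETVP.  fsync the
 * vnode and record any error for the caller.
 */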
1471 static int
1472 hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
1473 {
1474 	struct hammer_sync_info *info = data;
1475 	struct hammer_inode *ip;
1476 	int error;
1477 
1478 	ip = VTOI(vp);
1479 	if (vp->v_type == VNON || vp->v_type == VBAD || ip == NULL ||
1480 	    ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
1481 	     RB_EMPTY(&vp->v_rbdirty_tree))) {
1482 		return(0);
1483 	}
1484 	error = VOP_FSYNC(vp, info->waitfor);
1485 	if (error)
1486 		info->error = error;
1487 	return(0);
1488 }
1489 
1490