xref: /dragonfly/sys/vfs/hammer/hammer_ondisk.c (revision fb5b3747)
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.76 2008/08/29 20:19:08 dillon Exp $
35  */
36 /*
37  * Manage HAMMER's on-disk structures.  These routines are primarily
38  * responsible for interfacing with the kernel's I/O subsystem and for
39  * managing in-memory structures.
40  */
41 
42 #include "hammer.h"
43 #include <sys/fcntl.h>
44 #include <sys/nlookup.h>
45 #include <sys/buf.h>
46 #include <sys/buf2.h>
47 
48 static void hammer_free_volume(hammer_volume_t volume);
49 static int hammer_load_volume(hammer_volume_t volume);
50 static int hammer_load_buffer(hammer_buffer_t buffer, int isnew);
51 static int hammer_load_node(hammer_transaction_t trans,
52 				hammer_node_t node, int isnew);
53 static void _hammer_rel_node(hammer_node_t node, int locked);
54 
55 static int
56 hammer_vol_rb_compare(hammer_volume_t vol1, hammer_volume_t vol2)
57 {
58 	if (vol1->vol_no < vol2->vol_no)
59 		return(-1);
60 	if (vol1->vol_no > vol2->vol_no)
61 		return(1);
62 	return(0);
63 }
64 
65 /*
66  * hammer_buffer structures are indexed via their zoneX_offset, not
67  * their zone2_offset.
68  */
69 static int
70 hammer_buf_rb_compare(hammer_buffer_t buf1, hammer_buffer_t buf2)
71 {
72 	if (buf1->zoneX_offset < buf2->zoneX_offset)
73 		return(-1);
74 	if (buf1->zoneX_offset > buf2->zoneX_offset)
75 		return(1);
76 	return(0);
77 }
78 
79 static int
80 hammer_nod_rb_compare(hammer_node_t node1, hammer_node_t node2)
81 {
82 	if (node1->node_offset < node2->node_offset)
83 		return(-1);
84 	if (node1->node_offset > node2->node_offset)
85 		return(1);
86 	return(0);
87 }
88 
89 RB_GENERATE2(hammer_vol_rb_tree, hammer_volume, rb_node,
90 	     hammer_vol_rb_compare, int32_t, vol_no);
91 RB_GENERATE2(hammer_buf_rb_tree, hammer_buffer, rb_node,
92 	     hammer_buf_rb_compare, hammer_off_t, zoneX_offset);
93 RB_GENERATE2(hammer_nod_rb_tree, hammer_node, rb_node,
94 	     hammer_nod_rb_compare, hammer_off_t, node_offset);
95 
96 /************************************************************************
97  *				VOLUMES					*
98  ************************************************************************
99  *
100  * Load a HAMMER volume by name.  Returns 0 on success or a positive error
101  * code on failure.  Volumes must be loaded at mount time; get_volume() will
102  * not load a new volume.
103  *
104  * The passed devvp is vref()'d but not locked.  This function consumes the
105  * ref (typically by associating it with the volume structure).
106  *
107  * Calls made to hammer_load_volume() are single-threaded.
108  */
109 int
110 hammer_install_volume(struct hammer_mount *hmp, const char *volname,
111 		      struct vnode *devvp)
112 {
113 	struct mount *mp;
114 	hammer_volume_t volume;
115 	struct hammer_volume_ondisk *ondisk;
116 	struct nlookupdata nd;
117 	struct buf *bp = NULL;
118 	int error;
119 	int ronly;
120 	int setmp = 0;
121 
122 	mp = hmp->mp;
123 	ronly = ((mp->mnt_flag & MNT_RDONLY) ? 1 : 0);
124 
125 	/*
126 	 * Allocate a volume structure
127 	 */
128 	++hammer_count_volumes;
129 	volume = kmalloc(sizeof(*volume), hmp->m_misc, M_WAITOK|M_ZERO);
130 	volume->vol_name = kstrdup(volname, hmp->m_misc);
131 	volume->io.hmp = hmp;	/* bootstrap */
132 	hammer_io_init(&volume->io, volume, HAMMER_STRUCTURE_VOLUME);
133 	volume->io.offset = 0LL;
134 	volume->io.bytes = HAMMER_BUFSIZE;
135 
136 	/*
137 	 * Get the device vnode
138 	 */
139 	if (devvp == NULL) {
140 		error = nlookup_init(&nd, volume->vol_name, UIO_SYSSPACE, NLC_FOLLOW);
141 		if (error == 0)
142 			error = nlookup(&nd);
143 		if (error == 0)
144 			error = cache_vref(&nd.nl_nch, nd.nl_cred, &volume->devvp);
145 		nlookup_done(&nd);
146 	} else {
147 		error = 0;
148 		volume->devvp = devvp;
149 	}
150 
151 	if (error == 0) {
152 		if (vn_isdisk(volume->devvp, &error)) {
153 			error = vfs_mountedon(volume->devvp);
154 		}
155 	}
156 	if (error == 0 && vcount(volume->devvp) > 0)
157 		error = EBUSY;
158 	if (error == 0) {
159 		vn_lock(volume->devvp, LK_EXCLUSIVE | LK_RETRY);
160 		error = vinvalbuf(volume->devvp, V_SAVE, 0, 0);
161 		if (error == 0) {
162 			error = VOP_OPEN(volume->devvp,
163 					 (ronly ? FREAD : FREAD|FWRITE),
164 					 FSCRED, NULL);
165 		}
166 		vn_unlock(volume->devvp);
167 	}
168 	if (error) {
169 		hammer_free_volume(volume);
170 		return(error);
171 	}
172 	volume->devvp->v_rdev->si_mountpoint = mp;
173 	setmp = 1;
174 
175 	/*
176 	 * Extract the volume number from the volume header and do various
177 	 * sanity checks.
178 	 */
179 	error = bread(volume->devvp, 0LL, HAMMER_BUFSIZE, &bp);
180 	if (error)
181 		goto late_failure;
182 	ondisk = (void *)bp->b_data;
183 	if (ondisk->vol_signature != HAMMER_FSBUF_VOLUME) {
184 		kprintf("hammer_mount: volume %s has an invalid header\n",
185 			volume->vol_name);
186 		error = EFTYPE;
187 		goto late_failure;
188 	}
189 	volume->vol_no = ondisk->vol_no;
190 	volume->buffer_base = ondisk->vol_buf_beg;
191 	volume->vol_flags = ondisk->vol_flags;
192 	volume->nblocks = ondisk->vol_nblocks;
193 	volume->maxbuf_off = HAMMER_ENCODE_RAW_BUFFER(volume->vol_no,
194 				    ondisk->vol_buf_end - ondisk->vol_buf_beg);
195 	volume->maxraw_off = ondisk->vol_buf_end;
196 
197 	if (RB_EMPTY(&hmp->rb_vols_root)) {
198 		hmp->fsid = ondisk->vol_fsid;
199 	} else if (bcmp(&hmp->fsid, &ondisk->vol_fsid, sizeof(uuid_t))) {
200 		kprintf("hammer_mount: volume %s's fsid does not match "
201 			"other volumes\n", volume->vol_name);
202 		error = EFTYPE;
203 		goto late_failure;
204 	}
205 
206 	/*
207 	 * Insert the volume structure into the red-black tree.
208 	 */
209 	if (RB_INSERT(hammer_vol_rb_tree, &hmp->rb_vols_root, volume)) {
210 		kprintf("hammer_mount: volume %s has a duplicate vol_no %d\n",
211 			volume->vol_name, volume->vol_no);
212 		error = EEXIST;
213 	}
214 
215 	/*
216 	 * Set the root volume.  HAMMER special-cases the rootvol structure.
217 	 * We do not hold a ref because this would prevent related I/O
218 	 * from being flushed.
219 	 */
220 	if (error == 0 && ondisk->vol_rootvol == ondisk->vol_no) {
221 		hmp->rootvol = volume;
222 		hmp->nvolumes = ondisk->vol_count;
223 		if (bp) {
224 			brelse(bp);
225 			bp = NULL;
226 		}
227 		hmp->mp->mnt_stat.f_blocks += ondisk->vol0_stat_bigblocks *
228 			(HAMMER_LARGEBLOCK_SIZE / HAMMER_BUFSIZE);
229 		hmp->mp->mnt_vstat.f_blocks += ondisk->vol0_stat_bigblocks *
230 			(HAMMER_LARGEBLOCK_SIZE / HAMMER_BUFSIZE);
231 	}
232 late_failure:
233 	if (bp)
234 		brelse(bp);
235 	if (error) {
236 		/*vinvalbuf(volume->devvp, V_SAVE, 0, 0);*/
237 		if (setmp)
238 			volume->devvp->v_rdev->si_mountpoint = NULL;
239 		VOP_CLOSE(volume->devvp, ronly ? FREAD : FREAD|FWRITE);
240 		hammer_free_volume(volume);
241 	}
242 	return (error);
243 }
244 
245 /*
246  * This is called for each volume when updating the mount point from
247  * read-write to read-only or vice-versa.
248  */
249 int
250 hammer_adjust_volume_mode(hammer_volume_t volume, void *data __unused)
251 {
252 	if (volume->devvp) {
253 		vn_lock(volume->devvp, LK_EXCLUSIVE | LK_RETRY);
254 		if (volume->io.hmp->ronly) {
255 			/* do not call vinvalbuf */
256 			VOP_OPEN(volume->devvp, FREAD, FSCRED, NULL);
257 			VOP_CLOSE(volume->devvp, FREAD|FWRITE);
258 		} else {
259 			/* do not call vinvalbuf */
260 			VOP_OPEN(volume->devvp, FREAD|FWRITE, FSCRED, NULL);
261 			VOP_CLOSE(volume->devvp, FREAD);
262 		}
263 		vn_unlock(volume->devvp);
264 	}
265 	return(0);
266 }
267 
268 /*
269  * Unload and free a HAMMER volume.  Must return >= 0 to continue the
270  * scan; this routine always returns 0 (a negative return would abort it).
271  */
272 int
273 hammer_unload_volume(hammer_volume_t volume, void *data __unused)
274 {
275 	hammer_mount_t hmp = volume->io.hmp;
276 	int ronly = ((hmp->mp->mnt_flag & MNT_RDONLY) ? 1 : 0);
277 
278 	/*
279 	 * Clean up the root volume pointer, which is held unlocked in hmp.
280 	 */
281 	if (hmp->rootvol == volume)
282 		hmp->rootvol = NULL;
283 
284 	/*
285 	 * We must not flush a dirty buffer to disk on umount.  It should
286 	 * have already been dealt with by the flusher, or we may be in
287 	 * catastrophic failure.
288 	 */
289 	hammer_io_clear_modify(&volume->io, 1);
290 	volume->io.waitdep = 1;
291 
292 	/*
293 	 * Clean up the persistent ref ioerror might have on the volume
294 	 */
295 	if (volume->io.ioerror)
296 		hammer_io_clear_error_noassert(&volume->io);
297 
298 	/*
299 	 * This should release the bp.  Releasing the volume with flush set
300 	 * implies the interlock is set.
301 	 */
302 	hammer_ref_interlock_true(&volume->io.lock);
303 	hammer_rel_volume(volume, 1);
304 	KKASSERT(volume->io.bp == NULL);
305 
306 	/*
307 	 * There should be no references on the volume, no clusters, and
308 	 * no super-clusters.
309 	 */
310 	KKASSERT(hammer_norefs(&volume->io.lock));
311 
312 	volume->ondisk = NULL;
313 	if (volume->devvp) {
314 		if (volume->devvp->v_rdev &&
315 		    volume->devvp->v_rdev->si_mountpoint == hmp->mp
316 		) {
317 			volume->devvp->v_rdev->si_mountpoint = NULL;
318 		}
319 		if (ronly) {
320 			/*
321 			 * Make sure we don't sync anything to disk if we
322 			 * are in read-only mode (1) or critically-errored
323 			 * (2).  Note that there may be dirty buffers in
324 			 * normal read-only mode from crash recovery.
325 			 */
326 			vinvalbuf(volume->devvp, 0, 0, 0);
327 			VOP_CLOSE(volume->devvp, FREAD);
328 		} else {
329 			/*
330 			 * Normal termination, save any dirty buffers
331 			 * (XXX there really shouldn't be any).
332 			 */
333 			vinvalbuf(volume->devvp, V_SAVE, 0, 0);
334 			VOP_CLOSE(volume->devvp, FREAD|FWRITE);
335 		}
336 	}
337 
338 	/*
339 	 * Destroy the structure
340 	 */
341 	RB_REMOVE(hammer_vol_rb_tree, &hmp->rb_vols_root, volume);
342 	hammer_free_volume(volume);
343 	return(0);
344 }
345 
346 static
347 void
348 hammer_free_volume(hammer_volume_t volume)
349 {
350 	hammer_mount_t hmp = volume->io.hmp;
351 
352 	if (volume->vol_name) {
353 		kfree(volume->vol_name, hmp->m_misc);
354 		volume->vol_name = NULL;
355 	}
356 	if (volume->devvp) {
357 		vrele(volume->devvp);
358 		volume->devvp = NULL;
359 	}
360 	--hammer_count_volumes;
361 	kfree(volume, hmp->m_misc);
362 }
363 
364 /*
365  * Get a HAMMER volume.  The volume must already exist.
366  */
367 hammer_volume_t
368 hammer_get_volume(struct hammer_mount *hmp, int32_t vol_no, int *errorp)
369 {
370 	struct hammer_volume *volume;
371 
372 	/*
373 	 * Locate the volume structure
374 	 */
375 	volume = RB_LOOKUP(hammer_vol_rb_tree, &hmp->rb_vols_root, vol_no);
376 	if (volume == NULL) {
377 		*errorp = ENOENT;
378 		return(NULL);
379 	}
380 
381 	/*
382 	 * Reference the volume, load/check the data on the 0->1 transition.
383 	 * hammer_load_volume() will dispose of the interlock on return,
384 	 * and also clean up the ref count on error.
385 	 */
386 	if (hammer_ref_interlock(&volume->io.lock)) {
387 		*errorp = hammer_load_volume(volume);
388 		if (*errorp)
389 			volume = NULL;
390 	} else {
391 		KKASSERT(volume->ondisk);
392 		*errorp = 0;
393 	}
394 	return(volume);
395 }
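
/*
 * Illustrative sketch (hypothetical caller; the names hmp and vol_no are
 * assumed) of the reference pattern described above, pairing
 * hammer_get_volume() with hammer_rel_volume():
 *
 *	hammer_volume_t volume;
 *	int error;
 *
 *	volume = hammer_get_volume(hmp, vol_no, &error);
 *	if (volume) {
 *		... volume->ondisk remains valid while the ref is held ...
 *		hammer_rel_volume(volume, 0);
 *	}
 */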
396 
397 int
398 hammer_ref_volume(hammer_volume_t volume)
399 {
400 	int error;
401 
402 	/*
403 	 * Reference the volume and deal with the check condition used to
404 	 * load its ondisk info.
405 	 */
406 	if (hammer_ref_interlock(&volume->io.lock)) {
407 		error = hammer_load_volume(volume);
408 	} else {
409 		KKASSERT(volume->ondisk);
410 		error = 0;
411 	}
412 	return (error);
413 }
414 
415 hammer_volume_t
416 hammer_get_root_volume(struct hammer_mount *hmp, int *errorp)
417 {
418 	hammer_volume_t volume;
419 
420 	volume = hmp->rootvol;
421 	KKASSERT(volume != NULL);
422 
423 	/*
424 	 * Reference the volume and deal with the check condition used to
425 	 * load its ondisk info.
426 	 */
427 	if (hammer_ref_interlock(&volume->io.lock)) {
428 		*errorp = hammer_load_volume(volume);
429 		if (*errorp)
430 			volume = NULL;
431 	} else {
432 		KKASSERT(volume->ondisk);
433 		*errorp = 0;
434 	}
435 	return (volume);
436 }
437 
438 /*
439  * Load a volume's on-disk information.  The volume must be referenced and
440  * the interlock is held on call.  The interlock will be released on return.
441  * The reference will also be released on return if an error occurs.
442  */
443 static int
444 hammer_load_volume(hammer_volume_t volume)
445 {
446 	int error;
447 
448 	if (volume->ondisk == NULL) {
449 		error = hammer_io_read(volume->devvp, &volume->io,
450 				       HAMMER_BUFSIZE);
451 		if (error == 0) {
452 			volume->ondisk = (void *)volume->io.bp->b_data;
453 			hammer_ref_interlock_done(&volume->io.lock);
454 		} else {
455 			hammer_rel_volume(volume, 1);
456 		}
457 	} else {
458 		error = 0;
459 	}
460 	return(error);
461 }
462 
463 /*
464  * Release a previously acquired reference on the volume.
465  *
466  * Volumes are not unloaded from memory during normal operation.
467  */
468 void
469 hammer_rel_volume(hammer_volume_t volume, int locked)
470 {
471 	struct buf *bp;
472 
473 	if (hammer_rel_interlock(&volume->io.lock, locked)) {
474 		volume->ondisk = NULL;
475 		bp = hammer_io_release(&volume->io, locked);
476 		hammer_rel_interlock_done(&volume->io.lock, locked);
477 		if (bp)
478 			brelse(bp);
479 	}
480 }
481 
482 int
483 hammer_mountcheck_volumes(struct hammer_mount *hmp)
484 {
485 	hammer_volume_t vol;
486 	int i;
487 
488 	for (i = 0; i < hmp->nvolumes; ++i) {
489 		vol = RB_LOOKUP(hammer_vol_rb_tree, &hmp->rb_vols_root, i);
490 		if (vol == NULL)
491 			return(EINVAL);
492 	}
493 	return(0);
494 }
495 
496 /************************************************************************
497  *				BUFFERS					*
498  ************************************************************************
499  *
500  * Manage buffers.  Currently most blockmap-backed zones are direct-mapped
501  * to zone-2 buffer offsets, without a translation stage.  However, the
502  * hammer_buffer structure is indexed by its zoneX_offset, not its
503  * zone2_offset.
504  *
505  * The proper zone must be maintained throughout the code-base all the way
506  * through to the big-block allocator, or routines like hammer_del_buffers()
507  * will not be able to locate all potentially conflicting buffers.
508  */
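
/*
 * To make the indexing rule concrete, a minimal sketch (hypothetical
 * data_offset, mirroring the masking done in hammer_get_buffer() below):
 * the RB-tree key is the buffer-aligned zoneX offset, the zone is decoded
 * from its high bits, and the key is never translated to zone-2 before
 * the lookup.
 *
 *	hammer_off_t buf_offset = data_offset & ~HAMMER_BUFMASK64;
 *	int zone = HAMMER_ZONE_DECODE(buf_offset);
 *	hammer_buffer_t buffer;
 *
 *	buffer = RB_LOOKUP(hammer_buf_rb_tree, &hmp->rb_bufs_root,
 *			   buf_offset);
 */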
509 
510 /*
511  * Helper function returns whether a zone offset can be directly translated
512  * to a raw buffer index or not.  Really only the volume and undo zones
513  * can't be directly translated.  Volumes are special-cased and undo zones
514  * shouldn't be aliased accessed in read-only mode.
515  *
516  * This function is ONLY used to detect aliased zones during a read-only
517  * mount.
518  */
519 static __inline int
520 hammer_direct_zone(hammer_off_t buf_offset)
521 {
522 	switch(HAMMER_ZONE_DECODE(buf_offset)) {
523 	case HAMMER_ZONE_RAW_BUFFER_INDEX:
524 	case HAMMER_ZONE_FREEMAP_INDEX:
525 	case HAMMER_ZONE_BTREE_INDEX:
526 	case HAMMER_ZONE_META_INDEX:
527 	case HAMMER_ZONE_LARGE_DATA_INDEX:
528 	case HAMMER_ZONE_SMALL_DATA_INDEX:
529 		return(1);
530 	default:
531 		return(0);
532 	}
533 	/* NOT REACHED */
534 }
535 
536 hammer_buffer_t
537 hammer_get_buffer(hammer_mount_t hmp, hammer_off_t buf_offset,
538 		  int bytes, int isnew, int *errorp)
539 {
540 	hammer_buffer_t buffer;
541 	hammer_volume_t volume;
542 	hammer_off_t	zone2_offset;
543 	hammer_io_type_t iotype;
544 	int vol_no;
545 	int zone;
546 
547 	buf_offset &= ~HAMMER_BUFMASK64;
548 again:
549 	/*
550 	 * Shortcut if the buffer is already cached
551 	 */
552 	buffer = RB_LOOKUP(hammer_buf_rb_tree, &hmp->rb_bufs_root, buf_offset);
553 	if (buffer) {
554 		/*
555 		 * Once refed the ondisk field will not be cleared by
556 		 * any other action.  Shortcut the operation if the
557 		 * ondisk structure is valid.
558 		 */
559 found_aliased:
560 		if (hammer_ref_interlock(&buffer->io.lock) == 0) {
561 			hammer_io_advance(&buffer->io);
562 			KKASSERT(buffer->ondisk);
563 			*errorp = 0;
564 			return(buffer);
565 		}
566 
567 		/*
568 		 * 0->1 transition or deferred 0->1 transition (CHECK),
569 		 * interlock now held.  Shortcut if ondisk is already
570 		 * assigned.
571 		 */
572 		++hammer_count_refedbufs;
573 		if (buffer->ondisk) {
574 			hammer_io_advance(&buffer->io);
575 			hammer_ref_interlock_done(&buffer->io.lock);
576 			*errorp = 0;
577 			return(buffer);
578 		}
579 
580 		/*
581 		 * The buffer is no longer loose if it has a ref, and
582 		 * cannot become loose once it gains a ref.  Loose
583 		 * buffers will never be in a modified state.  This should
584 		 * only occur on the 0->1 transition of refs.
585 		 *
586 		 * lose_list can be modified via a biodone() interrupt
587 		 * so the io_token must be held.
588 		 */
589 		if (buffer->io.mod_root == &hmp->lose_root) {
590 			lwkt_gettoken(&hmp->io_token);
591 			if (buffer->io.mod_root == &hmp->lose_root) {
592 				RB_REMOVE(hammer_mod_rb_tree,
593 					  buffer->io.mod_root, &buffer->io);
594 				buffer->io.mod_root = NULL;
595 				KKASSERT(buffer->io.modified == 0);
596 			}
597 			lwkt_reltoken(&hmp->io_token);
598 		}
599 		goto found;
600 	} else if (hmp->ronly && hammer_direct_zone(buf_offset)) {
601 		/*
602 		 * If this is a read-only mount there could be an alias
603 		 * in the raw-zone.  If there is we use that buffer instead.
604 		 *
605 		 * rw mounts will not have aliases.  Also note when going
606 		 * from ro -> rw the recovered raw buffers are flushed and
607 		 * reclaimed, so again there will not be any aliases once
608 		 * the mount is rw.
609 		 */
610 		buffer = RB_LOOKUP(hammer_buf_rb_tree, &hmp->rb_bufs_root,
611 				   (buf_offset & ~HAMMER_OFF_ZONE_MASK) |
612 				   HAMMER_ZONE_RAW_BUFFER);
613 		if (buffer) {
614 			kprintf("HAMMER: recovered aliased %016jx\n",
615 				(intmax_t)buf_offset);
616 			goto found_aliased;
617 		}
618 	}
619 
620 	/*
621 	 * What is the buffer class?
622 	 */
623 	zone = HAMMER_ZONE_DECODE(buf_offset);
624 
625 	switch(zone) {
626 	case HAMMER_ZONE_LARGE_DATA_INDEX:
627 	case HAMMER_ZONE_SMALL_DATA_INDEX:
628 		iotype = HAMMER_STRUCTURE_DATA_BUFFER;
629 		break;
630 	case HAMMER_ZONE_UNDO_INDEX:
631 		iotype = HAMMER_STRUCTURE_UNDO_BUFFER;
632 		break;
633 	case HAMMER_ZONE_META_INDEX:
634 	default:
635 		/*
636 		 * NOTE: inode data and directory entries are placed in this
637 		 * zone.  inode atime/mtime is updated in-place and thus
638 		 * buffers containing inodes must be synchronized as
639 		 * meta-buffers, same as buffers containing B-Tree info.
640 		 */
641 		iotype = HAMMER_STRUCTURE_META_BUFFER;
642 		break;
643 	}
644 
645 	/*
646 	 * Handle blockmap offset translations
647 	 */
648 	if (zone >= HAMMER_ZONE_BTREE_INDEX) {
649 		zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, errorp);
650 	} else if (zone == HAMMER_ZONE_UNDO_INDEX) {
651 		zone2_offset = hammer_undo_lookup(hmp, buf_offset, errorp);
652 	} else {
653 		KKASSERT(zone == HAMMER_ZONE_RAW_BUFFER_INDEX);
654 		zone2_offset = buf_offset;
655 		*errorp = 0;
656 	}
657 	if (*errorp)
658 		return(NULL);
659 
660 	/*
661 	 * NOTE: zone2_offset and maxbuf_off are both full zone-2 offset
662 	 * specifications.
663 	 */
664 	KKASSERT((zone2_offset & HAMMER_OFF_ZONE_MASK) ==
665 		 HAMMER_ZONE_RAW_BUFFER);
666 	vol_no = HAMMER_VOL_DECODE(zone2_offset);
667 	volume = hammer_get_volume(hmp, vol_no, errorp);
668 	if (volume == NULL)
669 		return(NULL);
670 
671 	KKASSERT(zone2_offset < volume->maxbuf_off);
672 
673 	/*
674 	 * Allocate a new buffer structure.  We will check for races later.
675 	 */
676 	++hammer_count_buffers;
677 	buffer = kmalloc(sizeof(*buffer), hmp->m_misc,
678 			 M_WAITOK|M_ZERO|M_USE_RESERVE);
679 	buffer->zone2_offset = zone2_offset;
680 	buffer->zoneX_offset = buf_offset;
681 
682 	hammer_io_init(&buffer->io, volume, iotype);
683 	buffer->io.offset = volume->ondisk->vol_buf_beg +
684 			    (zone2_offset & HAMMER_OFF_SHORT_MASK);
685 	buffer->io.bytes = bytes;
686 	TAILQ_INIT(&buffer->clist);
687 	hammer_ref_interlock_true(&buffer->io.lock);
688 
689 	/*
690 	 * Insert the buffer into the RB tree and handle late collisions.
691 	 */
692 	if (RB_INSERT(hammer_buf_rb_tree, &hmp->rb_bufs_root, buffer)) {
693 		hammer_rel_volume(volume, 0);
694 		buffer->io.volume = NULL;			/* safety */
695 		if (hammer_rel_interlock(&buffer->io.lock, 1))	/* safety */
696 			hammer_rel_interlock_done(&buffer->io.lock, 1);
697 		--hammer_count_buffers;
698 		kfree(buffer, hmp->m_misc);
699 		goto again;
700 	}
701 	++hammer_count_refedbufs;
702 found:
703 
704 	/*
705 	 * The buffer is referenced and interlocked.  Load the buffer
706 	 * if necessary.  hammer_load_buffer() deals with the interlock
707 	 * and, if an error is returned, also deals with the ref.
708 	 */
709 	if (buffer->ondisk == NULL) {
710 		*errorp = hammer_load_buffer(buffer, isnew);
711 		if (*errorp)
712 			buffer = NULL;
713 	} else {
714 		hammer_io_advance(&buffer->io);
715 		hammer_ref_interlock_done(&buffer->io.lock);
716 		*errorp = 0;
717 	}
718 	return(buffer);
719 }
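
/*
 * Usage sketch for hammer_get_buffer() above, assuming a hypothetical
 * caller referencing an existing buffer (isnew == 0); variable names are
 * invented for the example:
 *
 *	hammer_buffer_t buffer;
 *	int error;
 *
 *	buffer = hammer_get_buffer(hmp, buf_offset, HAMMER_BUFSIZE, 0, &error);
 *	if (buffer) {
 *		... access buffer->ondisk while the ref is held ...
 *		hammer_rel_buffer(buffer, 0);
 *	}
 *
 * Most callers use the hammer_bread()/hammer_bnew() helpers further below,
 * which manage a cached buffer pointer on the caller's behalf.
 */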
720 
721 /*
722  * This is used by the direct-read code to deal with large-data buffers
723  * created by the reblocker and mirror-write code.  The direct-read code
724  * bypasses the HAMMER buffer subsystem and so any aliased dirty or write-
725  * running hammer buffers must be fully synced to disk before we can issue
726  * the direct-read.
727  *
728  * This code path is not considered critical as only the reblocker and
729  * mirror-write code will create large-data buffers via the HAMMER buffer
730  * subsystem.  They do that because they operate at the B-Tree level and
731  * do not access the vnode/inode structures.
732  */
733 void
734 hammer_sync_buffers(hammer_mount_t hmp, hammer_off_t base_offset, int bytes)
735 {
736 	hammer_buffer_t buffer;
737 	int error;
738 
739 	KKASSERT((base_offset & HAMMER_OFF_ZONE_MASK) ==
740 		 HAMMER_ZONE_LARGE_DATA);
741 
742 	while (bytes > 0) {
743 		buffer = RB_LOOKUP(hammer_buf_rb_tree, &hmp->rb_bufs_root,
744 				   base_offset);
745 		if (buffer && (buffer->io.modified || buffer->io.running)) {
746 			error = hammer_ref_buffer(buffer);
747 			if (error == 0) {
748 				hammer_io_wait(&buffer->io);
749 				if (buffer->io.modified) {
750 					hammer_io_write_interlock(&buffer->io);
751 					hammer_io_flush(&buffer->io, 0);
752 					hammer_io_done_interlock(&buffer->io);
753 					hammer_io_wait(&buffer->io);
754 				}
755 				hammer_rel_buffer(buffer, 0);
756 			}
757 		}
758 		base_offset += HAMMER_BUFSIZE;
759 		bytes -= HAMMER_BUFSIZE;
760 	}
761 }
762 
763 /*
764  * Destroy all buffers covering the specified zoneX offset range.  This
765  * is called when the related blockmap layer2 entry is freed or when
766  * a direct write bypasses our buffer/buffer-cache subsystem.
767  *
768  * The buffers may be referenced by the caller itself.  Setting reclaim
769  * will cause the buffer to be destroyed when its ref count reaches zero.
770  *
771  * Return 0 on success, EAGAIN if some buffers could not be destroyed due
772  * to additional references held by other threads, or some other (typically
773  * fatal) error.
774  */
775 int
776 hammer_del_buffers(hammer_mount_t hmp, hammer_off_t base_offset,
777 		   hammer_off_t zone2_offset, int bytes,
778 		   int report_conflicts)
779 {
780 	hammer_buffer_t buffer;
781 	hammer_volume_t volume;
782 	int vol_no;
783 	int error;
784 	int ret_error;
785 
786 	vol_no = HAMMER_VOL_DECODE(zone2_offset);
787 	volume = hammer_get_volume(hmp, vol_no, &ret_error);
788 	KKASSERT(ret_error == 0);
789 
790 	while (bytes > 0) {
791 		buffer = RB_LOOKUP(hammer_buf_rb_tree, &hmp->rb_bufs_root,
792 				   base_offset);
793 		if (buffer) {
794 			error = hammer_ref_buffer(buffer);
795 			if (hammer_debug_general & 0x20000) {
796 				kprintf("hammer: delbufr %016jx "
797 					"rerr=%d 1ref=%d\n",
798 					(intmax_t)buffer->zoneX_offset,
799 					error,
800 					hammer_oneref(&buffer->io.lock));
801 			}
802 			if (error == 0 && !hammer_oneref(&buffer->io.lock)) {
803 				error = EAGAIN;
804 				hammer_rel_buffer(buffer, 0);
805 			}
806 			if (error == 0) {
807 				KKASSERT(buffer->zone2_offset == zone2_offset);
808 				hammer_io_clear_modify(&buffer->io, 1);
809 				buffer->io.reclaim = 1;
810 				buffer->io.waitdep = 1;
811 				KKASSERT(buffer->io.volume == volume);
812 				hammer_rel_buffer(buffer, 0);
813 			}
814 		} else {
815 			error = hammer_io_inval(volume, zone2_offset);
816 		}
817 		if (error) {
818 			ret_error = error;
819 			if (report_conflicts ||
820 			    (hammer_debug_general & 0x8000)) {
821 				kprintf("hammer_del_buffers: unable to "
822 					"invalidate %016llx buffer=%p rep=%d\n",
823 					(long long)base_offset,
824 					buffer, report_conflicts);
825 			}
826 		}
827 		base_offset += HAMMER_BUFSIZE;
828 		zone2_offset += HAMMER_BUFSIZE;
829 		bytes -= HAMMER_BUFSIZE;
830 	}
831 	hammer_rel_volume(volume, 0);
832 	return (ret_error);
833 }
834 
835 /*
836  * Given a referenced and interlocked buffer load/validate the data.
837  *
838  * The buffer interlock will be released on return.  If an error is
839  * returned the buffer reference will also be released (and the buffer
840  * pointer will thus be stale).
841  */
842 static int
843 hammer_load_buffer(hammer_buffer_t buffer, int isnew)
844 {
845 	hammer_volume_t volume;
846 	int error;
847 
848 	/*
849 	 * Load the buffer's on-disk info
850 	 */
851 	volume = buffer->io.volume;
852 
853 	if (hammer_debug_io & 0x0004) {
854 		kprintf("load_buffer %016llx %016llx isnew=%d od=%p\n",
855 			(long long)buffer->zoneX_offset,
856 			(long long)buffer->zone2_offset,
857 			isnew, buffer->ondisk);
858 	}
859 
860 	if (buffer->ondisk == NULL) {
861 		/*
862 		 * Issue the read or generate a new buffer.  When reading
863 		 * the limit argument controls any read-ahead clustering
864 		 * hammer_io_read() is allowed to do.
865 		 *
866 		 * We cannot read-ahead in the large-data zone and we cannot
867 		 * cross a largeblock boundary as the next largeblock might
868 		 * use a different buffer size.
869 		 */
870 		if (isnew) {
871 			error = hammer_io_new(volume->devvp, &buffer->io);
872 		} else if ((buffer->zoneX_offset & HAMMER_OFF_ZONE_MASK) ==
873 			   HAMMER_ZONE_LARGE_DATA) {
874 			error = hammer_io_read(volume->devvp, &buffer->io,
875 					       buffer->io.bytes);
876 		} else {
877 			hammer_off_t limit;
878 
879 			limit = (buffer->zone2_offset +
880 				 HAMMER_LARGEBLOCK_MASK64) &
881 				~HAMMER_LARGEBLOCK_MASK64;
882 			limit -= buffer->zone2_offset;
883 			error = hammer_io_read(volume->devvp, &buffer->io,
884 					       limit);
885 		}
886 		if (error == 0)
887 			buffer->ondisk = (void *)buffer->io.bp->b_data;
888 	} else if (isnew) {
889 		error = hammer_io_new(volume->devvp, &buffer->io);
890 	} else {
891 		error = 0;
892 	}
893 	if (error == 0) {
894 		hammer_io_advance(&buffer->io);
895 		hammer_ref_interlock_done(&buffer->io.lock);
896 	} else {
897 		hammer_rel_buffer(buffer, 1);
898 	}
899 	return (error);
900 }
901 
902 /*
903  * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
904  * This routine is only called during unmount or when a volume is
905  * removed.
906  *
907  * If data != NULL, it specifies a volume whose buffers should
908  * be unloaded.
909  */
910 int
911 hammer_unload_buffer(hammer_buffer_t buffer, void *data)
912 {
913 	struct hammer_volume *volume = (struct hammer_volume *) data;
914 
915 	/*
916 	 * If volume != NULL we are only interested in unloading buffers
917 	 * associated with a particular volume.
918 	 */
919 	if (volume != NULL && volume != buffer->io.volume)
920 		return 0;
921 
922 	/*
923 	 * Clean up the persistent ref ioerror might have on the buffer
924 	 * and acquire a ref.  Expect a 0->1 transition.
925 	 */
926 	if (buffer->io.ioerror) {
927 		hammer_io_clear_error_noassert(&buffer->io);
928 		--hammer_count_refedbufs;
929 	}
930 	hammer_ref_interlock_true(&buffer->io.lock);
931 	++hammer_count_refedbufs;
932 
933 	/*
934 	 * We must not flush a dirty buffer to disk on umount.  It should
935 	 * have already been dealt with by the flusher, or we may be in
936 	 * catastrophic failure.
937 	 *
938 	 * We must set waitdep to ensure that a running buffer is waited
939 	 * on and released prior to us trying to unload the volume.
940 	 */
941 	hammer_io_clear_modify(&buffer->io, 1);
942 	hammer_flush_buffer_nodes(buffer);
943 	buffer->io.waitdep = 1;
944 	hammer_rel_buffer(buffer, 1);
945 	return(0);
946 }
947 
948 /*
949  * Reference a buffer that is either already referenced or via a specially
950  * handled pointer (aka cursor->buffer).
951  */
952 int
953 hammer_ref_buffer(hammer_buffer_t buffer)
954 {
955 	hammer_mount_t hmp;
956 	int error;
957 	int locked;
958 
959 	/*
960 	 * Acquire a ref, plus the buffer will be interlocked on the
961 	 * 0->1 transition.
962 	 */
963 	locked = hammer_ref_interlock(&buffer->io.lock);
964 	hmp = buffer->io.hmp;
965 
966 	/*
967 	 * At this point a biodone() will not touch the buffer other than
968 	 * incidental bits.  However, lose_list can be modified via
969 	 * a biodone() interrupt.
970 	 *
971 	 * No longer loose.  lose_list requires the io_token.
972 	 */
973 	if (buffer->io.mod_root == &hmp->lose_root) {
974 		lwkt_gettoken(&hmp->io_token);
975 		if (buffer->io.mod_root == &hmp->lose_root) {
976 			RB_REMOVE(hammer_mod_rb_tree,
977 				  buffer->io.mod_root, &buffer->io);
978 			buffer->io.mod_root = NULL;
979 		}
980 		lwkt_reltoken(&hmp->io_token);
981 	}
982 
983 	if (locked) {
984 		++hammer_count_refedbufs;
985 		error = hammer_load_buffer(buffer, 0);
986 		/* NOTE: on error the buffer pointer is stale */
987 	} else {
988 		error = 0;
989 	}
990 	return(error);
991 }
992 
993 /*
994  * Release a reference on the buffer.  On the 1->0 transition the
995  * underlying IO will be released but the data reference is left
996  * cached.
997  *
998  * Only destroy the structure itself if the related buffer cache buffer
999  * was disassociated from it.  This ties the management of the structure
1000  * to the buffer cache subsystem.  buffer->ondisk determines whether the
1001  * embedded io is referenced or not.
1002  */
1003 void
1004 hammer_rel_buffer(hammer_buffer_t buffer, int locked)
1005 {
1006 	hammer_volume_t volume;
1007 	hammer_mount_t hmp;
1008 	struct buf *bp = NULL;
1009 	int freeme = 0;
1010 
1011 	hmp = buffer->io.hmp;
1012 
1013 	if (hammer_rel_interlock(&buffer->io.lock, locked) == 0)
1014 		return;
1015 
1016 	/*
1017 	 * hammer_count_refedbufs accounting.  Decrement if we are in
1018 	 * the error path or if CHECK is clear.
1019 	 *
1020 	 * If we are not in the error path and CHECK is set the caller
1021 	 * probably just did a hammer_ref() and didn't account for it,
1022 	 * so we don't account for the loss here.
1023 	 */
1024 	if (locked || (buffer->io.lock.refs & HAMMER_REFS_CHECK) == 0)
1025 		--hammer_count_refedbufs;
1026 
1027 	/*
1028 	 * If the caller locked us, or the normal release transitioned
1029 	 * from 1->0 (and acquired the lock), attempt to release the
1030 	 * io.  If the caller locked us we tell hammer_io_release()
1031 	 * to flush (which would be the unload or failure path).
1032 	 */
1033 	bp = hammer_io_release(&buffer->io, locked);
1034 
1035 	/*
1036 	 * If the buffer has no bp association and no refs we can destroy
1037 	 * it.
1038 	 *
1039 	 * NOTE: It is impossible for any associated B-Tree nodes to have
1040 	 * refs if the buffer has no additional refs.
1041 	 */
1042 	if (buffer->io.bp == NULL && hammer_norefs(&buffer->io.lock)) {
1043 		RB_REMOVE(hammer_buf_rb_tree,
1044 			  &buffer->io.hmp->rb_bufs_root,
1045 			  buffer);
1046 		volume = buffer->io.volume;
1047 		buffer->io.volume = NULL; /* sanity */
1048 		hammer_rel_volume(volume, 0);
1049 		hammer_io_clear_modlist(&buffer->io);
1050 		hammer_flush_buffer_nodes(buffer);
1051 		KKASSERT(TAILQ_EMPTY(&buffer->clist));
1052 		freeme = 1;
1053 	}
1054 
1055 	/*
1056 	 * Cleanup
1057 	 */
1058 	hammer_rel_interlock_done(&buffer->io.lock, locked);
1059 	if (bp)
1060 		brelse(bp);
1061 	if (freeme) {
1062 		--hammer_count_buffers;
1063 		kfree(buffer, hmp->m_misc);
1064 	}
1065 }
1066 
1067 /*
1068  * Access the filesystem buffer containing the specified hammer offset.
1069  * buf_offset is a conglomeration of the volume number and vol_buf_beg
1070  * relative buffer offset.  It must also have bit 55 set to be valid.
1071  * (see hammer_off_t in hammer_disk.h).
1072  *
1073  * Any prior buffer in *bufferp will be released and replaced by the
1074  * requested buffer.
1075  *
1076  * NOTE: The buffer is indexed via its zoneX_offset but we allow the
1077  * passed cached *bufferp to match against either zoneX or zone2.
1078  */
1079 static __inline
1080 void *
1081 _hammer_bread(hammer_mount_t hmp, hammer_off_t buf_offset, int bytes,
1082 	     int *errorp, struct hammer_buffer **bufferp)
1083 {
1084 	hammer_buffer_t buffer;
1085 	int32_t xoff = (int32_t)buf_offset & HAMMER_BUFMASK;
1086 
1087 	buf_offset &= ~HAMMER_BUFMASK64;
1088 	KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) != 0);
1089 
1090 	buffer = *bufferp;
1091 	if (buffer == NULL || (buffer->zone2_offset != buf_offset &&
1092 			       buffer->zoneX_offset != buf_offset)) {
1093 		if (buffer)
1094 			hammer_rel_buffer(buffer, 0);
1095 		buffer = hammer_get_buffer(hmp, buf_offset, bytes, 0, errorp);
1096 		*bufferp = buffer;
1097 	} else {
1098 		*errorp = 0;
1099 	}
1100 
1101 	/*
1102 	 * Return a pointer to the buffer data.
1103 	 */
1104 	if (buffer == NULL)
1105 		return(NULL);
1106 	else
1107 		return((char *)buffer->ondisk + xoff);
1108 }
1109 
1110 void *
1111 hammer_bread(hammer_mount_t hmp, hammer_off_t buf_offset,
1112 	     int *errorp, struct hammer_buffer **bufferp)
1113 {
1114 	return(_hammer_bread(hmp, buf_offset, HAMMER_BUFSIZE, errorp, bufferp));
1115 }
1116 
1117 void *
1118 hammer_bread_ext(hammer_mount_t hmp, hammer_off_t buf_offset, int bytes,
1119 	         int *errorp, struct hammer_buffer **bufferp)
1120 {
1121 	bytes = (bytes + HAMMER_BUFMASK) & ~HAMMER_BUFMASK;
1122 	return(_hammer_bread(hmp, buf_offset, bytes, errorp, bufferp));
1123 }
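
/*
 * Typical access pattern for the helpers above (hypothetical caller).  The
 * cached buffer pointer starts out NULL; intermediate buffers are swapped
 * out automatically as described for _hammer_bread(), so only the final
 * buffer needs to be released:
 *
 *	hammer_buffer_t buffer = NULL;
 *	void *ondisk;
 *	int error;
 *
 *	ondisk = hammer_bread(hmp, buf_offset, &error, &buffer);
 *	if (ondisk) {
 *		... read the on-disk data ...
 *	}
 *	if (buffer)
 *		hammer_rel_buffer(buffer, 0);
 */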
1124 
1125 /*
1126  * Access the filesystem buffer containing the specified hammer offset.
1127  * No disk read operation occurs.  The result buffer may contain garbage.
1128  *
1129  * Any prior buffer in *bufferp will be released and replaced by the
1130  * requested buffer.
1131  *
1132  * This function marks the buffer dirty but does not increment its
1133  * modify_refs count.
1134  */
1135 static __inline
1136 void *
1137 _hammer_bnew(hammer_mount_t hmp, hammer_off_t buf_offset, int bytes,
1138 	     int *errorp, struct hammer_buffer **bufferp)
1139 {
1140 	hammer_buffer_t buffer;
1141 	int32_t xoff = (int32_t)buf_offset & HAMMER_BUFMASK;
1142 
1143 	buf_offset &= ~HAMMER_BUFMASK64;
1144 
1145 	buffer = *bufferp;
1146 	if (buffer == NULL || (buffer->zone2_offset != buf_offset &&
1147 			       buffer->zoneX_offset != buf_offset)) {
1148 		if (buffer)
1149 			hammer_rel_buffer(buffer, 0);
1150 		buffer = hammer_get_buffer(hmp, buf_offset, bytes, 1, errorp);
1151 		*bufferp = buffer;
1152 	} else {
1153 		*errorp = 0;
1154 	}
1155 
1156 	/*
1157 	 * Return a pointer to the buffer data.
1158 	 */
1159 	if (buffer == NULL)
1160 		return(NULL);
1161 	else
1162 		return((char *)buffer->ondisk + xoff);
1163 }
1164 
1165 void *
1166 hammer_bnew(hammer_mount_t hmp, hammer_off_t buf_offset,
1167 	     int *errorp, struct hammer_buffer **bufferp)
1168 {
1169 	return(_hammer_bnew(hmp, buf_offset, HAMMER_BUFSIZE, errorp, bufferp));
1170 }
1171 
1172 void *
1173 hammer_bnew_ext(hammer_mount_t hmp, hammer_off_t buf_offset, int bytes,
1174 		int *errorp, struct hammer_buffer **bufferp)
1175 {
1176 	bytes = (bytes + HAMMER_BUFMASK) & ~HAMMER_BUFMASK;
1177 	return(_hammer_bnew(hmp, buf_offset, bytes, errorp, bufferp));
1178 }
1179 
1180 /************************************************************************
1181  *				NODES					*
1182  ************************************************************************
1183  *
1184  * Manage B-Tree nodes.  B-Tree nodes represent the primary indexing
1185  * method used by the HAMMER filesystem.
1186  *
1187  * Unlike other HAMMER structures, a hammer_node can be PASSIVELY
1188  * associated with its buffer, and will only referenced the buffer while
1189  * the node itself is referenced.
1190  *
1191  * A hammer_node can also be passively associated with other HAMMER
1192  * structures, such as inodes, while retaining 0 references.  These
1193  * associations can be cleared backwards using a pointer-to-pointer in
1194  * the hammer_node.
1195  *
1196  * This allows the HAMMER implementation to cache hammer_nodes long-term
1197  * and short-cut a great deal of the infrastructure's complexity.  In
1198  * most cases a cached node can be reacquired without having to dip into
1199  * either the buffer or cluster management code.
1200  *
1201  * The caller must pass a referenced cluster on call and will retain
1202  * ownership of the reference on return.  The node will acquire its own
1203  * additional references, if necessary.
1204  */
1205 hammer_node_t
1206 hammer_get_node(hammer_transaction_t trans, hammer_off_t node_offset,
1207 		int isnew, int *errorp)
1208 {
1209 	hammer_mount_t hmp = trans->hmp;
1210 	hammer_node_t node;
1211 	int doload;
1212 
1213 	KKASSERT((node_offset & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_BTREE);
1214 
1215 	/*
1216 	 * Locate the structure, allocating one if necessary.
1217 	 */
1218 again:
1219 	node = RB_LOOKUP(hammer_nod_rb_tree, &hmp->rb_nods_root, node_offset);
1220 	if (node == NULL) {
1221 		++hammer_count_nodes;
1222 		node = kmalloc(sizeof(*node), hmp->m_misc, M_WAITOK|M_ZERO|M_USE_RESERVE);
1223 		node->node_offset = node_offset;
1224 		node->hmp = hmp;
1225 		TAILQ_INIT(&node->cursor_list);
1226 		TAILQ_INIT(&node->cache_list);
1227 		if (RB_INSERT(hammer_nod_rb_tree, &hmp->rb_nods_root, node)) {
1228 			--hammer_count_nodes;
1229 			kfree(node, hmp->m_misc);
1230 			goto again;
1231 		}
1232 		doload = hammer_ref_interlock_true(&node->lock);
1233 	} else {
1234 		doload = hammer_ref_interlock(&node->lock);
1235 	}
1236 	if (doload) {
1237 		*errorp = hammer_load_node(trans, node, isnew);
1238 		trans->flags |= HAMMER_TRANSF_DIDIO;
1239 		if (*errorp)
1240 			node = NULL;
1241 	} else {
1242 		KKASSERT(node->ondisk);
1243 		*errorp = 0;
1244 		hammer_io_advance(&node->buffer->io);
1245 	}
1246 	return(node);
1247 }
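
/*
 * Usage sketch for hammer_get_node() above, assuming a hypothetical caller
 * with a valid transaction and the offset of an existing B-Tree node
 * (isnew == 0); names are invented for the example:
 *
 *	hammer_node_t node;
 *	int error;
 *
 *	node = hammer_get_node(trans, node_offset, 0, &error);
 *	if (node) {
 *		... node->ondisk is valid while the ref is held ...
 *		hammer_rel_node(node);
 *	}
 *
 * On error the returned pointer is NULL and no reference is held.
 */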
1248 
1249 /*
1250  * Reference an already-referenced node.  0->1 transitions should assert
1251  * so we do not have to deal with hammer_ref() setting CHECK.
1252  */
1253 void
1254 hammer_ref_node(hammer_node_t node)
1255 {
1256 	KKASSERT(hammer_isactive(&node->lock) && node->ondisk != NULL);
1257 	hammer_ref(&node->lock);
1258 }
1259 
1260 /*
1261  * Load a node's on-disk data reference.  Called with the node referenced
1262  * and interlocked.
1263  *
1264  * On return the node interlock will be unlocked.  If a non-zero error code
1265  * is returned the node will also be dereferenced (and the caller's pointer
1266  * will be stale).
1267  */
1268 static int
1269 hammer_load_node(hammer_transaction_t trans, hammer_node_t node, int isnew)
1270 {
1271 	hammer_buffer_t buffer;
1272 	hammer_off_t buf_offset;
1273 	int error;
1274 
1275 	error = 0;
1276 	if (node->ondisk == NULL) {
1277 		/*
1278 		 * This is a little confusing but the jist is that
1279 		 * This is a little confusing but the gist is that
1280 		 * the buffer's clist and node->ondisk determines
1281 		 * whether the buffer is referenced.
1282 		 *
1283 		 * We could be racing a buffer release, in which case
1284 		 * node->buffer may become NULL while we are blocked
1285 		 * referencing the buffer.
1286 		 */
1287 		if ((buffer = node->buffer) != NULL) {
1288 			error = hammer_ref_buffer(buffer);
1289 			if (error == 0 && node->buffer == NULL) {
1290 				TAILQ_INSERT_TAIL(&buffer->clist,
1291 						  node, entry);
1292 				node->buffer = buffer;
1293 			}
1294 		} else {
1295 			buf_offset = node->node_offset & ~HAMMER_BUFMASK64;
1296 			buffer = hammer_get_buffer(node->hmp, buf_offset,
1297 						   HAMMER_BUFSIZE, 0, &error);
1298 			if (buffer) {
1299 				KKASSERT(error == 0);
1300 				TAILQ_INSERT_TAIL(&buffer->clist,
1301 						  node, entry);
1302 				node->buffer = buffer;
1303 			}
1304 		}
1305 		if (error)
1306 			goto failed;
1307 		node->ondisk = (void *)((char *)buffer->ondisk +
1308 				        (node->node_offset & HAMMER_BUFMASK));
1309 
1310 		/*
1311 		 * Check CRC.  NOTE: Neither flag is set and the CRC is not
1312 		 * generated on new B-Tree nodes.
1313 		 */
1314 		if (isnew == 0 &&
1315 		    (node->flags & HAMMER_NODE_CRCANY) == 0) {
1316 			if (hammer_crc_test_btree(node->ondisk) == 0) {
1317 				if (hammer_debug_critical)
1318 					Debugger("CRC FAILED: B-TREE NODE");
1319 				node->flags |= HAMMER_NODE_CRCBAD;
1320 			} else {
1321 				node->flags |= HAMMER_NODE_CRCGOOD;
1322 			}
1323 		}
1324 	}
1325 	if (node->flags & HAMMER_NODE_CRCBAD) {
1326 		if (trans->flags & HAMMER_TRANSF_CRCDOM)
1327 			error = EDOM;
1328 		else
1329 			error = EIO;
1330 	}
1331 failed:
1332 	if (error) {
1333 		_hammer_rel_node(node, 1);
1334 	} else {
1335 		hammer_ref_interlock_done(&node->lock);
1336 	}
1337 	return (error);
1338 }
1339 
1340 /*
1341  * Safely reference a node, interlock against flushes via the IO subsystem.
1342  */
1343 hammer_node_t
1344 hammer_ref_node_safe(hammer_transaction_t trans, hammer_node_cache_t cache,
1345 		     int *errorp)
1346 {
1347 	hammer_node_t node;
1348 	int doload;
1349 
1350 	node = cache->node;
1351 	if (node != NULL) {
1352 		doload = hammer_ref_interlock(&node->lock);
1353 		if (doload) {
1354 			*errorp = hammer_load_node(trans, node, 0);
1355 			if (*errorp)
1356 				node = NULL;
1357 		} else {
1358 			KKASSERT(node->ondisk);
1359 			if (node->flags & HAMMER_NODE_CRCBAD) {
1360 				if (trans->flags & HAMMER_TRANSF_CRCDOM)
1361 					*errorp = EDOM;
1362 				else
1363 					*errorp = EIO;
1364 				_hammer_rel_node(node, 0);
1365 				node = NULL;
1366 			} else {
1367 				*errorp = 0;
1368 			}
1369 		}
1370 	} else {
1371 		*errorp = ENOENT;
1372 	}
1373 	return(node);
1374 }
1375 
1376 /*
1377  * Release a hammer_node.  On the last release the node dereferences
1378  * its underlying buffer and may or may not be destroyed.
1379  *
1380  * If locked is non-zero the passed node has been interlocked by the
1381  * caller and we are in the failure/unload path, otherwise it has not and
1382  * we are doing a normal release.
1383  *
1384  * This function will dispose of the interlock and the reference.
1385  * On return the node pointer is stale.
1386  */
1387 void
1388 _hammer_rel_node(hammer_node_t node, int locked)
1389 {
1390 	hammer_buffer_t buffer;
1391 
1392 	/*
1393 	 * Deref the node.  If this isn't the 1->0 transition we're basically
1394 	 * done.  If locked is non-zero this function will just deref the
1395 	 * locked node and return TRUE, otherwise it will deref the locked
1396 	 * node and either lock and return TRUE on the 1->0 transition or
1397 	 * not lock and return FALSE.
1398 	 */
1399 	if (hammer_rel_interlock(&node->lock, locked) == 0)
1400 		return;
1401 
1402 	/*
1403 	 * Either locked was non-zero and we are interlocked, or the
1404 	 * hammer_rel_interlock() call returned non-zero and we are
1405 	 * interlocked.
1406 	 *
1407 	 * The ref-count must still be decremented if locked != 0 so
1408 	 * the cleanup required still varies a bit.
1409 	 *
1410 	 * hammer_flush_node() when called with 1 or 2 will dispose of
1411 	 * the lock and possible ref-count.
1412 	 */
1413 	if (node->ondisk == NULL) {
1414 		hammer_flush_node(node, locked + 1);
1415 		/* node is stale now */
1416 		return;
1417 	}
1418 
1419 	/*
1420 	 * Do not disassociate the node from the buffer if it represents
1421 	 * a modified B-Tree node that still needs its crc to be generated.
1422 	 */
1423 	if (node->flags & HAMMER_NODE_NEEDSCRC) {
1424 		hammer_rel_interlock_done(&node->lock, locked);
1425 		return;
1426 	}
1427 
1428 	/*
1429 	 * Do final cleanups and then either destroy the node or leave it
1430 	 * passively cached.  The buffer reference is removed regardless.
1431 	 */
1432 	buffer = node->buffer;
1433 	node->ondisk = NULL;
1434 
1435 	if ((node->flags & HAMMER_NODE_FLUSH) == 0) {
1436 		/*
1437 		 * Normal release.
1438 		 */
1439 		hammer_rel_interlock_done(&node->lock, locked);
1440 	} else {
1441 		/*
1442 		 * Destroy the node.
1443 		 */
1444 		hammer_flush_node(node, locked + 1);
1445 		/* node is stale */
1446 
1447 	}
1448 	hammer_rel_buffer(buffer, 0);
1449 }
1450 
1451 void
1452 hammer_rel_node(hammer_node_t node)
1453 {
1454 	_hammer_rel_node(node, 0);
1455 }
1456 
1457 /*
1458  * Free space on-media associated with a B-Tree node.
1459  */
1460 void
1461 hammer_delete_node(hammer_transaction_t trans, hammer_node_t node)
1462 {
1463 	KKASSERT((node->flags & HAMMER_NODE_DELETED) == 0);
1464 	node->flags |= HAMMER_NODE_DELETED;
1465 	hammer_blockmap_free(trans, node->node_offset, sizeof(*node->ondisk));
1466 }
1467 
1468 /*
1469  * Passively cache a referenced hammer_node.  The caller may release
1470  * the node on return.
1471  */
1472 void
1473 hammer_cache_node(hammer_node_cache_t cache, hammer_node_t node)
1474 {
1475 	/*
1476 	 * If the node doesn't exist, or is being deleted, don't cache it!
1477 	 *
1478 	 * The node can only ever be NULL in the I/O failure path.
1479 	 */
1480 	if (node == NULL || (node->flags & HAMMER_NODE_DELETED))
1481 		return;
1482 	if (cache->node == node)
1483 		return;
1484 	while (cache->node)
1485 		hammer_uncache_node(cache);
1486 	if (node->flags & HAMMER_NODE_DELETED)
1487 		return;
1488 	cache->node = node;
1489 	TAILQ_INSERT_TAIL(&node->cache_list, cache, entry);
1490 }
1491 
1492 void
1493 hammer_uncache_node(hammer_node_cache_t cache)
1494 {
1495 	hammer_node_t node;
1496 
1497 	if ((node = cache->node) != NULL) {
1498 		TAILQ_REMOVE(&node->cache_list, cache, entry);
1499 		cache->node = NULL;
1500 		if (TAILQ_EMPTY(&node->cache_list))
1501 			hammer_flush_node(node, 0);
1502 	}
1503 }
1504 
1505 /*
1506  * Remove a node's cache references and destroy the node if it has no
1507  * other references or backing store.
1508  *
1509  * locked == 0	Normal unlocked operation
1510  * locked == 1	Call hammer_rel_interlock_done(..., 0);
1511  * locked == 2	Call hammer_rel_interlock_done(..., 1);
1512  *
1513  * XXX for now this isn't even close to being MPSAFE so the refs check
1514  *     is sufficient.
1515  */
1516 void
1517 hammer_flush_node(hammer_node_t node, int locked)
1518 {
1519 	hammer_node_cache_t cache;
1520 	hammer_buffer_t buffer;
1521 	hammer_mount_t hmp = node->hmp;
1522 	int dofree;
1523 
1524 	while ((cache = TAILQ_FIRST(&node->cache_list)) != NULL) {
1525 		TAILQ_REMOVE(&node->cache_list, cache, entry);
1526 		cache->node = NULL;
1527 	}
1528 
1529 	/*
1530 	 * NOTE: refs is predisposed if another thread is blocking and
1531 	 *	 will be larger than 0 in that case.  We aren't MPSAFE
1532 	 *	 here.
1533 	 */
1534 	if (node->ondisk == NULL && hammer_norefs(&node->lock)) {
1535 		KKASSERT((node->flags & HAMMER_NODE_NEEDSCRC) == 0);
1536 		RB_REMOVE(hammer_nod_rb_tree, &node->hmp->rb_nods_root, node);
1537 		if ((buffer = node->buffer) != NULL) {
1538 			node->buffer = NULL;
1539 			TAILQ_REMOVE(&buffer->clist, node, entry);
1540 			/* buffer is unreferenced because ondisk is NULL */
1541 		}
1542 		dofree = 1;
1543 	} else {
1544 		dofree = 0;
1545 	}
1546 
1547 	/*
1548 	 * Deal with the interlock if locked == 1 or locked == 2.
1549 	 */
1550 	if (locked)
1551 		hammer_rel_interlock_done(&node->lock, locked - 1);
1552 
1553 	/*
1554 	 * Destroy if requested
1555 	 */
1556 	if (dofree) {
1557 		--hammer_count_nodes;
1558 		kfree(node, hmp->m_misc);
1559 	}
1560 }
1561 
1562 /*
1563  * Flush passively cached B-Tree nodes associated with this buffer.
1564  * This is only called when the buffer is about to be destroyed, so
1565  * none of the nodes should have any references.  The buffer is locked.
1566  *
1567  * We may be interlocked with the buffer.
1568  */
1569 void
1570 hammer_flush_buffer_nodes(hammer_buffer_t buffer)
1571 {
1572 	hammer_node_t node;
1573 
1574 	while ((node = TAILQ_FIRST(&buffer->clist)) != NULL) {
1575 		KKASSERT(node->ondisk == NULL);
1576 		KKASSERT((node->flags & HAMMER_NODE_NEEDSCRC) == 0);
1577 
1578 		if (hammer_try_interlock_norefs(&node->lock)) {
1579 			hammer_ref(&node->lock);
1580 			node->flags |= HAMMER_NODE_FLUSH;
1581 			_hammer_rel_node(node, 1);
1582 		} else {
1583 			KKASSERT(node->buffer != NULL);
1584 			buffer = node->buffer;
1585 			node->buffer = NULL;
1586 			TAILQ_REMOVE(&buffer->clist, node, entry);
1587 			/* buffer is unreferenced because ondisk is NULL */
1588 		}
1589 	}
1590 }
1591 
1592 
1593 /************************************************************************
1594  *				ALLOCATORS				*
1595  ************************************************************************/
1596 
1597 /*
1598  * Allocate a B-Tree node.
1599  */
1600 hammer_node_t
1601 hammer_alloc_btree(hammer_transaction_t trans, hammer_off_t hint, int *errorp)
1602 {
1603 	hammer_buffer_t buffer = NULL;
1604 	hammer_node_t node = NULL;
1605 	hammer_off_t node_offset;
1606 
1607 	node_offset = hammer_blockmap_alloc(trans, HAMMER_ZONE_BTREE_INDEX,
1608 					    sizeof(struct hammer_node_ondisk),
1609 					    hint, errorp);
1610 	if (*errorp == 0) {
1611 		node = hammer_get_node(trans, node_offset, 1, errorp);
1612 		hammer_modify_node_noundo(trans, node);
1613 		bzero(node->ondisk, sizeof(*node->ondisk));
1614 		hammer_modify_node_done(node);
1615 	}
1616 	if (buffer)
1617 		hammer_rel_buffer(buffer, 0);
1618 	return(node);
1619 }
1620 
1621 /*
1622  * Allocate data.  If the address of a data buffer is supplied then
1623  * any prior non-NULL *data_bufferp will be released and *data_bufferp
1624  * will be set to the related buffer.  The caller must release it when
1625  * finally done.  The initial *data_bufferp should be set to NULL by
1626  * the caller.
1627  *
1628  * The caller is responsible for making hammer_modify*() calls on the
1629  * *data_bufferp.
1630  */
1631 void *
1632 hammer_alloc_data(hammer_transaction_t trans, int32_t data_len,
1633 		  u_int16_t rec_type, hammer_off_t *data_offsetp,
1634 		  struct hammer_buffer **data_bufferp,
1635 		  hammer_off_t hint, int *errorp)
1636 {
1637 	void *data;
1638 	int zone;
1639 
1640 	/*
1641 	 * Allocate data
1642 	 */
1643 	if (data_len) {
1644 		switch(rec_type) {
1645 		case HAMMER_RECTYPE_INODE:
1646 		case HAMMER_RECTYPE_DIRENTRY:
1647 		case HAMMER_RECTYPE_EXT:
1648 		case HAMMER_RECTYPE_FIX:
1649 		case HAMMER_RECTYPE_PFS:
1650 		case HAMMER_RECTYPE_SNAPSHOT:
1651 		case HAMMER_RECTYPE_CONFIG:
1652 			zone = HAMMER_ZONE_META_INDEX;
1653 			break;
1654 		case HAMMER_RECTYPE_DATA:
1655 		case HAMMER_RECTYPE_DB:
1656 			if (data_len <= HAMMER_BUFSIZE / 2) {
1657 				zone = HAMMER_ZONE_SMALL_DATA_INDEX;
1658 			} else {
1659 				data_len = (data_len + HAMMER_BUFMASK) &
1660 					   ~HAMMER_BUFMASK;
1661 				zone = HAMMER_ZONE_LARGE_DATA_INDEX;
1662 			}
1663 			break;
1664 		default:
1665 			panic("hammer_alloc_data: rec_type %04x unknown",
1666 			      rec_type);
1667 			zone = 0;	/* NOT REACHED */
1668 			break;
1669 		}
1670 		*data_offsetp = hammer_blockmap_alloc(trans, zone, data_len,
1671 						      hint, errorp);
1672 	} else {
1673 		*data_offsetp = 0;
1674 	}
1675 	if (*errorp == 0 && data_bufferp) {
1676 		if (data_len) {
1677 			data = hammer_bread_ext(trans->hmp, *data_offsetp,
1678 						data_len, errorp, data_bufferp);
1679 		} else {
1680 			data = NULL;
1681 		}
1682 	} else {
1683 		data = NULL;
1684 	}
1685 	return(data);
1686 }
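
/*
 * Illustrative sketch of the calling convention above, assuming a
 * hypothetical caller allocating a small data record.  The caller remains
 * responsible for the hammer_modify*() calls and for releasing the buffer:
 *
 *	struct hammer_buffer *data_buffer = NULL;
 *	hammer_off_t data_offset;
 *	void *data;
 *	int error;
 *
 *	data = hammer_alloc_data(trans, data_len, HAMMER_RECTYPE_DATA,
 *				 &data_offset, &data_buffer, hint, &error);
 *	if (error == 0 && data) {
 *		... fill in the record under hammer_modify*() ...
 *	}
 *	if (data_buffer)
 *		hammer_rel_buffer(data_buffer, 0);
 */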
1687 
1688 /*
1689  * Sync dirty buffers to the media and clean-up any loose ends.
1690  *
1691  * These functions do not start the flusher going, they simply
1692  * queue everything up to the flusher.
1693  */
1694 static int hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data);
1695 static int hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data);
1696 
1697 int
1698 hammer_queue_inodes_flusher(hammer_mount_t hmp, int waitfor)
1699 {
1700 	struct hammer_sync_info info;
1701 
1702 	info.error = 0;
1703 	info.waitfor = waitfor;
1704 	if (waitfor == MNT_WAIT) {
1705 		vmntvnodescan(hmp->mp, VMSC_GETVP|VMSC_ONEPASS,
1706 			      hammer_sync_scan1, hammer_sync_scan2, &info);
1707 	} else {
1708 		vmntvnodescan(hmp->mp, VMSC_GETVP|VMSC_ONEPASS|VMSC_NOWAIT,
1709 			      hammer_sync_scan1, hammer_sync_scan2, &info);
1710 	}
1711 	return(info.error);
1712 }
1713 
1714 /*
1715  * Filesystem sync.  If doing a synchronous sync make a second pass on
1716  * the vnodes in case any were already flushing during the first pass,
1717  * and activate the flusher twice (the second time brings the UNDO FIFO's
1718  * start position up to the end position after the first call).
1719  *
1720  * If doing a lazy sync make just one pass on the vnode list, ignoring
1721  * any new vnodes added to the list while the sync is in progress.
1722  */
1723 int
1724 hammer_sync_hmp(hammer_mount_t hmp, int waitfor)
1725 {
1726 	struct hammer_sync_info info;
1727 	int flags;
1728 
1729 	flags = VMSC_GETVP;
1730 	if (waitfor & MNT_LAZY)
1731 		flags |= VMSC_ONEPASS;
1732 
1733 	info.error = 0;
1734 	info.waitfor = MNT_NOWAIT;
1735 	vmntvnodescan(hmp->mp, flags | VMSC_NOWAIT,
1736 		      hammer_sync_scan1, hammer_sync_scan2, &info);
1737 
1738 	if (info.error == 0 && (waitfor & MNT_WAIT)) {
1739 		info.waitfor = waitfor;
1740 		vmntvnodescan(hmp->mp, flags,
1741 			      hammer_sync_scan1, hammer_sync_scan2, &info);
1742 	}
1743 	if (waitfor == MNT_WAIT) {
1744 		hammer_flusher_sync(hmp);
1745 		hammer_flusher_sync(hmp);
1746 	} else {
1747 		hammer_flusher_async(hmp, NULL);
1748 		hammer_flusher_async(hmp, NULL);
1749 	}
1750 	return(info.error);
1751 }
1752 
1753 static int
1754 hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data)
1755 {
1756 	struct hammer_inode *ip;
1757 
1758 	ip = VTOI(vp);
1759 	if (vp->v_type == VNON || ip == NULL ||
1760 	    ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
1761 	     RB_EMPTY(&vp->v_rbdirty_tree))) {
1762 		return(-1);
1763 	}
1764 	return(0);
1765 }
1766 
1767 static int
1768 hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
1769 {
1770 	struct hammer_sync_info *info = data;
1771 	struct hammer_inode *ip;
1772 	int error;
1773 
1774 	ip = VTOI(vp);
1775 	if (vp->v_type == VNON || vp->v_type == VBAD ||
1776 	    ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
1777 	     RB_EMPTY(&vp->v_rbdirty_tree))) {
1778 		return(0);
1779 	}
1780 	error = VOP_FSYNC(vp, MNT_NOWAIT, 0);
1781 	if (error)
1782 		info->error = error;
1783 	return(0);
1784 }
1785 
1786