xref: /dragonfly/sys/vfs/hammer/hammer_ondisk.c (revision 9b5ae8ee)
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.10 2007/12/14 08:05:39 dillon Exp $
35  */
36 /*
37  * Manage HAMMER's on-disk structures.  These routines are primarily
38  * responsible for interfacing with the kernel's I/O subsystem and for
39  * managing in-memory structures.
40  */
41 
42 #include "hammer.h"
43 #include <sys/fcntl.h>
44 #include <sys/nlookup.h>
45 #include <sys/buf.h>
46 #include <sys/buf2.h>
47 
48 static void hammer_free_volume(hammer_volume_t volume);
49 static int hammer_load_volume(hammer_volume_t volume);
50 static int hammer_load_supercl(hammer_supercl_t supercl, int isnew);
51 static int hammer_load_cluster(hammer_cluster_t cluster, int isnew);
52 static int hammer_load_buffer(hammer_buffer_t buffer, u_int64_t buf_type);
53 static void hammer_remove_node_clist(hammer_buffer_t buffer,
54 			hammer_node_t node);
55 static void initbuffer(hammer_alist_t live, hammer_fsbuf_head_t head,
56 			u_int64_t type);
57 static void alloc_new_buffer(hammer_cluster_t cluster,
58 			hammer_alist_t live, u_int64_t type, int32_t nelements,
59 			int32_t start,
60 			int *errorp, struct hammer_buffer **bufferp);
61 #if 0
62 static void readhammerbuf(hammer_volume_t vol, void *data,
63 			int64_t offset);
64 static void writehammerbuf(hammer_volume_t vol, const void *data,
65 			int64_t offset);
66 #endif
67 static int64_t calculate_cluster_offset(hammer_volume_t vol, int32_t clu_no);
68 static int64_t calculate_supercl_offset(hammer_volume_t vol, int32_t scl_no);
69 static int32_t hammer_alloc_master(hammer_cluster_t cluster, int nblks,
70 			int32_t start, int isfwd);
71 static void hammer_adjust_stats(hammer_cluster_t cluster,
72 			u_int64_t buf_type, int nblks);
73 
74 struct hammer_alist_config Buf_alist_config;
75 struct hammer_alist_config Vol_normal_alist_config;
76 struct hammer_alist_config Vol_super_alist_config;
77 struct hammer_alist_config Supercl_alist_config;
78 struct hammer_alist_config Clu_master_alist_config;
79 struct hammer_alist_config Clu_slave_alist_config;
80 
81 /*
82  * Red-Black tree support for various structures
83  */
84 static int
85 hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
86 {
87 	if (ip1->obj_id < ip2->obj_id)
88 		return(-1);
89 	if (ip1->obj_id > ip2->obj_id)
90 		return(1);
91 	if (ip1->obj_asof < ip2->obj_asof)
92 		return(-1);
93 	if (ip1->obj_asof > ip2->obj_asof)
94 		return(1);
95 	return(0);
96 }
97 
98 static int
99 hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
100 {
101 	if (info->obj_id < ip->obj_id)
102 		return(-1);
103 	if (info->obj_id > ip->obj_id)
104 		return(1);
105 	if (info->obj_asof < ip->obj_asof)
106 		return(-1);
107 	if (info->obj_asof > ip->obj_asof)
108 		return(1);
109 	return(0);
110 }
111 
112 static int
113 hammer_vol_rb_compare(hammer_volume_t vol1, hammer_volume_t vol2)
114 {
115 	if (vol1->vol_no < vol2->vol_no)
116 		return(-1);
117 	if (vol1->vol_no > vol2->vol_no)
118 		return(1);
119 	return(0);
120 }
121 
122 static int
123 hammer_scl_rb_compare(hammer_supercl_t cl1, hammer_supercl_t cl2)
124 {
125 	if (cl1->scl_no < cl2->scl_no)
126 		return(-1);
127 	if (cl1->scl_no > cl2->scl_no)
128 		return(1);
129 	return(0);
130 }
131 
132 static int
133 hammer_clu_rb_compare(hammer_cluster_t cl1, hammer_cluster_t cl2)
134 {
135 	if (cl1->clu_no < cl2->clu_no)
136 		return(-1);
137 	if (cl1->clu_no > cl2->clu_no)
138 		return(1);
139 	return(0);
140 }
141 
142 static int
143 hammer_buf_rb_compare(hammer_buffer_t buf1, hammer_buffer_t buf2)
144 {
145 	if (buf1->buf_no < buf2->buf_no)
146 		return(-1);
147 	if (buf1->buf_no > buf2->buf_no)
148 		return(1);
149 	return(0);
150 }
151 
152 static int
153 hammer_nod_rb_compare(hammer_node_t node1, hammer_node_t node2)
154 {
155 	if (node1->node_offset < node2->node_offset)
156 		return(-1);
157 	if (node1->node_offset > node2->node_offset)
158 		return(1);
159 	return(0);
160 }
161 
162 /*
163  * Note: The lookup function for hammer_ino_rb_tree winds up being named
164  * hammer_ino_rb_tree_RB_LOOKUP_INFO(root, info).  The other lookup
165  * functions are normal, e.g. hammer_clu_rb_tree_RB_LOOKUP(root, clu_no).
166  */
167 RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
168 RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
169 		hammer_inode_info_cmp, hammer_inode_info_t);
170 RB_GENERATE2(hammer_vol_rb_tree, hammer_volume, rb_node,
171 	     hammer_vol_rb_compare, int32_t, vol_no);
172 RB_GENERATE2(hammer_scl_rb_tree, hammer_supercl, rb_node,
173 	     hammer_scl_rb_compare, int32_t, scl_no);
174 RB_GENERATE2(hammer_clu_rb_tree, hammer_cluster, rb_node,
175 	     hammer_clu_rb_compare, int32_t, clu_no);
176 RB_GENERATE2(hammer_buf_rb_tree, hammer_buffer, rb_node,
177 	     hammer_buf_rb_compare, int32_t, buf_no);
178 RB_GENERATE2(hammer_nod_rb_tree, hammer_node, rb_node,
179 	     hammer_nod_rb_compare, int32_t, node_offset);
180 
181 /************************************************************************
182  *				VOLUMES					*
183  ************************************************************************
184  *
185  * Load a HAMMER volume by name.  Returns 0 on success or a positive error
186  * code on failure.  Volumes must be loaded at mount time; get_volume() will
187  * not load a new volume.
188  *
189  * Calls made to hammer_load_volume() are single-threaded.
190  */
191 int
192 hammer_install_volume(struct hammer_mount *hmp, const char *volname)
193 {
194 	struct mount *mp;
195 	hammer_volume_t volume;
196 	struct hammer_volume_ondisk *ondisk;
197 	struct nlookupdata nd;
198 	struct buf *bp = NULL;
199 	int error;
200 	int ronly;
201 
202 	mp = hmp->mp;
203 	ronly = ((mp->mnt_flag & MNT_RDONLY) ? 1 : 0);
204 
205 	/*
206 	 * Allocate a volume structure
207 	 */
208 	volume = kmalloc(sizeof(*volume), M_HAMMER, M_WAITOK|M_ZERO);
209 	volume->vol_name = kstrdup(volname, M_HAMMER);
210 	volume->hmp = hmp;
211 	volume->io.type = HAMMER_STRUCTURE_VOLUME;
212 	volume->io.offset = 0LL;
213 
214 	/*
215 	 * Get the device vnode
216 	 */
217 	error = nlookup_init(&nd, volume->vol_name, UIO_SYSSPACE, NLC_FOLLOW);
218 	if (error == 0)
219 		error = nlookup(&nd);
220 	if (error == 0)
221 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &volume->devvp);
222 	nlookup_done(&nd);
223 	if (error == 0) {
224 		vn_isdisk(volume->devvp, &error);
225 	}
226 	if (error == 0) {
227 		vn_lock(volume->devvp, LK_EXCLUSIVE | LK_RETRY);
228 		error = VOP_OPEN(volume->devvp, (ronly ? FREAD : FREAD|FWRITE),
229 				 FSCRED, NULL);
230 		vn_unlock(volume->devvp);
231 	}
232 	if (error) {
233 		hammer_free_volume(volume);
234 		return(error);
235 	}
236 
237 	/*
238 	 * Extract the volume number from the volume header and do various
239 	 * sanity checks.
240 	 */
241 	error = bread(volume->devvp, 0LL, HAMMER_BUFSIZE, &bp);
242 	if (error)
243 		goto late_failure;
244 	ondisk = (void *)bp->b_data;
245 	if (ondisk->head.buf_type != HAMMER_FSBUF_VOLUME) {
246 		kprintf("hammer_mount: volume %s has an invalid header\n",
247 			volume->vol_name);
248 		error = EFTYPE;
249 		goto late_failure;
250 	}
251 	volume->vol_no = ondisk->vol_no;
252 	volume->cluster_base = ondisk->vol_clo_beg;
253 	volume->vol_clsize = ondisk->vol_clsize;
254 	volume->vol_flags = ondisk->vol_flags;
255 	volume->nblocks = ondisk->vol_nblocks;
256 	RB_INIT(&volume->rb_clus_root);
257 	RB_INIT(&volume->rb_scls_root);
258 
259 	hmp->mp->mnt_stat.f_blocks += volume->nblocks;
260 
261 	if (RB_EMPTY(&hmp->rb_vols_root)) {
262 		hmp->fsid = ondisk->vol_fsid;
263 	} else if (bcmp(&hmp->fsid, &ondisk->vol_fsid, sizeof(uuid_t))) {
264 		kprintf("hammer_mount: volume %s's fsid does not match "
265 			"other volumes\n", volume->vol_name);
266 		error = EFTYPE;
267 		goto late_failure;
268 	}
269 
270 	/*
271 	 * Insert the volume structure into the red-black tree.
272 	 */
273 	if (RB_INSERT(hammer_vol_rb_tree, &hmp->rb_vols_root, volume)) {
274 		kprintf("hammer_mount: volume %s has a duplicate vol_no %d\n",
275 			volume->vol_name, volume->vol_no);
276 		error = EEXIST;
277 	}
278 
279 	/*
280 	 * Set the root volume and load the root cluster.  HAMMER special
281 	 * cases rootvol and rootcl and will not deallocate the structures.
282 	 * We do not hold a ref because this would prevent related I/O
283 	 * from being flushed.
284 	 */
285 	if (error == 0 && ondisk->vol_rootvol == ondisk->vol_no) {
286 		hmp->rootvol = volume;
287 		hmp->rootcl = hammer_get_cluster(volume,
288 						 ondisk->vol0_root_clu_no,
289 						 &error, 0);
290 		if (hmp->rootcl)
290 			hammer_rel_cluster(hmp->rootcl, 0);
291 		hmp->fsid_udev = dev2udev(vn_todev(volume->devvp));
292 	}
293 late_failure:
294 	if (bp)
295 		brelse(bp);
296 	if (error) {
297 		/*vinvalbuf(volume->devvp, V_SAVE, 0, 0);*/
298 		VOP_CLOSE(volume->devvp, ronly ? FREAD : FREAD|FWRITE);
299 		hammer_free_volume(volume);
300 	}
301 	return (error);
302 }
303 
304 /*
305  * Unload and free a HAMMER volume.  Must return >= 0 to continue scan
306  * so returns -1 on failure.
307  */
308 int
309 hammer_unload_volume(hammer_volume_t volume, void *data __unused)
310 {
311 	struct hammer_mount *hmp = volume->hmp;
312 	hammer_cluster_t rootcl;
313 	int ronly = ((hmp->mp->mnt_flag & MNT_RDONLY) ? 1 : 0);
314 
315 	/*
316 	 * Sync clusters, sync volume
317 	 */
318 
319 	hmp->mp->mnt_stat.f_blocks -= volume->nblocks;
320 
321 	/*
322 	 * Clean up the root cluster, which is held unlocked in the root
323 	 * volume.
324 	 */
325 	if (hmp->rootvol == volume) {
326 		if ((rootcl = hmp->rootcl) != NULL)
327 			hmp->rootcl = NULL;
328 		hmp->rootvol = NULL;
329 	}
330 
331 	/*
332 	 * Unload clusters and super-clusters.  Unloading a super-cluster
333 	 * also unloads related clusters, but the filesystem may not be
334 	 * using super-clusters so unload clusters anyway.
335 	 */
336 	RB_SCAN(hammer_clu_rb_tree, &volume->rb_clus_root, NULL,
337 			hammer_unload_cluster, NULL);
338 	RB_SCAN(hammer_scl_rb_tree, &volume->rb_scls_root, NULL,
339 			hammer_unload_supercl, NULL);
340 
341 	/*
342 	 * Release our buffer and flush anything left in the buffer cache.
343 	 */
344 	hammer_io_release(&volume->io, 1);
345 
346 	/*
347 	 * There should be no references on the volume, no clusters, and
348 	 * no super-clusters.
349 	 */
350 	KKASSERT(volume->io.lock.refs == 0);
351 	KKASSERT(RB_EMPTY(&volume->rb_clus_root));
352 	KKASSERT(RB_EMPTY(&volume->rb_scls_root));
353 
354 	volume->ondisk = NULL;
355 	if (volume->devvp) {
356 		if (ronly) {
357 			vinvalbuf(volume->devvp, 0, 0, 0);
358 			VOP_CLOSE(volume->devvp, FREAD);
359 		} else {
360 			vinvalbuf(volume->devvp, V_SAVE, 0, 0);
361 			VOP_CLOSE(volume->devvp, FREAD|FWRITE);
362 		}
363 	}
364 
365 	/*
366 	 * Destroy the structure
367 	 */
368 	RB_REMOVE(hammer_vol_rb_tree, &hmp->rb_vols_root, volume);
369 	hammer_free_volume(volume);
370 	return(0);
371 }
372 
373 static
374 void
375 hammer_free_volume(hammer_volume_t volume)
376 {
377 	if (volume->vol_name) {
378 		kfree(volume->vol_name, M_HAMMER);
379 		volume->vol_name = NULL;
380 	}
381 	if (volume->devvp) {
382 		vrele(volume->devvp);
383 		volume->devvp = NULL;
384 	}
385 	kfree(volume, M_HAMMER);
386 }
387 
388 /*
389  * Get a HAMMER volume.  The volume must already exist.
390  */
391 hammer_volume_t
392 hammer_get_volume(struct hammer_mount *hmp, int32_t vol_no, int *errorp)
393 {
394 	struct hammer_volume *volume;
395 
396 	/*
397 	 * Locate the volume structure
398 	 */
399 	volume = RB_LOOKUP(hammer_vol_rb_tree, &hmp->rb_vols_root, vol_no);
400 	if (volume == NULL) {
401 		*errorp = ENOENT;
402 		return(NULL);
403 	}
404 	hammer_ref(&volume->io.lock);
405 
406 	/*
407 	 * Deal with on-disk info
408 	 */
409 	if (volume->ondisk == NULL) {
410 		*errorp = hammer_load_volume(volume);
411 		if (*errorp) {
412 			hammer_rel_volume(volume, 1);
413 			volume = NULL;
414 		}
415 	} else {
416 		*errorp = 0;
417 	}
418 	return(volume);
419 }
420 
421 int
422 hammer_ref_volume(hammer_volume_t volume)
423 {
424 	int error;
425 
426 	hammer_ref(&volume->io.lock);
427 
428 	/*
429 	 * Deal with on-disk info
430 	 */
431 	if (volume->ondisk == NULL) {
432 		error = hammer_load_volume(volume);
433 		if (error)
434 			hammer_rel_volume(volume, 1);
435 	} else {
436 		error = 0;
437 	}
438 	return (error);
439 }
440 
441 hammer_volume_t
442 hammer_get_root_volume(struct hammer_mount *hmp, int *errorp)
443 {
444 	hammer_volume_t volume;
445 
446 	volume = hmp->rootvol;
447 	KKASSERT(volume != NULL);
448 	hammer_ref(&volume->io.lock);
449 
450 	/*
451 	 * Deal with on-disk info
452 	 */
453 	if (volume->ondisk == NULL) {
454 		*errorp = hammer_load_volume(volume);
455 		if (*errorp) {
456 			hammer_rel_volume(volume, 1);
457 			volume = NULL;
458 		}
459 	} else {
460 		*errorp = 0;
461 	}
462 	return (volume);
463 }
464 
465 /*
466  * Load a volume's on-disk information.  The volume must be referenced and
467  * not locked.  We temporarily acquire an exclusive lock to interlock
468  * against releases or multiple gets.
469  */
470 static int
471 hammer_load_volume(hammer_volume_t volume)
472 {
473 	struct hammer_volume_ondisk *ondisk;
474 	int error;
475 
476 	hammer_lock_ex(&volume->io.lock);
477 	if (volume->ondisk == NULL) {
478 		error = hammer_io_read(volume->devvp, &volume->io);
479 		if (error) {
480 			hammer_unlock(&volume->io.lock);
481 			return (error);
482 		}
483 		volume->ondisk = ondisk = (void *)volume->io.bp->b_data;
484 
485 		/*
486 		 * Configure the volume's A-lists.  These are used to
487 		 * allocate clusters.
488 		 */
489 		if (volume->vol_flags & HAMMER_VOLF_USINGSUPERCL) {
490 			volume->alist.config = &Vol_super_alist_config;
491 			volume->alist.meta = ondisk->vol_almeta.super;
492 			volume->alist.info = volume;
493 		} else {
494 			volume->alist.config = &Vol_normal_alist_config;
495 			volume->alist.meta = ondisk->vol_almeta.normal;
496 			volume->alist.info = NULL;
497 		}
498 		hammer_alist_init(&volume->alist);
499 	} else {
500 		error = 0;
501 	}
502 	hammer_unlock(&volume->io.lock);
503 	return(error);
504 }
505 
506 /*
507  * Release a volume.  Call hammer_io_release on the last reference.  We have
508  * to acquire an exclusive lock to interlock against volume->ondisk tests
509  * in hammer_load_volume(), and hammer_io_release() also expects an exclusive
510  * lock to be held.
511  *
512  * Volumes are not unloaded from memory during normal operation.
513  */
514 void
515 hammer_rel_volume(hammer_volume_t volume, int flush)
516 {
517 	if (volume->io.lock.refs == 1) {
518 		hammer_lock_ex(&volume->io.lock);
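		/*
		 * Re-check the ref count now that we hold the exclusive
		 * lock; another thread may have gained a reference while
		 * we were blocked acquiring it.
		 */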
519 		if (volume->io.lock.refs == 1) {
520 			volume->ondisk = NULL;
521 			hammer_io_release(&volume->io, flush);
522 		}
523 		hammer_unlock(&volume->io.lock);
524 	}
525 	hammer_unref(&volume->io.lock);
526 }
527 
528 /************************************************************************
529  *				SUPER-CLUSTERS				*
530  ************************************************************************
531  *
532  * Manage super-clusters.  Note that a supercl holds a reference to its
533  * associated volume.
534  */
535 hammer_supercl_t
536 hammer_get_supercl(hammer_volume_t volume, int32_t scl_no,
537 		   int *errorp, int isnew)
538 {
539 	hammer_supercl_t supercl;
540 
541 	/*
542 	 * Locate and lock the super-cluster structure, creating one
543 	 * if necessary.
544 	 */
545 again:
546 	supercl = RB_LOOKUP(hammer_scl_rb_tree, &volume->rb_scls_root, scl_no);
547 	if (supercl == NULL) {
548 		supercl = kmalloc(sizeof(*supercl), M_HAMMER, M_WAITOK|M_ZERO);
549 		supercl->scl_no = scl_no;
550 		supercl->volume = volume;
551 		supercl->io.offset = calculate_supercl_offset(volume, scl_no);
552 		supercl->io.type = HAMMER_STRUCTURE_SUPERCL;
553 		hammer_ref(&supercl->io.lock);
554 
555 		/*
556 		 * Insert the super-cluster into the RB tree and handle late
557 		 * collisions.
558 		 */
559 		if (RB_INSERT(hammer_scl_rb_tree, &volume->rb_scls_root, supercl)) {
560 			hammer_unref(&supercl->io.lock);
561 			kfree(supercl, M_HAMMER);
562 			goto again;
563 		}
564 		hammer_ref(&volume->io.lock);
565 	} else {
566 		hammer_ref(&supercl->io.lock);
567 	}
568 
569 	/*
570 	 * Deal with on-disk info
571 	 */
572 	if (supercl->ondisk == NULL || isnew) {
573 		*errorp = hammer_load_supercl(supercl, isnew);
574 		if (*errorp) {
575 			hammer_rel_supercl(supercl, 1);
576 			supercl = NULL;
577 		}
578 	} else {
579 		*errorp = 0;
580 	}
581 	return(supercl);
582 }
583 
584 static int
585 hammer_load_supercl(hammer_supercl_t supercl, int isnew)
586 {
587 	struct hammer_supercl_ondisk *ondisk;
588 	hammer_volume_t volume = supercl->volume;
589 	int error;
590 
591 	hammer_lock_ex(&supercl->io.lock);
592 	if (supercl->ondisk == NULL) {
593 		if (isnew)
594 			error = hammer_io_new(volume->devvp, &supercl->io);
595 		else
596 			error = hammer_io_read(volume->devvp, &supercl->io);
597 		if (error) {
598 			hammer_unlock(&supercl->io.lock);
599 			return (error);
600 		}
601 		supercl->ondisk = ondisk = (void *)supercl->io.bp->b_data;
602 
603 		supercl->alist.config = &Supercl_alist_config;
604 		supercl->alist.meta = ondisk->scl_meta;
605 		supercl->alist.info = NULL;
606 	} else if (isnew) {
607 		error = hammer_io_new(volume->devvp, &supercl->io);
608 	} else {
609 		error = 0;
610 	}
611 	if (error == 0 && isnew) {
612 		/*
613 		 * If this is a new super-cluster we have to initialize
614 		 * various ondisk structural elements.  The caller is
615 		 * responsible for the remainder.
616 		 */
617 		struct hammer_alist_live dummy;
618 
619 		ondisk = supercl->ondisk;
620 		dummy.config = &Buf_alist_config;
621 		dummy.meta = ondisk->head.buf_almeta;
622 		dummy.info = NULL;
623 		initbuffer(&dummy, &ondisk->head, HAMMER_FSBUF_SUPERCL);
624 		hammer_alist_init(&supercl->alist);
625 	}
626 	hammer_unlock(&supercl->io.lock);
627 	return (error);
628 }
629 
630 /*
631  * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
632  */
633 int
634 hammer_unload_supercl(hammer_supercl_t supercl, void *data __unused)
635 {
636 	KKASSERT(supercl->io.lock.refs == 0);
637 	hammer_ref(&supercl->io.lock);
638 	hammer_rel_supercl(supercl, 1);
639 	return(0);
640 }
641 
642 /*
643  * Release a super-cluster.  We have to deal with several places where
644  * another thread can ref the super-cluster.
645  *
646  * Only destroy the structure itself if the related buffer cache buffer
647  * was disassociated from it.  This ties the management of the structure
648  * to the buffer cache subsystem.
649  */
650 void
651 hammer_rel_supercl(hammer_supercl_t supercl, int flush)
652 {
653 	hammer_volume_t volume;
654 
655 	if (supercl->io.lock.refs == 1) {
656 		hammer_lock_ex(&supercl->io.lock);
657 		if (supercl->io.lock.refs == 1) {
658 			hammer_io_release(&supercl->io, flush);
659 			if (supercl->io.bp == NULL &&
660 			    supercl->io.lock.refs == 1) {
661 				volume = supercl->volume;
662 				RB_REMOVE(hammer_scl_rb_tree,
663 					  &volume->rb_scls_root, supercl);
664 				supercl->volume = NULL;	/* sanity */
665 				kfree(supercl, M_HAMMER);
666 				hammer_rel_volume(volume, 0);
667 				return;
668 			}
669 		}
670 		hammer_unlock(&supercl->io.lock);
671 	}
672 	hammer_unref(&supercl->io.lock);
673 }
674 
675 /************************************************************************
676  *				CLUSTERS				*
677  ************************************************************************
678  *
679  * Manage clusters.  Note that a cluster holds a reference to its
680  * associated volume.
681  */
682 hammer_cluster_t
683 hammer_get_cluster(hammer_volume_t volume, int32_t clu_no,
684 		   int *errorp, int isnew)
685 {
686 	hammer_cluster_t cluster;
687 
688 again:
689 	cluster = RB_LOOKUP(hammer_clu_rb_tree, &volume->rb_clus_root, clu_no);
690 	if (cluster == NULL) {
691 		cluster = kmalloc(sizeof(*cluster), M_HAMMER, M_WAITOK|M_ZERO);
692 		cluster->clu_no = clu_no;
693 		cluster->volume = volume;
694 		cluster->io.offset = calculate_cluster_offset(volume, clu_no);
695 		cluster->state = HAMMER_CLUSTER_IDLE;
696 		RB_INIT(&cluster->rb_bufs_root);
697 		RB_INIT(&cluster->rb_nods_root);
698 		cluster->io.type = HAMMER_STRUCTURE_CLUSTER;
699 		hammer_ref(&cluster->io.lock);
700 
701 		/*
702 		 * Insert the cluster into the RB tree and handle late
703 		 * collisions.
704 		 */
705 		if (RB_INSERT(hammer_clu_rb_tree, &volume->rb_clus_root, cluster)) {
706 			hammer_unref(&cluster->io.lock);
707 			kfree(cluster, M_HAMMER);
708 			goto again;
709 		}
710 		hammer_ref(&volume->io.lock);
711 	} else {
712 		hammer_ref(&cluster->io.lock);
713 	}
714 
715 	/*
716 	 * Deal with on-disk info
717 	 */
718 	if (cluster->ondisk == NULL || isnew) {
719 		*errorp = hammer_load_cluster(cluster, isnew);
720 		if (*errorp) {
721 			hammer_rel_cluster(cluster, 1);
722 			cluster = NULL;
723 		}
724 	} else {
725 		*errorp = 0;
726 	}
727 	return (cluster);
728 }
729 
730 hammer_cluster_t
731 hammer_get_root_cluster(struct hammer_mount *hmp, int *errorp)
732 {
733 	hammer_cluster_t cluster;
734 
735 	cluster = hmp->rootcl;
736 	KKASSERT(cluster != NULL);
737 	hammer_ref(&cluster->io.lock);
738 
739 	/*
740 	 * Deal with on-disk info
741 	 */
742 	if (cluster->ondisk == NULL) {
743 		*errorp = hammer_load_cluster(cluster, 0);
744 		if (*errorp) {
745 			hammer_rel_cluster(cluster, 1);
746 			cluster = NULL;
747 		}
748 	} else {
749 		*errorp = 0;
750 	}
751 	return (cluster);
752 }
753 
754 static
755 int
756 hammer_load_cluster(hammer_cluster_t cluster, int isnew)
757 {
758 	hammer_volume_t volume = cluster->volume;
759 	struct hammer_cluster_ondisk *ondisk;
760 	int error;
761 
762 	/*
763 	 * Load the cluster's on-disk info
764 	 */
765 	hammer_lock_ex(&cluster->io.lock);
766 	if (cluster->ondisk == NULL) {
767 		if (isnew)
768 			error = hammer_io_new(volume->devvp, &cluster->io);
769 		else
770 			error = hammer_io_read(volume->devvp, &cluster->io);
771 		if (error) {
772 			hammer_unlock(&cluster->io.lock);
773 			return (error);
774 		}
775 		cluster->ondisk = ondisk = (void *)cluster->io.bp->b_data;
776 
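		/*
		 * Configure the cluster's A-lists.  The master list
		 * allocates raw filesystem buffers within the cluster,
		 * while the btree, record, and mdata lists allocate
		 * typed elements within those buffers.
		 */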
777 		cluster->alist_master.config = &Clu_master_alist_config;
778 		cluster->alist_master.meta = ondisk->clu_master_meta;
779 		cluster->alist_btree.config = &Clu_slave_alist_config;
780 		cluster->alist_btree.meta = ondisk->clu_btree_meta;
781 		cluster->alist_btree.info = cluster;
782 		cluster->alist_record.config = &Clu_slave_alist_config;
783 		cluster->alist_record.meta = ondisk->clu_record_meta;
784 		cluster->alist_record.info = cluster;
785 		cluster->alist_mdata.config = &Clu_slave_alist_config;
786 		cluster->alist_mdata.meta = ondisk->clu_mdata_meta;
787 		cluster->alist_mdata.info = cluster;
788 
789 		cluster->clu_btree_beg = ondisk->clu_btree_beg;
790 		cluster->clu_btree_end = ondisk->clu_btree_end;
791 	} else if (isnew) {
792 		error = hammer_io_new(volume->devvp, &cluster->io);
793 	} else {
794 		error = 0;
795 	}
796 	if (error == 0 && isnew) {
797 		/*
798 		 * If this is a new cluster we have to initialize
799 		 * various ondisk structural elements.  The caller is
800 		 * responsible for the remainder.
801 		 */
802 		struct hammer_alist_live dummy;
803 
804 		ondisk = cluster->ondisk;
805 
806 		dummy.config = &Buf_alist_config;
807 		dummy.meta = ondisk->head.buf_almeta;
808 		dummy.info = NULL;
809 		initbuffer(&dummy, &ondisk->head, HAMMER_FSBUF_CLUSTER);
810 
811 		hammer_alist_init(&cluster->alist_master);
812 		hammer_alist_init(&cluster->alist_btree);
813 		hammer_alist_init(&cluster->alist_record);
814 		hammer_alist_init(&cluster->alist_mdata);
815 	}
816 	hammer_unlock(&cluster->io.lock);
817 	return (error);
818 }
819 
820 /*
821  * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
822  */
823 int
824 hammer_unload_cluster(hammer_cluster_t cluster, void *data __unused)
825 {
826 	hammer_ref(&cluster->io.lock);
827 	RB_SCAN(hammer_buf_rb_tree, &cluster->rb_bufs_root, NULL,
828 		hammer_unload_buffer, NULL);
829 	KKASSERT(cluster->io.lock.refs == 1);
830 	hammer_rel_cluster(cluster, 1);
831 	return(0);
832 }
833 
834 /*
835  * Reference a cluster that is either already referenced or reachable via a
836  * specially handled pointer (aka rootcl).
837  */
838 int
839 hammer_ref_cluster(hammer_cluster_t cluster)
840 {
841 	int error;
842 
843 	KKASSERT(cluster != NULL);
844 	hammer_ref(&cluster->io.lock);
845 
846 	/*
847 	 * Deal with on-disk info
848 	 */
849 	if (cluster->ondisk == NULL) {
850 		error = hammer_load_cluster(cluster, 0);
851 		if (error)
852 			hammer_rel_cluster(cluster, 1);
853 	} else {
854 		error = 0;
855 	}
856 	return(error);
857 }
858 
859 /*
860  * Release a cluster.  We have to deal with several places where
861  * another thread can ref the cluster.
862  *
863  * Only destroy the structure itself if the related buffer cache buffer
864  * was disassociated from it.  This ties the management of the structure
865  * to the buffer cache subsystem.
866  */
867 void
868 hammer_rel_cluster(hammer_cluster_t cluster, int flush)
869 {
870 	hammer_node_t node;
871 	hammer_volume_t volume;
872 
873 	if (cluster->io.lock.refs == 1) {
874 		hammer_lock_ex(&cluster->io.lock);
875 		if (cluster->io.lock.refs == 1) {
876 			/*
877 			 * Release the I/O.  If we or the kernel wants to
878 			 * flush, this will release the bp.  Otherwise the
879 			 * bp may be written and flushed passively by the
880 			 * kernel later on.
881 			 */
882 			hammer_io_release(&cluster->io, flush);
883 
884 			/*
885 			 * The B-Tree node cache is not counted in the
886 			 * cluster's reference count.  Clean out the
887 			 * cache.
888 			 *
889 			 * If the cluster acquires a new reference while we
890 			 * are trying to clean it out, abort the cleaning.
891 			 *
892 			 * Any actively referenced nodes will reference the
893 			 * related buffer and cluster, so a ref count check
894 			 * should be sufficient.
895 			 */
896 			while (cluster->io.bp == NULL &&
897 			       cluster->io.lock.refs == 1 &&
898 			       (node = RB_ROOT(&cluster->rb_nods_root)) != NULL
899 			) {
900 				KKASSERT(node->lock.refs == 0);
901 				hammer_flush_node(node);
902 			}
903 
904 			/*
905 			 * Final cleanup
906 			 */
907 			if (cluster->io.bp == NULL &&
908 			    cluster->io.lock.refs == 1 &&
909 			    RB_EMPTY(&cluster->rb_nods_root)) {
910 				KKASSERT(RB_EMPTY(&cluster->rb_bufs_root));
911 				volume = cluster->volume;
912 				RB_REMOVE(hammer_clu_rb_tree,
913 					  &volume->rb_clus_root, cluster);
914 				cluster->volume = NULL;	/* sanity */
915 				kfree(cluster, M_HAMMER);
916 				hammer_rel_volume(volume, 0);
917 				return;
918 			}
919 		}
920 		hammer_unlock(&cluster->io.lock);
921 	}
922 	hammer_unref(&cluster->io.lock);
923 }
924 
925 /************************************************************************
926  *				BUFFERS					*
927  ************************************************************************
928  *
929  * Manage buffers.  Note that a buffer holds a reference to its associated
930  * cluster, and its cluster will hold a reference to the cluster's volume.
931  *
932  * A non-zero buf_type indicates that a new buffer should be created and
933  * zero'd.
934  */
935 hammer_buffer_t
936 hammer_get_buffer(hammer_cluster_t cluster, int32_t buf_no,
937 		  u_int64_t buf_type, int *errorp)
938 {
939 	hammer_buffer_t buffer;
940 
941 	/*
942 	 * Find the buffer.  Note that buffer 0 corresponds to the cluster
943 	 * header and should never be requested.
944 	 */
945 	KKASSERT(buf_no >= cluster->ondisk->clu_start / HAMMER_BUFSIZE &&
946 		 buf_no < cluster->ondisk->clu_limit / HAMMER_BUFSIZE);
947 
948 	/*
949 	 * Locate and lock the buffer structure, creating one if necessary.
950 	 */
951 again:
952 	buffer = RB_LOOKUP(hammer_buf_rb_tree, &cluster->rb_bufs_root, buf_no);
953 	if (buffer == NULL) {
954 		buffer = kmalloc(sizeof(*buffer), M_HAMMER, M_WAITOK|M_ZERO);
955 		buffer->buf_no = buf_no;
956 		buffer->cluster = cluster;
957 		buffer->volume = cluster->volume;
958 		buffer->io.offset = cluster->io.offset +
959 				    (buf_no * HAMMER_BUFSIZE);
960 		buffer->io.type = HAMMER_STRUCTURE_BUFFER;
961 		TAILQ_INIT(&buffer->clist);
962 		hammer_ref(&buffer->io.lock);
963 
964 		/*
965 		 * Insert the buffer into the RB tree and handle late
966 		 * collisions.
967 		 */
968 		if (RB_INSERT(hammer_buf_rb_tree, &cluster->rb_bufs_root, buffer)) {
969 			hammer_unref(&buffer->io.lock);
970 			kfree(buffer, M_HAMMER);
971 			goto again;
972 		}
973 		hammer_ref(&cluster->io.lock);
974 	} else {
975 		hammer_ref(&buffer->io.lock);
976 	}
977 
978 	/*
979 	 * Deal with on-disk info
980 	 */
981 	if (buffer->ondisk == NULL || buf_type) {
982 		*errorp = hammer_load_buffer(buffer, buf_type);
983 		if (*errorp) {
984 			hammer_rel_buffer(buffer, 1);
985 			buffer = NULL;
986 		}
987 	} else {
988 		*errorp = 0;
989 	}
990 	return(buffer);
991 }
992 
993 static int
994 hammer_load_buffer(hammer_buffer_t buffer, u_int64_t buf_type)
995 {
996 	hammer_volume_t volume;
997 	hammer_fsbuf_ondisk_t ondisk;
998 	int error;
999 
1000 	/*
1001 	 * Load the buffer's on-disk info
1002 	 */
1003 	volume = buffer->volume;
1004 	hammer_lock_ex(&buffer->io.lock);
1005 	if (buffer->ondisk == NULL) {
1006 		if (buf_type) {
1007 			error = hammer_io_new(volume->devvp, &buffer->io);
1008 		} else {
1009 			error = hammer_io_read(volume->devvp, &buffer->io);
1010 		}
1011 		if (error) {
1012 			hammer_unlock(&buffer->io.lock);
1013 			return (error);
1014 		}
1015 		buffer->ondisk = ondisk = (void *)buffer->io.bp->b_data;
1016 		buffer->alist.config = &Buf_alist_config;
1017 		buffer->alist.meta = ondisk->head.buf_almeta;
1018 		buffer->buf_type = ondisk->head.buf_type;
1019 	} else if (buf_type) {
1020 		error = hammer_io_new(volume->devvp, &buffer->io);
1021 	} else {
1022 		error = 0;
1023 	}
1024 	if (error == 0 && buf_type) {
1025 		ondisk = buffer->ondisk;
1026 		initbuffer(&buffer->alist, &ondisk->head, buf_type);
1027 		buffer->buf_type = ondisk->head.buf_type;
1028 	}
1029 	hammer_unlock(&buffer->io.lock);
1030 	return (error);
1031 }
1032 
1033 /*
1034  * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
1035  */
1036 int
1037 hammer_unload_buffer(hammer_buffer_t buffer, void *data __unused)
1038 {
1039 	hammer_ref(&buffer->io.lock);
1040 	hammer_flush_buffer_nodes(buffer);
1041 	KKASSERT(buffer->io.lock.refs == 1);
1042 	hammer_rel_buffer(buffer, 1);
1043 	return(0);
1044 }
1045 
1046 /*
1047  * Reference a buffer that is either already referenced or reachable via a
1048  * specially handled pointer (aka cursor->buffer).
1049  */
1050 int
1051 hammer_ref_buffer(hammer_buffer_t buffer)
1052 {
1053 	int error;
1054 
1055 	hammer_ref(&buffer->io.lock);
1056 	if (buffer->ondisk == NULL) {
1057 		error = hammer_load_buffer(buffer, 0);
1058 		if (error) {
1059 			hammer_rel_buffer(buffer, 1);
1060 			/*
1061 			 * NOTE: buffer pointer can become stale after
1062 			 * the above release.
1063 			 */
1064 		} else {
1065 			KKASSERT(buffer->buf_type ==
1066 				 buffer->ondisk->head.buf_type);
1067 		}
1068 	} else {
1069 		error = 0;
1070 	}
1071 	return(error);
1072 }
1073 
1074 /*
1075  * Release a buffer.  We have to deal with several places where
1076  * another thread can ref the buffer.
1077  *
1078  * Only destroy the structure itself if the related buffer cache buffer
1079  * was disassociated from it.  This ties the management of the structure
1080  * to the buffer cache subsystem.  buffer->ondisk determines whether the
1081  * embedded io is referenced or not.
1082  */
1083 void
1084 hammer_rel_buffer(hammer_buffer_t buffer, int flush)
1085 {
1086 	hammer_cluster_t cluster;
1087 	hammer_node_t node;
1088 
1089 	if (buffer->io.lock.refs == 1) {
1090 		hammer_lock_ex(&buffer->io.lock);
1091 		if (buffer->io.lock.refs == 1) {
1092 			hammer_io_release(&buffer->io, flush);
1093 
1094 			/*
1095 			 * Clean out the B-Tree node cache, if any, then
1096 			 * clean up the cluster ref and free the buffer.
1097 			 *
1098 			 * If the buffer acquires a new reference while we
1099 			 * are trying to clean it out, abort the cleaning.
1100 			 */
1101 			while (buffer->io.bp == NULL &&
1102 			       buffer->io.lock.refs == 1 &&
1103 			       (node = TAILQ_FIRST(&buffer->clist)) != NULL
1104 			) {
1105 				KKASSERT(node->lock.refs == 0);
1106 				hammer_flush_node(node);
1107 			}
1108 			if (buffer->io.bp == NULL &&
1109 			    hammer_islastref(&buffer->io.lock)) {
1110 				cluster = buffer->cluster;
1111 				RB_REMOVE(hammer_buf_rb_tree,
1112 					  &cluster->rb_bufs_root, buffer);
1113 				buffer->cluster = NULL; /* sanity */
1114 				kfree(buffer, M_HAMMER);
1115 				hammer_rel_cluster(cluster, 0);
1116 				return;
1117 			}
1118 		}
1119 		hammer_unlock(&buffer->io.lock);
1120 	}
1121 	hammer_unref(&buffer->io.lock);
1122 }
1123 
1124 /*
1125  * Flush passively cached B-Tree nodes associated with this buffer.
1126  *
1127  * NOTE: The buffer is referenced and locked.
1128  */
1129 void
1130 hammer_flush_buffer_nodes(hammer_buffer_t buffer)
1131 {
1132 	hammer_node_t node;
1133 
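	/*
	 * Save the next node in buffer->save_scan before flushing;
	 * hammer_flush_node() can block, and hammer_remove_node_clist()
	 * will adjust save_scan if the saved node is ripped out while
	 * we are blocked.
	 */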
1134 	node = TAILQ_FIRST(&buffer->clist);
1135 	while (node) {
1136 		buffer->save_scan = TAILQ_NEXT(node, entry);
1137 		if (node->lock.refs == 0)
1138 			hammer_flush_node(node);
1139 		node = buffer->save_scan;
1140 	}
1141 }
1142 
1143 /************************************************************************
1144  *				NODES					*
1145  ************************************************************************
1146  *
1147  * Manage B-Tree nodes.  B-Tree nodes represent the primary indexing
1148  * method used by the HAMMER filesystem.
1149  *
1150  * Unlike other HAMMER structures, a hammer_node can be PASSIVELY
1151  * associated with its buffer.  It can have an active buffer reference
1152  * even when the node itself has no references.  The node also passively
1153  * associates itself with its cluster without holding any cluster refs.
1154  * The cluster ref is indirectly maintained by the active buffer ref when
1155  * a node is acquired.
1156  *
1157  * A hammer_node can also be passively associated with other HAMMER
1158  * structures, such as inodes, while retaining 0 references.  These
1159  * associations can be cleared backwards using a pointer-to-pointer in
1160  * the hammer_node.
1161  *
1162  * This allows the HAMMER implementation to cache hammer_node's long-term
1163  * and short-cut a great deal of the infrastructure's complexity.  In
1164  * most cases a cached node can be reacquired without having to dip into
1165  * either the buffer or cluster management code.
1166  *
1167  * The caller must pass a referenced cluster on call and will retain
1168  * ownership of the reference on return.  The node will acquire its own
1169  * additional references, if necessary.
1170  */
1171 hammer_node_t
1172 hammer_get_node(hammer_cluster_t cluster, int32_t node_offset, int *errorp)
1173 {
1174 	hammer_node_t node;
1175 
1176 	/*
1177 	 * Locate the structure, allocating one if necessary.
1178 	 */
1179 again:
1180 	node = RB_LOOKUP(hammer_nod_rb_tree, &cluster->rb_nods_root,
1181 			 node_offset);
1182 	if (node == NULL) {
1183 		node = kmalloc(sizeof(*node), M_HAMMER, M_WAITOK|M_ZERO);
1184 		node->node_offset = node_offset;
1185 		node->cluster = cluster;
1186 		if (RB_INSERT(hammer_nod_rb_tree, &cluster->rb_nods_root,
1187 			      node)) {
1188 			kfree(node, M_HAMMER);
1189 			goto again;
1190 		}
1191 	}
1192 	*errorp = hammer_ref_node(node);
1193 	if (*errorp) {
1194 		/*
1195 		 * NOTE: The node pointer may be stale on error return.
1196 		 * In fact, it's probably been destroyed.
1197 		 */
1198 		node = NULL;
1199 	}
1200 	return(node);
1201 }
1202 
1203 /*
1204  * Reference the node to prevent disassociations, then associate and
1205  * load the related buffer.  This routine can also be called to reference
1206  * a node from a cache pointer.
1207  *
1208  * NOTE: Because the caller does not have a ref on the node, the caller's
1209  * node pointer will be stale if an error is returned.  We may also wind
1210  * up clearing the related cache pointers.
1211  *
1212  * NOTE: The cluster is indirectly referenced by our buffer ref.
1213  */
1214 int
1215 hammer_ref_node(hammer_node_t node)
1216 {
1217 	hammer_buffer_t buffer;
1218 	int32_t buf_no;
1219 	int error;
1220 
1221 	hammer_ref(&node->lock);
1222 	error = 0;
1223 	if (node->ondisk == NULL) {
1224 		hammer_lock_ex(&node->lock);
1225 		if (node->ondisk == NULL) {
1226 			/*
1227 			 * This is a little confusing but the gist is that
1228 			 * node->buffer determines whether the node is on
1229 			 * the buffer's clist and node->ondisk determines
1230 			 * whether the buffer is referenced.
1231 			 */
1232 			if ((buffer = node->buffer) != NULL) {
1233 				error = hammer_ref_buffer(buffer);
1234 			} else {
1235 				buf_no = node->node_offset / HAMMER_BUFSIZE;
1236 				buffer = hammer_get_buffer(node->cluster,
1237 							   buf_no, 0, &error);
1238 				if (buffer) {
1239 					KKASSERT(error == 0);
1240 					TAILQ_INSERT_TAIL(&buffer->clist,
1241 							  node, entry);
1242 					node->buffer = buffer;
1243 				}
1244 			}
1245 			if (error == 0) {
1246 				node->ondisk = (void *)((char *)buffer->ondisk +
1247 				       (node->node_offset & HAMMER_BUFMASK));
1248 			}
1249 		}
1250 		hammer_unlock(&node->lock);
1251 	}
1252 	if (error)
1253 		hammer_rel_node(node);
1254 	return (error);
1255 }
1256 
1257 /*
1258  * Release a hammer_node.  The node retains a passive association with
1259  * its cluster, buffer and caches.
1260  *
1261  * However, to avoid cluttering up kernel memory with tons of B-Tree
1262  * node cache structures we destroy the node if no passive cache or
1263  * (instantiated) buffer references exist.
1264  */
1265 void
1266 hammer_rel_node(hammer_node_t node)
1267 {
1268 	hammer_cluster_t cluster;
1269 	hammer_buffer_t buffer;
1270 
1271 	if (hammer_islastref(&node->lock)) {
1272 		cluster = node->cluster;
1273 		/*
1274 		 * Clutter control: this case only occurs after a failed
1275 		 * load since otherwise ondisk will be non-NULL.
1276 		 */
1277 		if (node->cache1 == NULL && node->cache2 == NULL &&
1278 		    node->ondisk == NULL) {
1279 			RB_REMOVE(hammer_nod_rb_tree, &cluster->rb_nods_root,
1280 				  node);
1281 			if ((buffer = node->buffer) != NULL) {
1282 				node->buffer = NULL;
1283 				hammer_remove_node_clist(buffer, node);
1284 			}
1285 			kfree(node, M_HAMMER);
1286 			return;
1287 		}
1288 
1289 		/*
1290 		 * node->ondisk determines whether we have a buffer reference
1291 		 * to get rid of or not.  Only get rid of the reference if
1292 		 * the kernel tried to flush the buffer.
1293 		 *
1294 		 * NOTE: Once unref'd the node can be physically destroyed,
1295 		 * so our node is stale afterwords.
1296 			 * so our node is stale afterwards.
1297 		 * This case occurs if the node still has cache references.
1298 		 * We could remove the references and free the structure
1299 		 * but for now we allow them (and the node structure) to
1300 		 * remain intact.
1301 		 */
1302 		if (node->ondisk && hammer_io_checkflush(&node->buffer->io)) {
1303 			buffer = node->buffer;
1304 			node->buffer = NULL;
1305 			node->ondisk = NULL;
1306 			hammer_remove_node_clist(buffer, node);
1307 			hammer_unref(&node->lock);
1308 			hammer_rel_buffer(buffer, 0);
1309 		} else {
1310 			hammer_unref(&node->lock);
1311 		}
1312 	} else {
1313 		hammer_unref(&node->lock);
1314 	}
1315 }
1316 
1317 /*
1318  * Cache-and-release a hammer_node.  Kinda like catching and releasing a
1319  * fish, but keeping an eye on him.  The node is passively cached in *cache.
1320  *
1321  * NOTE!  HAMMER may NULL *cache at any time, even after you have
1322  * referenced the node!
1323  */
1324 void
1325 hammer_cache_node(hammer_node_t node, struct hammer_node **cache)
1326 {
1327 	if (node->cache1 != cache) {
1328 		if (node->cache2 == cache) {
1329 			struct hammer_node **tmp;
1330 			tmp = node->cache1;
1331 			node->cache1 = node->cache2;
1332 			node->cache2 = tmp;
1333 		} else {
1334 			if (node->cache2)
1335 				*node->cache2 = NULL;
1336 			node->cache2 = node->cache1;
1337 			node->cache1 = cache;
1338 			*cache = node;
1339 		}
1340 	}
1341 }
1342 
1343 void
1344 hammer_uncache_node(struct hammer_node **cache)
1345 {
1346 	hammer_node_t node;
1347 
1348 	if ((node = *cache) != NULL) {
1349 		*cache = NULL;
1350 		if (node->cache1 == cache) {
1351 			node->cache1 = node->cache2;
1352 			node->cache2 = NULL;
1353 		} else if (node->cache2 == cache) {
1354 			node->cache2 = NULL;
1355 		} else {
1356 			panic("hammer_uncache_node: missing cache linkage");
1357 		}
1358 		if (node->cache1 == NULL && node->cache2 == NULL &&
1359 		    node->lock.refs == 0) {
1360 			hammer_flush_node(node);
1361 		}
1362 	}
1363 }
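
/*
 * Illustrative sketch (not compiled): the intended use of the passive node
 * cache.  The cache pointer is assumed to live in a caller-owned structure;
 * "ip->cache" below is a hypothetical field used only for illustration.
 *
 *	hammer_cache_node(node, &ip->cache);	cache on the release path
 *	hammer_rel_node(node);
 *	...
 *	if ((node = ip->cache) != NULL &&	later, try the cache first;
 *	    hammer_ref_node(node) == 0) {	on error the pointer is stale
 *		... use node ...
 *		hammer_rel_node(node);
 *	}
 *	...
 *	hammer_uncache_node(&ip->cache);	clear the cache on teardown
 */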
1364 
1365 /*
1366  * Remove a node's cache references and destroy the node if it has no
1367  * references.  This is typically called from the buffer handling code.
1368  *
1369  * The node may have an active buffer reference (ondisk != NULL) even
1370  * if the node itself has no references.
1371  *
1372  * Note that a caller iterating through nodes via a buffer must have its
1373  * own reference on the buffer or our hammer_rel_buffer() call below may
1374  * rip it out from under the caller.
1375  */
1376 void
1377 hammer_flush_node(hammer_node_t node)
1378 {
1379 	hammer_buffer_t buffer;
1380 
1381 	if (node->cache1)
1382 		*node->cache1 = NULL;
1383 	if (node->cache2)
1384 		*node->cache2 = NULL;
1385 	if (node->lock.refs == 0) {
1386 		RB_REMOVE(hammer_nod_rb_tree, &node->cluster->rb_nods_root,
1387 			  node);
1388 		if ((buffer = node->buffer) != NULL) {
1389 			node->buffer = NULL;
1390 			hammer_remove_node_clist(buffer, node);
1391 			if (node->ondisk) {
1392 				node->ondisk = NULL;
1393 				hammer_rel_buffer(buffer, 0);
1394 			}
1395 		}
1396 		kfree(node, M_HAMMER);
1397 	}
1398 }
1399 
1400 /*
1401  * Remove a node from the buffer's clist.  Adjust save_scan as appropriate.
1402  * This is in its own little routine to properly handle interactions with
1403  * save_scan, so it is possible to block while scanning a buffer's node list.
1404  */
1405 static
1406 void
1407 hammer_remove_node_clist(hammer_buffer_t buffer, hammer_node_t node)
1408 {
1409 	if (buffer->save_scan == node)
1410 		buffer->save_scan = TAILQ_NEXT(node, entry);
1411 	TAILQ_REMOVE(&buffer->clist, node, entry);
1412 }
1413 
1414 /************************************************************************
1415  *				A-LIST ALLOCATORS			*
1416  ************************************************************************/
1417 
1418 /*
1419  * Allocate HAMMER elements - btree nodes, data storage, and record elements
1420  *
1421  * The passed *bufferp should be initialized to NULL.  On successive calls
1422  * *bufferp caches the most recent buffer used until put away by the caller.
1423  * Note that previously returned pointers using the cached buffer become
1424  * invalid on successive calls which reuse *bufferp.
1425  *
1426  * All allocations first attempt to use the block found at the specified
1427  * iterator.  If that fails the first available block is used.  If that
1428  * fails a new buffer is allocated and associated with the buffer type
1429  * A-list and the element is allocated out of the new buffer.
1430  */
1431 
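/*
 * Illustrative sketch (not compiled): the *bufferp caching convention used
 * by hammer_alloc_data() and hammer_alloc_record() below.  The caller owns
 * the reference cached in *bufferp and must eventually put it away.
 *
 *	hammer_buffer_t buffer = NULL;
 *	void *data;
 *	int error;
 *
 *	data = hammer_alloc_data(cluster, bytes, &error, &buffer);
 *	... further allocations may reuse or replace the cached buffer,
 *	    invalidating previously returned pointers ...
 *	if (buffer)
 *		hammer_rel_buffer(buffer, 0);
 */
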
1432 hammer_node_t
1433 hammer_alloc_btree(hammer_cluster_t cluster, int *errorp)
1434 {
1435 	hammer_buffer_t buffer;
1436 	hammer_alist_t live;
1437 	hammer_node_t node;
1438 	int32_t elm_no;
1439 	int32_t buf_no;
1440 	int32_t node_offset;
1441 
1442 	/*
1443 	 * Allocate a B-Tree element
1444 	 */
1445 	buffer = NULL;
1446 	live = &cluster->alist_btree;
1447 	elm_no = hammer_alist_alloc_fwd(live, 1, cluster->ondisk->idx_index);
1448 	if (elm_no == HAMMER_ALIST_BLOCK_NONE)
1449 		elm_no = hammer_alist_alloc_fwd(live, 1, 0);
1450 	if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1451 		alloc_new_buffer(cluster, live,
1452 				 HAMMER_FSBUF_BTREE, HAMMER_BTREE_NODES,
1453 				 cluster->ondisk->idx_index, errorp, &buffer);
1454 		elm_no = hammer_alist_alloc(live, 1);
1455 		if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1456 			*errorp = ENOSPC;
1457 			if (buffer)
1458 				hammer_rel_buffer(buffer, 0);
1459 			hammer_modify_cluster(cluster);
1460 			return(NULL);
1461 		}
1462 	}
1463 	cluster->ondisk->idx_index = elm_no;
1464 	KKASSERT((elm_no & HAMMER_FSBUF_BLKMASK) < HAMMER_BTREE_NODES);
1465 	hammer_modify_cluster(cluster);
1466 
1467 	/*
1468 	 * Load and return the B-Tree element
1469 	 */
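	/*
	 * elm_no encodes the buffer number (elm_no / HAMMER_FSBUF_MAXBLKS)
	 * and the node index within it (elm_no & HAMMER_FSBUF_BLKMASK).
	 */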
1470 	buf_no = elm_no / HAMMER_FSBUF_MAXBLKS;
1471 	node_offset = buf_no * HAMMER_BUFSIZE +
1472 		      offsetof(union hammer_fsbuf_ondisk,
1473 			       btree.nodes[elm_no & HAMMER_FSBUF_BLKMASK]);
1474 	node = hammer_get_node(cluster, node_offset, errorp);
1475 	if (node) {
1476 		bzero(node->ondisk, sizeof(*node->ondisk));
1477 	} else {
1478 		hammer_alist_free(live, elm_no, 1);
1479 		hammer_rel_node(node);
1480 		/* node is already NULL; there is no reference to release */
1482 	if (buffer)
1483 		hammer_rel_buffer(buffer, 0);
1484 	return(node);
1485 }
1486 
1487 void *
1488 hammer_alloc_data(hammer_cluster_t cluster, int32_t bytes,
1489 		  int *errorp, struct hammer_buffer **bufferp)
1490 {
1491 	hammer_buffer_t buffer;
1492 	hammer_alist_t live;
1493 	int32_t elm_no;
1494 	int32_t buf_no;
1495 	int32_t nblks;
1496 	void *item;
1497 
1498 	/*
1499 	 * Deal with large data blocks.  The blocksize is HAMMER_BUFSIZE
1500 	 * for these allocations.
1501 	 */
1502 	if ((bytes & HAMMER_BUFMASK) == 0) {
1503 		nblks = bytes / HAMMER_BUFSIZE;
1504 		/* only one block allowed for now (so buffer can hold it) */
1505 		KKASSERT(nblks == 1);
1506 
1507 		buf_no = hammer_alloc_master(cluster, nblks,
1508 					     cluster->ondisk->idx_ldata, 1);
1509 		if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
1510 			*errorp = ENOSPC;
1511 			return(NULL);
1512 		}
1513 		hammer_adjust_stats(cluster, HAMMER_FSBUF_DATA, nblks);
1514 		cluster->ondisk->idx_ldata = buf_no;
1515 		buffer = *bufferp;
1516 		*bufferp = hammer_get_buffer(cluster, buf_no, -1, errorp);
1517 		if (buffer)
1518 			hammer_rel_buffer(buffer, 0);
1519 		buffer = *bufferp;
1520 		return(buffer->ondisk);
1521 	}
1522 
1523 	/*
1524 	 * Allocate a data element.  The block size is HAMMER_DATA_BLKSIZE
1525 	 * (64 bytes) for these allocations.
1526 	 */
1527 	nblks = (bytes + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
1528 	nblks /= HAMMER_DATA_BLKSIZE;
1529 	live = &cluster->alist_mdata;
1530 	elm_no = hammer_alist_alloc_fwd(live, nblks, cluster->ondisk->idx_data);
1531 	if (elm_no == HAMMER_ALIST_BLOCK_NONE)
1532 		elm_no = hammer_alist_alloc_fwd(live, nblks, 0);
1533 	if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1534 		alloc_new_buffer(cluster, live,
1535 				 HAMMER_FSBUF_DATA, HAMMER_DATA_NODES,
1536 				 cluster->ondisk->idx_data, errorp, bufferp);
1537 		elm_no = hammer_alist_alloc(live, nblks);
1538 		if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1539 			*errorp = ENOSPC;
1540 			hammer_modify_cluster(cluster);
1541 			return(NULL);
1542 		}
1543 	}
1544 	cluster->ondisk->idx_data = elm_no;
1545 	hammer_modify_cluster(cluster);
1546 
1547 	/*
1548 	 * Load and return the data element
1549 	 */
1550 	buf_no = elm_no / HAMMER_FSBUF_MAXBLKS;
1551 	buffer = *bufferp;
1552 	if (buffer == NULL || buffer->cluster != cluster ||
1553 	    buffer->buf_no != buf_no) {
1554 		if (buffer)
1555 			hammer_rel_buffer(buffer, 0);
1556 		buffer = hammer_get_buffer(cluster, buf_no, 0, errorp);
1557 		*bufferp = buffer;
1558 	}
1559 	KKASSERT(buffer->ondisk->head.buf_type == HAMMER_FSBUF_DATA);
1560 	KKASSERT((elm_no & HAMMER_FSBUF_BLKMASK) < HAMMER_DATA_NODES);
1561 	item = &buffer->ondisk->data.data[elm_no & HAMMER_FSBUF_BLKMASK];
1562 	bzero(item, nblks * HAMMER_DATA_BLKSIZE);
1563 	*errorp = 0;
1564 	return(item);
1565 }
1566 
1567 void *
1568 hammer_alloc_record(hammer_cluster_t cluster,
1569 		    int *errorp, struct hammer_buffer **bufferp)
1570 {
1571 	hammer_buffer_t buffer;
1572 	hammer_alist_t live;
1573 	int32_t elm_no;
1574 	int32_t buf_no;
1575 	void *item;
1576 
1577 	/*
1578 	 * Allocate a record element
1579 	 */
1580 	live = &cluster->alist_record;
1581 	elm_no = hammer_alist_alloc_rev(live, 1, cluster->ondisk->idx_record);
1582 	if (elm_no == HAMMER_ALIST_BLOCK_NONE)
1583 		elm_no = hammer_alist_alloc_rev(live, 1,HAMMER_ALIST_BLOCK_MAX);
1584 	if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1585 		alloc_new_buffer(cluster, live,
1586 				 HAMMER_FSBUF_RECORDS, HAMMER_RECORD_NODES,
1587 				 cluster->ondisk->idx_record, errorp, bufferp);
1588 		elm_no = hammer_alist_alloc_rev(live, 1,HAMMER_ALIST_BLOCK_MAX);
1589 		kprintf("hammer_alloc_record elm again %08x\n", elm_no);
1590 		if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1591 			*errorp = ENOSPC;
1592 			hammer_modify_cluster(cluster);
1593 			return(NULL);
1594 		}
1595 	}
1596 	cluster->ondisk->idx_record = elm_no;
1597 	hammer_modify_cluster(cluster);
1598 
1599 	/*
1600 	 * Load and return the record element
1601 	 */
1602 	buf_no = elm_no / HAMMER_FSBUF_MAXBLKS;
1603 	buffer = *bufferp;
1604 	if (buffer == NULL || buffer->cluster != cluster ||
1605 	    buffer->buf_no != buf_no) {
1606 		if (buffer)
1607 			hammer_rel_buffer(buffer, 0);
1608 		buffer = hammer_get_buffer(cluster, buf_no, 0, errorp);
1609 		*bufferp = buffer;
1610 	}
1611 	KKASSERT(buffer->ondisk->head.buf_type == HAMMER_FSBUF_RECORDS);
1612 	KKASSERT((elm_no & HAMMER_FSBUF_BLKMASK) < HAMMER_RECORD_NODES);
1613 	item = &buffer->ondisk->record.recs[elm_no & HAMMER_FSBUF_BLKMASK];
1614 	bzero(item, sizeof(union hammer_record_ondisk));
1615 	*errorp = 0;
1616 	return(item);
1617 }
1618 
1619 void
1620 hammer_free_data_ptr(hammer_buffer_t buffer, void *data, int bytes)
1621 {
1622 	int32_t elm_no;
1623 	int32_t nblks;
1624 	hammer_alist_t live;
1625 
1626 	if ((bytes & HAMMER_BUFMASK) == 0) {
1627 		nblks = bytes / HAMMER_BUFSIZE;
1628 		KKASSERT(nblks == 1 && data == (void *)buffer->ondisk);
1629 		hammer_alist_free(&buffer->cluster->alist_master,
1630 				  buffer->buf_no, nblks);
1631 		hammer_adjust_stats(buffer->cluster, HAMMER_FSBUF_DATA, -nblks);
1632 		hammer_modify_cluster(buffer->cluster);
1633 		return;
1634 	}
1635 
1636 	elm_no = ((char *)data - (char *)buffer->ondisk->data.data) /
1637 		 HAMMER_DATA_BLKSIZE;
1638 	KKASSERT(elm_no >= 0 && elm_no < HAMMER_DATA_NODES);
1639 	elm_no += buffer->buf_no * HAMMER_FSBUF_MAXBLKS;
1640 	nblks = (bytes + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
1641 	nblks /= HAMMER_DATA_BLKSIZE;
1642 	live = &buffer->cluster->alist_mdata;
1643 	hammer_alist_free(live, elm_no, nblks);
1644 	hammer_modify_cluster(buffer->cluster);
1645 }
1646 
1647 void
1648 hammer_free_record_ptr(hammer_buffer_t buffer, union hammer_record_ondisk *rec)
1649 {
1650 	int32_t elm_no;
1651 	hammer_alist_t live;
1652 
1653 	elm_no = rec - &buffer->ondisk->record.recs[0];
1654 	KKASSERT(elm_no >= 0 && elm_no < HAMMER_RECORD_NODES);
1655 	elm_no += buffer->buf_no * HAMMER_FSBUF_MAXBLKS;
1656 	live = &buffer->cluster->alist_record;
1657 	hammer_alist_free(live, elm_no, 1);
1658 	hammer_modify_cluster(buffer->cluster);
1659 }
1660 
1661 void
1662 hammer_free_btree(hammer_cluster_t cluster, int32_t bclu_offset)
1663 {
1664 	const int32_t blksize = sizeof(struct hammer_node_ondisk);
1665 	int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK;
1666 	hammer_alist_t live;
1667 	int32_t elm_no;
1668 
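	/*
	 * Convert the cluster-relative offset back into a cluster-wide
	 * A-list element number: the buffer's base element plus the
	 * node's index within that buffer.
	 */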
1669 	elm_no = bclu_offset / HAMMER_BUFSIZE * HAMMER_FSBUF_MAXBLKS;
1670 	fsbuf_offset -= offsetof(union hammer_fsbuf_ondisk, btree.nodes[0]);
1671 	live = &cluster->alist_btree;
1672 	KKASSERT(fsbuf_offset >= 0 && fsbuf_offset % blksize == 0);
1673 	elm_no += fsbuf_offset / blksize;
1674 	hammer_alist_free(live, elm_no, 1);
1675 	hammer_modify_cluster(cluster);
1676 }
1677 
1678 void
1679 hammer_free_data(hammer_cluster_t cluster, int32_t bclu_offset, int32_t bytes)
1680 {
1681 	const int32_t blksize = HAMMER_DATA_BLKSIZE;
1682 	int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK;
1683 	hammer_alist_t live;
1684 	int32_t elm_no;
1685 	int32_t buf_no;
1686 	int32_t nblks;
1687 
1688 	if ((bytes & HAMMER_BUFMASK) == 0) {
1689 		nblks = bytes / HAMMER_BUFSIZE;
1690 		KKASSERT(nblks == 1 && (bclu_offset & HAMMER_BUFMASK) == 0);
1691 		buf_no = bclu_offset / HAMMER_BUFSIZE;
1692 		hammer_alist_free(&cluster->alist_master, buf_no, nblks);
1693 		hammer_adjust_stats(cluster, HAMMER_FSBUF_DATA, -nblks);
1694 		hammer_modify_cluster(cluster);
1695 		return;
1696 	}
1697 
1698 	elm_no = bclu_offset / HAMMER_BUFSIZE * HAMMER_FSBUF_MAXBLKS;
1699 	fsbuf_offset -= offsetof(union hammer_fsbuf_ondisk, data.data[0][0]);
1700 	live = &cluster->alist_mdata;
1701 	nblks = (bytes + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
1702 	nblks /= HAMMER_DATA_BLKSIZE;
1703 	KKASSERT(fsbuf_offset >= 0 && fsbuf_offset % blksize == 0);
1704 	elm_no += fsbuf_offset / blksize;
1705 	hammer_alist_free(live, elm_no, nblks);
1706 	hammer_modify_cluster(cluster);
1707 }
1708 
1709 void
1710 hammer_free_record(hammer_cluster_t cluster, int32_t bclu_offset)
1711 {
1712 	const int32_t blksize = sizeof(union hammer_record_ondisk);
1713 	int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK;
1714 	hammer_alist_t live;
1715 	int32_t elm_no;
1716 
1717 	elm_no = bclu_offset / HAMMER_BUFSIZE * HAMMER_FSBUF_MAXBLKS;
1718 	fsbuf_offset -= offsetof(union hammer_fsbuf_ondisk, record.recs[0]);
1719 	live = &cluster->alist_record;
1720 	KKASSERT(fsbuf_offset >= 0 && fsbuf_offset % blksize == 0);
1721 	elm_no += fsbuf_offset / blksize;
1722 	hammer_alist_free(live, elm_no, 1);
1723 	hammer_modify_cluster(cluster);
1724 }
1725 
1726 
1727 /*
1728  * Allocate a new filesystem buffer and assign it to the specified
1729  * filesystem buffer type.  The new buffer will be added to the
1730  * type-specific A-list and initialized.
1731  */
1732 static void
1733 alloc_new_buffer(hammer_cluster_t cluster, hammer_alist_t live,
1734 		 u_int64_t type, int32_t nelements,
1735 		 int start, int *errorp, struct hammer_buffer **bufferp)
1736 {
1737 	hammer_buffer_t buffer;
1738 	int32_t buf_no;
1739 	int isfwd;
1740 
1741 	start = start / HAMMER_FSBUF_MAXBLKS;	/* convert to buf_no */
1742 
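	/*
	 * Record buffers are allocated from the top of the cluster
	 * downwards; all other buffer types are allocated upwards from
	 * the bottom, so the two regions grow towards each other.
	 */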
1743 	isfwd = (type != HAMMER_FSBUF_RECORDS);
1744 	buf_no = hammer_alloc_master(cluster, 1, start, isfwd);
1745 	KKASSERT(buf_no != HAMMER_ALIST_BLOCK_NONE); /* XXX */
1746 	hammer_modify_cluster(cluster);
1747 
1748 	/*
1749 	 * The new buffer must be initialized (type != 0) regardless of
1750 	 * whether we already have it cached or not, so don't try to
1751 	 * optimize the cached buffer check.  Just call hammer_get_buffer().
1752 	 */
1753 	buffer = hammer_get_buffer(cluster, buf_no, type, errorp);
1754 	if (*bufferp)
1755 		hammer_rel_buffer(*bufferp, 0);
1756 	*bufferp = buffer;
1757 
1758 	/*
1759 	 * Finally, do a meta-free of the buffer's elements into the
1760 	 * type-specific A-list and update our statistics to reflect
1761 	 * the allocation.
1762 	 */
1763 	if (buffer) {
1764 		kprintf("alloc_new_buffer buf_no %d type %016llx nelms %d\n",
1765 			buf_no, type, nelements);
1766 		hammer_alist_free(live, buf_no * HAMMER_FSBUF_MAXBLKS,
1767 				  nelements);
1768 		hammer_adjust_stats(cluster, type, 1);
1769 	}
1770 }
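
/*
 * Illustrative sketch only (not compiled): the *bufferp caching convention
 * used by alloc_new_buffer() and the allocators that call it.  The caller
 * keeps one cached buffer reference across repeated allocations; each call
 * releases the previously cached buffer and replaces it with the newly
 * initialized one, so only the final reference has to be dropped.  The
 * function name and the nelements/start values below are hypothetical.
 */
#if 0
static void
example_alloc_two_buffers(hammer_cluster_t cluster, hammer_alist_t live,
			  u_int64_t type, int32_t nelements, int *errorp)
{
	hammer_buffer_t buffer = NULL;

	/* first call: *bufferp is NULL, nothing to release */
	alloc_new_buffer(cluster, live, type, nelements, 0, errorp, &buffer);

	/* second call: the buffer cached above is released internally */
	alloc_new_buffer(cluster, live, type, nelements, 0, errorp, &buffer);

	/* drop the final cached reference */
	if (buffer)
		hammer_rel_buffer(buffer, 0);
}
#endif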
1771 
1772 /*
1773  * Sync dirty buffers to the media
1774  */
1775 
1776 /*
1777  * Sync the entire filesystem.
1778  */
1779 int
1780 hammer_sync_hmp(hammer_mount_t hmp, int waitfor)
1781 {
1782 	struct hammer_sync_info info;
1783 
1784 	info.error = 0;
1785 	info.waitfor = waitfor;
1786 
1787 	RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
1788 		hammer_sync_volume, &info);
1789 	return(info.error);
1790 }
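
/*
 * Illustrative sketch only (not compiled): how a VFS-level sync entry
 * point might drive hammer_sync_hmp().  The function name and the
 * mnt_data cast are hypothetical; the real VFS glue lives elsewhere.
 * waitfor is the usual MNT_WAIT/MNT_NOWAIT flag and is simply passed
 * through to the per-buffer I/O flush.
 */
#if 0
static int
example_vfs_sync(struct mount *mp, int waitfor)
{
	hammer_mount_t hmp = (void *)mp->mnt_data;

	return(hammer_sync_hmp(hmp, waitfor));
}
#endif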
1791 
1792 int
1793 hammer_sync_volume(hammer_volume_t volume, void *data)
1794 {
1795 	struct hammer_sync_info *info = data;
1796 
1797 	RB_SCAN(hammer_clu_rb_tree, &volume->rb_clus_root, NULL,
1798 		hammer_sync_cluster, info);
1799 	if (hammer_ref_volume(volume) == 0) {
1800 		hammer_io_flush(&volume->io, info);
1801 		hammer_rel_volume(volume, 0);
1802 	}
1803 	return(0);
1804 }
1805 
1806 int
1807 hammer_sync_cluster(hammer_cluster_t cluster, void *data)
1808 {
1809 	struct hammer_sync_info *info = data;
1810 
1811 	if (cluster->state != HAMMER_CLUSTER_IDLE) {
1812 		RB_SCAN(hammer_buf_rb_tree, &cluster->rb_bufs_root, NULL,
1813 			hammer_sync_buffer, info);
1814 		if (hammer_ref_cluster(cluster) == 0) {
1815 			hammer_io_flush(&cluster->io, info);
1816 			hammer_rel_cluster(cluster, 0);
1817 		}
1818 	}
1819 	return(0);
1820 }
1821 
1822 int
1823 hammer_sync_buffer(hammer_buffer_t buffer, void *data)
1824 {
1825 	struct hammer_sync_info *info = data;
1826 
1827 	if (hammer_ref_buffer(buffer) == 0) {
1828 		hammer_lock_ex(&buffer->io.lock);
1829 		hammer_flush_buffer_nodes(buffer);
1830 		hammer_unlock(&buffer->io.lock);
1831 		hammer_io_flush(&buffer->io, info);
1832 		hammer_rel_buffer(buffer, 0);
1833 	}
1834 	return(0);
1835 }
1836 
1837 #if 0
1838 
1839 /*
1840  * Flush various tracking structures to disk
1841  */
1846 void
1847 flush_all_volumes(void)
1848 {
1849 	hammer_volume_t vol;
1850 
1851 	for (vol = VolBase; vol; vol = vol->next)
1852 		flush_volume(vol);
1853 }
1854 
1855 void
1856 flush_volume(hammer_volume_t vol)
1857 {
1858 	hammer_supercl_t supercl;
1859 	hammer_cluster_t cl;
1860 
1861 	for (supercl = vol->supercl_base; supercl; supercl = supercl->next)
1862 		flush_supercl(supercl);
1863 	for (cl = vol->cluster_base; cl; cl = cl->next)
1864 		flush_cluster(cl);
1865 	writehammerbuf(vol, vol->ondisk, 0);
1866 }
1867 
1868 void
1869 flush_supercl(hammer_supercl_t supercl)
1870 {
1871 	int64_t supercl_offset;
1872 
1873 	supercl_offset = supercl->scl_offset;
1874 	writehammerbuf(supercl->volume, supercl->ondisk, supercl_offset);
1875 }
1876 
1877 void
1878 flush_cluster(hammer_cluster_t cl)
1879 {
1880 	hammer_buffer_t buf;
1881 	int64_t cluster_offset;
1882 
1883 	for (buf = cl->buffer_base; buf; buf = buf->next)
1884 		flush_buffer(buf);
1885 	cluster_offset = cl->clu_offset;
1886 	writehammerbuf(cl->volume, cl->ondisk, cluster_offset);
1887 }
1888 
1889 void
1890 flush_buffer(hammer_buffer_t buf)
1891 {
1892 	int64_t buffer_offset;
1893 
1894 	buffer_offset = buf->buf_offset + buf->cluster->clu_offset;
1895 	writehammerbuf(buf->volume, buf->ondisk, buffer_offset);
1896 }
1897 
1898 #endif
1899 
1900 /*
1901  * Generic buffer initialization
1902  */
1903 static void
1904 initbuffer(hammer_alist_t live, hammer_fsbuf_head_t head, u_int64_t type)
1905 {
1906 	head->buf_type = type;
1907 	hammer_alist_init(live);
1908 }
1909 
1910 /*
1911  * Calculate the cluster's offset in the volume.  This calculation is
1912  * slightly more complex when using superclusters because superclusters
1913  * are grouped in blocks of 16, followed by 16 x N clusters where N
1914  * is the number of clusters a supercluster can manage.
1915  */
1916 static int64_t
1917 calculate_cluster_offset(hammer_volume_t volume, int32_t clu_no)
1918 {
1919 	int32_t scl_group;
1920 	int64_t scl_group_size;
1921 	int64_t off;
1922 
1923 	if (volume->vol_flags & HAMMER_VOLF_USINGSUPERCL) {
1924 		scl_group = clu_no / HAMMER_VOL_SUPERCLUSTER_GROUP /
1925 			    HAMMER_SCL_MAXCLUSTERS;
1926 		scl_group_size =
1927 			    ((int64_t)HAMMER_BUFSIZE *
1928 			     HAMMER_VOL_SUPERCLUSTER_GROUP) +
1929 			    ((int64_t)HAMMER_VOL_SUPERCLUSTER_GROUP *
1930 			     volume->vol_clsize * HAMMER_SCL_MAXCLUSTERS);
1931 		scl_group_size +=
1932 			    HAMMER_VOL_SUPERCLUSTER_GROUP * HAMMER_BUFSIZE;
1933 
1934 		off = volume->cluster_base +
1935 		      scl_group * scl_group_size +
1936 		      (HAMMER_BUFSIZE * HAMMER_VOL_SUPERCLUSTER_GROUP) +
1937 		      ((int64_t)clu_no % ((int64_t)HAMMER_SCL_MAXCLUSTERS *
1938 		       HAMMER_VOL_SUPERCLUSTER_GROUP)) *
1939 		      volume->vol_clsize;
1940 	} else {
1941 		off = volume->cluster_base +
1942 		      (int64_t)clu_no * volume->vol_clsize;
1943 	}
1944 	return(off);
1945 }
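
/*
 * Illustrative sketch only (not compiled): the decomposition performed by
 * calculate_cluster_offset() when superclusters are in use.  A cluster
 * number splits into a super-cluster group and an index within that group;
 * the byte offset is the start of the group, plus the group's block of
 * super-cluster headers, plus the in-group index scaled by the cluster
 * size.  scl_group_size is assumed to have been computed as above.
 */
#if 0
static int64_t
example_cluster_offset(hammer_volume_t volume, int32_t clu_no,
		       int64_t scl_group_size)
{
	int64_t clusters_per_group;
	int32_t scl_group;
	int64_t in_group;

	clusters_per_group = (int64_t)HAMMER_VOL_SUPERCLUSTER_GROUP *
			     HAMMER_SCL_MAXCLUSTERS;
	scl_group = clu_no / clusters_per_group;
	in_group = clu_no % clusters_per_group;

	return(volume->cluster_base +
	       scl_group * scl_group_size +
	       (int64_t)HAMMER_BUFSIZE * HAMMER_VOL_SUPERCLUSTER_GROUP +
	       in_group * volume->vol_clsize);
}
#endif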
1946 
1947 /*
1948  * Calculate a super-cluster's offset in the volume.
1949  */
1950 static int64_t
1951 calculate_supercl_offset(hammer_volume_t volume, int32_t scl_no)
1952 {
1953 	int64_t off;
1954 	int32_t scl_group;
1955 	int64_t scl_group_size;
1956 
1957 	KKASSERT (volume->vol_flags & HAMMER_VOLF_USINGSUPERCL);
1958 	scl_group = scl_no / HAMMER_VOL_SUPERCLUSTER_GROUP;
1959 	if (scl_group) {
1960 		scl_group_size =
1961 			    ((int64_t)HAMMER_BUFSIZE *
1962 			     HAMMER_VOL_SUPERCLUSTER_GROUP) +
1963 			    ((int64_t)HAMMER_VOL_SUPERCLUSTER_GROUP *
1964 			     volume->vol_clsize * HAMMER_SCL_MAXCLUSTERS);
1965 		scl_group_size +=
1966 			    HAMMER_VOL_SUPERCLUSTER_GROUP * HAMMER_BUFSIZE;
1967 		off = volume->cluster_base + (scl_group * scl_group_size) +
1968 		      (scl_no % HAMMER_VOL_SUPERCLUSTER_GROUP) * HAMMER_BUFSIZE;
1969 	} else {
1970 		off = volume->cluster_base + (scl_no * HAMMER_BUFSIZE);
1971 	}
1972 	return(off);
1973 }
1974 
1975 /*
1976  * Allocate nblks buffers from the cluster's master A-list, scanning
1977  * forward or backward from the start hint and wrapping to the far end
1978  * of the A-list if the initial scan fails.
1979  */
1979 static int32_t
1980 hammer_alloc_master(hammer_cluster_t cluster, int nblks,
1981 		    int32_t start, int isfwd)
1982 {
1983 	int32_t buf_no;
1984 
1985 	if (isfwd) {
1986 		buf_no = hammer_alist_alloc_fwd(&cluster->alist_master,
1987 						nblks, start);
1988 		if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
1989 			buf_no = hammer_alist_alloc_fwd(&cluster->alist_master,
1990 						nblks, 0);
1991 		}
1992 	} else {
1993 		buf_no = hammer_alist_alloc_rev(&cluster->alist_master,
1994 						nblks, start);
1995 		if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
1996 			buf_no = hammer_alist_alloc_rev(&cluster->alist_master,
1997 						nblks, HAMMER_ALIST_BLOCK_MAX);
1998 		}
1999 	}
2000 
2001 	/*
2002 	 * Recover space from empty record, b-tree, and data a-lists.
2003 	 */
2004 
2005 	return(buf_no);
2006 }
2007 
2008 /*
2009  * Adjust allocation statistics
2010  */
2011 static void
2012 hammer_adjust_stats(hammer_cluster_t cluster, u_int64_t buf_type, int nblks)
2013 {
2014 	switch(buf_type) {
2015 	case HAMMER_FSBUF_BTREE:
2016 		cluster->ondisk->stat_idx_bufs += nblks;
2017 		cluster->volume->ondisk->vol_stat_idx_bufs += nblks;
2018 		cluster->volume->hmp->rootvol->ondisk->vol0_stat_idx_bufs += nblks;
2019 		break;
2020 	case HAMMER_FSBUF_DATA:
2021 		cluster->ondisk->stat_data_bufs += nblks;
2022 		cluster->volume->ondisk->vol_stat_data_bufs += nblks;
2023 		cluster->volume->hmp->rootvol->ondisk->vol0_stat_data_bufs += nblks;
2024 		break;
2025 	case HAMMER_FSBUF_RECORDS:
2026 		cluster->ondisk->stat_rec_bufs += nblks;
2027 		cluster->volume->ondisk->vol_stat_rec_bufs += nblks;
2028 		cluster->volume->hmp->rootvol->ondisk->vol0_stat_rec_bufs += nblks;
2029 		break;
2030 	}
2031 	hammer_modify_cluster(cluster);
2032 	hammer_modify_volume(cluster->volume);
2033 	hammer_modify_volume(cluster->volume->hmp->rootvol);
2034 }
2035 
2036 /*
2037  * A-LIST SUPPORT
2038  *
2039  * Set up the parameters for the various A-lists we use in hammer.  The
2040  * volume's super A-list must be chained to the per-supercluster A-lists,
2041  * and each cluster's slave A-lists are chained to the per-buffer A-lists.
2042  *
2043  * See hammer_init_alist_config() below.
2044  */
2045 
2046 /*
2047  * A-LIST - cluster recursion into a filesystem buffer
2048  */
2049 static int
2050 buffer_alist_init(void *info, int32_t blk, int32_t radix)
2051 {
2052 	return(0);
2053 #if 0
2054 	hammer_cluster_t cluster = info;
2055 	hammer_buffer_t buffer;
2056 	int32_t buf_no;
2057 	int error = 0;
2058 
2059 	/*
2060 	 * Calculate the buffer number, initialize based on the buffer type.
2061 	 * The buffer has already been allocated so assert that it has been
2062 	 * initialized.
2063 	 */
2064 	buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2065 	buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2066 	if (buffer) {
2067 		hammer_adjust_stats(cluster, buffer->ondisk->head.buf_type, 1);
2068 		hammer_rel_buffer(buffer, 0);
2069 	}
2070 	return (error);
2071 #endif
2072 }
2073 
2074 static int
2075 buffer_alist_destroy(void *info, int32_t blk, int32_t radix)
2076 {
2077 	return(0);
2078 #if 0
2079 	hammer_cluster_t cluster = info;
2080 	hammer_buffer_t buffer;
2081 	int32_t buf_no;
2082 	int error = 0;
2083 
2084 	/*
2085 	 * Calculate the buffer number, initialize based on the buffer type.
2086 	 * The buffer has already been allocated so assert that it has been
2087 	 * initialized.
2088 	 */
2089 	buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2090 	buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2091 	if (buffer) {
2092 		hammer_adjust_stats(cluster, buffer->ondisk->head.buf_type, -1);
2093 		hammer_rel_buffer(buffer, 0);
2094 	}
2095 	return (error);
2096 #endif
2097 }
2098 
2099 /*
2100  * Note: atblk can be negative and atblk - blk can go negative.
2101  */
2102 static int
2103 buffer_alist_alloc_fwd(void *info, int32_t blk, int32_t radix,
2104                       int32_t count, int32_t atblk, int32_t *fullp)
2105 {
2106 	hammer_cluster_t cluster = info;
2107 	hammer_buffer_t buffer;
2108 	int32_t buf_no;
2109 	int32_t r;
2110 	int error = 0;
2111 
2112 	buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2113 	buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2114 	if (buffer) {
2115 		KKASSERT(buffer->ondisk->head.buf_type != 0);
2116 
2117 		r = hammer_alist_alloc_fwd(&buffer->alist, count, atblk - blk);
2118 		if (r != HAMMER_ALIST_BLOCK_NONE)
2119 			r += blk;
2120 		*fullp = hammer_alist_isfull(&buffer->alist);
2121 		hammer_modify_buffer(buffer);
2122 		hammer_rel_buffer(buffer, 0);
2123 	} else {
2124 		r = HAMMER_ALIST_BLOCK_NONE;
		*fullp = 0;
2125 	}
2126 	return(r);
2127 }
2128 
2129 static int
2130 buffer_alist_alloc_rev(void *info, int32_t blk, int32_t radix,
2131                       int32_t count, int32_t atblk, int32_t *fullp)
2132 {
2133 	hammer_cluster_t cluster = info;
2134 	hammer_buffer_t buffer;
2135 	int32_t buf_no;
2136 	int32_t r;
2137 	int error = 0;
2138 
2139 	buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2140 	buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2141 	if (buffer) {
2142 		KKASSERT(buffer->ondisk->head.buf_type != 0);
2143 
2144 		r = hammer_alist_alloc_rev(&buffer->alist, count, atblk - blk);
2145 		if (r != HAMMER_ALIST_BLOCK_NONE)
2146 			r += blk;
2147 		*fullp = hammer_alist_isfull(&buffer->alist);
2148 		hammer_modify_buffer(buffer);
2149 		hammer_rel_buffer(buffer, 0);
2150 	} else {
2151 		r = HAMMER_ALIST_BLOCK_NONE;
2152 		*fullp = 0;
2153 	}
2154 	return(r);
2155 }
2156 
2157 static void
2158 buffer_alist_free(void *info, int32_t blk, int32_t radix,
2159                  int32_t base_blk, int32_t count, int32_t *emptyp)
2160 {
2161 	hammer_cluster_t cluster = info;
2162 	hammer_buffer_t buffer;
2163 	int32_t buf_no;
2164 	int error = 0;
2165 
2166 	buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2167 	buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2168 	if (buffer) {
2169 		KKASSERT(buffer->ondisk->head.buf_type != 0);
2170 		hammer_alist_free(&buffer->alist, base_blk, count);
2171 		*emptyp = hammer_alist_isempty(&buffer->alist);
2172 		/* XXX don't bother updating if the buffer is completely empty? */
2173 		hammer_modify_buffer(buffer);
2174 		hammer_rel_buffer(buffer, 0);
2175 	} else {
2176 		*emptyp = 0;
2177 	}
2178 }
2179 
2180 static void
2181 buffer_alist_print(void *info, int32_t blk, int32_t radix, int tab)
2182 {
2183 }
2184 
2185 /*
2186  * A-LIST - super-cluster recursion into a cluster and cluster recursion
2187  * into a filesystem buffer.  A-Lists are mostly self-contained entities,
2188  * but callbacks must be installed to recurse from one A-List to another.
2189  *
2190  * Implementing these callbacks allows us to operate a multi-layered A-List
2191  * as a single entity.
2192  */
2193 static int
2194 super_alist_init(void *info, int32_t blk, int32_t radix)
2195 {
2196 	hammer_volume_t volume = info;
2197 	hammer_supercl_t supercl;
2198 	int32_t scl_no;
2199 	int error = 0;
2200 
2201 	/*
2202 	 * Calculate the super-cluster number containing the cluster (blk)
2203 	 * and obtain the super-cluster buffer.
2204 	 */
2205 	scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2206 	supercl = hammer_get_supercl(volume, scl_no, &error, 1);
2207 	if (supercl)
2208 		hammer_rel_supercl(supercl, 0);
2209 	return (error);
2210 }
2211 
2212 static int
2213 super_alist_destroy(void *info, int32_t blk, int32_t radix)
2214 {
2215 	return(0);
2216 }
2217 
2218 static int
2219 super_alist_alloc_fwd(void *info, int32_t blk, int32_t radix,
2220 		      int32_t count, int32_t atblk, int32_t *fullp)
2221 {
2222 	hammer_volume_t volume = info;
2223 	hammer_supercl_t supercl;
2224 	int32_t scl_no;
2225 	int32_t r;
2226 	int error = 0;
2227 
2228 	scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2229 	supercl = hammer_get_supercl(volume, scl_no, &error, 1);
2230 	if (supercl) {
2231 		r = hammer_alist_alloc_fwd(&supercl->alist, count, atblk - blk);
2232 		if (r != HAMMER_ALIST_BLOCK_NONE)
2233 			r += blk;
2234 		*fullp = hammer_alist_isfull(&supercl->alist);
2235 		hammer_modify_supercl(supercl);
2236 		hammer_rel_supercl(supercl, 0);
2237 	} else {
2238 		r = HAMMER_ALIST_BLOCK_NONE;
2239 		*fullp = 0;
2240 	}
2241 	return(r);
2242 }
2243 
2244 static int
2245 super_alist_alloc_rev(void *info, int32_t blk, int32_t radix,
2246 		      int32_t count, int32_t atblk, int32_t *fullp)
2247 {
2248 	hammer_volume_t volume = info;
2249 	hammer_supercl_t supercl;
2250 	int32_t scl_no;
2251 	int32_t r;
2252 	int error = 0;
2253 
2254 	scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2255 	supercl = hammer_get_supercl(volume, scl_no, &error, 1);
2256 	if (supercl) {
2257 		r = hammer_alist_alloc_rev(&supercl->alist, count, atblk - blk);
2258 		if (r != HAMMER_ALIST_BLOCK_NONE)
2259 			r += blk;
2260 		*fullp = hammer_alist_isfull(&supercl->alist);
2261 		hammer_modify_supercl(supercl);
2262 		hammer_rel_supercl(supercl, 0);
2263 	} else {
2264 		r = HAMMER_ALIST_BLOCK_NONE;
2265 		*fullp = 0;
2266 	}
2267 	return(r);
2268 }
2269 
2270 static void
2271 super_alist_free(void *info, int32_t blk, int32_t radix,
2272 		 int32_t base_blk, int32_t count, int32_t *emptyp)
2273 {
2274 	hammer_volume_t volume = info;
2275 	hammer_supercl_t supercl;
2276 	int32_t scl_no;
2277 	int error = 0;
2278 
2279 	scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2280 	supercl = hammer_get_supercl(volume, scl_no, &error, 1);
2281 	if (supercl) {
2282 		hammer_alist_free(&supercl->alist, base_blk, count);
2283 		*emptyp = hammer_alist_isempty(&supercl->alist);
2284 		hammer_modify_supercl(supercl);
2285 		hammer_rel_supercl(supercl, 0);
2286 	} else {
2287 		*emptyp = 0;
2288 	}
2289 }
2290 
2291 static void
2292 super_alist_print(void *info, int32_t blk, int32_t radix, int tab)
2293 {
2294 }
2295 
2296 void
2297 hammer_init_alist_config(void)
2298 {
2299 	hammer_alist_config_t config;
2300 
2301 	hammer_alist_template(&Buf_alist_config, HAMMER_FSBUF_MAXBLKS,
2302 			      1, HAMMER_FSBUF_METAELMS);
2303 	hammer_alist_template(&Vol_normal_alist_config, HAMMER_VOL_MAXCLUSTERS,
2304 			      1, HAMMER_VOL_METAELMS_1LYR);
2305 	hammer_alist_template(&Vol_super_alist_config,
2306 			  HAMMER_VOL_MAXSUPERCLUSTERS * HAMMER_SCL_MAXCLUSTERS,
2307 			      HAMMER_SCL_MAXCLUSTERS, HAMMER_VOL_METAELMS_2LYR);
2308 	hammer_alist_template(&Supercl_alist_config, HAMMER_VOL_MAXCLUSTERS,
2309 			      1, HAMMER_SUPERCL_METAELMS);
2310 	hammer_alist_template(&Clu_master_alist_config, HAMMER_CLU_MAXBUFFERS,
2311 			      1, HAMMER_CLU_MASTER_METAELMS);
2312 	hammer_alist_template(&Clu_slave_alist_config,
2313 			      HAMMER_CLU_MAXBUFFERS * HAMMER_FSBUF_MAXBLKS,
2314 			      HAMMER_FSBUF_MAXBLKS, HAMMER_CLU_SLAVE_METAELMS);
2315 
2316 	config = &Vol_super_alist_config;
2317 	config->bl_radix_init = super_alist_init;
2318 	config->bl_radix_destroy = super_alist_destroy;
2319 	config->bl_radix_alloc_fwd = super_alist_alloc_fwd;
2320 	config->bl_radix_alloc_rev = super_alist_alloc_rev;
2321 	config->bl_radix_free = super_alist_free;
2322 	config->bl_radix_print = super_alist_print;
2323 
2324 	config = &Clu_slave_alist_config;
2325 	config->bl_radix_init = buffer_alist_init;
2326 	config->bl_radix_destroy = buffer_alist_destroy;
2327 	config->bl_radix_alloc_fwd = buffer_alist_alloc_fwd;
2328 	config->bl_radix_alloc_rev = buffer_alist_alloc_rev;
2329 	config->bl_radix_free = buffer_alist_free;
2330 	config->bl_radix_print = buffer_alist_print;
2331 }
2332 
2333