xref: /dragonfly/sys/vfs/hammer/hammer_io.c (revision 0ca59c34)
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * IO Primitives and buffer cache management
36  *
37  * All major data-tracking structures in HAMMER contain a struct hammer_io
38  * which is used to manage their backing store.  We use filesystem buffers
39  * for backing store and we leave them passively associated with their
40  * HAMMER structures.
41  *
42  * If the kernel tries to destroy a passively associated buf which we cannot
43  * yet let go of, we set B_LOCKED in the buffer and then actively release it
44  * later when we can.
45  *
46  * The io_token is required for anything which might race bioops and bio_done
47  * callbacks, with one exception: A successful hammer_try_interlock_norefs().
48  * The fs_token will be held in all other cases.
49  */
50 
51 #include <sys/buf2.h>
52 
53 #include "hammer.h"
54 
55 static void hammer_io_modify(hammer_io_t io, int count);
56 static void hammer_io_deallocate(struct buf *bp);
57 static void hammer_indirect_callback(struct bio *bio);
58 static void hammer_io_direct_write_complete(struct bio *nbio);
59 static int hammer_io_direct_uncache_callback(hammer_inode_t ip, void *data);
60 static void hammer_io_set_modlist(struct hammer_io *io);
61 static void hammer_io_flush_mark(hammer_volume_t volume);
62 
63 static int
64 hammer_mod_rb_compare(hammer_io_t io1, hammer_io_t io2)
65 {
66 	hammer_off_t io1_offset;
67 	hammer_off_t io2_offset;
68 
69 	io1_offset = ((io1->offset & HAMMER_OFF_SHORT_MASK) << 8) |
70 		     io1->volume->vol_no;
71 	io2_offset = ((io2->offset & HAMMER_OFF_SHORT_MASK) << 8) |
72 		     io2->volume->vol_no;
73 
74 	if (io1_offset < io2_offset)
75 		return(-1);
76 	if (io1_offset > io2_offset)
77 		return(1);
78 	return(0);
79 }
80 
81 RB_GENERATE(hammer_mod_rb_tree, hammer_io, rb_node, hammer_mod_rb_compare);
82 
83 /*
84  * Initialize a new, already-zero'd hammer_io structure, or reinitialize
85  * an existing hammer_io structure which may have switched to another type.
86  */
87 void
88 hammer_io_init(hammer_io_t io, hammer_volume_t volume, enum hammer_io_type type)
89 {
90 	io->volume = volume;
91 	io->hmp = volume->io.hmp;
92 	io->type = type;
93 }
94 
95 /*
96  * Helper routine to disassociate a buffer cache buffer from an I/O
97  * structure.  The io must be interlocked and marked appropriately for
98  * reclamation.
99  *
100  * The io must be in a released state with the io->bp owned and
101  * locked by the caller of this function.  When not called from an
102  * io_deallocate() this cannot race an io_deallocate() since the
103  * kernel would be unable to get the buffer lock in that case.
104  * (The released state in this case means we own the bp, not the
105  * hammer_io structure).
106  *
107  * The io may have 0 or 1 references depending on who called us.  The
108  * caller is responsible for dealing with the refs.
109  *
110  * This call can only be made when no action is required on the buffer.
111  *
112  * This function is guaranteed not to race against anything because we
113  * own both the io lock and the bp lock and are interlocked with no
114  * references.
115  */
116 static void
117 hammer_io_disassociate(hammer_io_structure_t iou)
118 {
119 	struct buf *bp = iou->io.bp;
120 
121 	KKASSERT(iou->io.released);
122 	KKASSERT(iou->io.modified == 0);
123 	KKASSERT(LIST_FIRST(&bp->b_dep) == (void *)iou);
124 	buf_dep_init(bp);
125 	iou->io.bp = NULL;
126 
127 	/*
128 	 * If the buffer was locked someone wanted to get rid of it.
129 	 */
130 	if (bp->b_flags & B_LOCKED) {
131 		atomic_add_int(&hammer_count_io_locked, -1);
132 		bp->b_flags &= ~B_LOCKED;
133 	}
134 	if (iou->io.reclaim) {
135 		bp->b_flags |= B_NOCACHE|B_RELBUF;
136 		iou->io.reclaim = 0;
137 	}
138 
139 	switch(iou->io.type) {
140 	case HAMMER_STRUCTURE_VOLUME:
141 		iou->volume.ondisk = NULL;
142 		break;
143 	case HAMMER_STRUCTURE_DATA_BUFFER:
144 	case HAMMER_STRUCTURE_META_BUFFER:
145 	case HAMMER_STRUCTURE_UNDO_BUFFER:
146 		iou->buffer.ondisk = NULL;
147 		break;
148 	case HAMMER_STRUCTURE_DUMMY:
149 		hpanic("bad io type");
150 		break;
151 	}
152 }
153 
154 /*
155  * Wait for any physical IO to complete
156  *
157  * XXX we aren't interlocked against a spinlock or anything so there
158  *     is a small window in the interlock / io->running == 0 test.
159  */
160 void
161 hammer_io_wait(hammer_io_t io)
162 {
163 	if (io->running) {
164 		hammer_mount_t hmp = io->hmp;
165 
166 		lwkt_gettoken(&hmp->io_token);
167 		while (io->running) {
168 			io->waiting = 1;
169 			tsleep_interlock(io, 0);
170 			if (io->running)
171 				tsleep(io, PINTERLOCKED, "hmrflw", hz);
172 		}
173 		lwkt_reltoken(&hmp->io_token);
174 	}
175 }
176 
177 /*
178  * Wait for all currently queued HAMMER-initiated I/Os to complete.
179  *
180  * This is not supposed to count direct I/Os but some can leak
181  * through (for non-full-sized direct I/Os).
182  */
183 void
184 hammer_io_wait_all(hammer_mount_t hmp, const char *ident, int doflush)
185 {
186 	struct hammer_io iodummy;
187 	hammer_io_t io;
188 
189 	/*
190 	 * Degenerate case, no I/O is running
191 	 */
192 	lwkt_gettoken(&hmp->io_token);
193 	if (TAILQ_EMPTY(&hmp->iorun_list)) {
194 		lwkt_reltoken(&hmp->io_token);
195 		if (doflush)
196 			hammer_io_flush_sync(hmp);
197 		return;
198 	}
199 	bzero(&iodummy, sizeof(iodummy));
200 	iodummy.type = HAMMER_STRUCTURE_DUMMY;
201 
202 	/*
203 	 * Add placemarker and then wait until it becomes the head of
204 	 * the list.
205 	 */
206 	TAILQ_INSERT_TAIL(&hmp->iorun_list, &iodummy, iorun_entry);
207 	while (TAILQ_FIRST(&hmp->iorun_list) != &iodummy) {
208 		tsleep(&iodummy, 0, ident, 0);
209 	}
210 
211 	/*
212 	 * Chain in case several placemarkers are present.
213 	 */
214 	TAILQ_REMOVE(&hmp->iorun_list, &iodummy, iorun_entry);
215 	io = TAILQ_FIRST(&hmp->iorun_list);
216 	if (io && io->type == HAMMER_STRUCTURE_DUMMY)
217 		wakeup(io);
218 	lwkt_reltoken(&hmp->io_token);
219 
220 	if (doflush)
221 		hammer_io_flush_sync(hmp);
222 }
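
/*
 * Illustrative sketch (not part of the original source): a typical
 * caller drains all queued HAMMER-initiated I/O and then requests a
 * device flush by passing a non-zero doflush.  The "hmiowa" wait ident
 * is hypothetical; real callers supply their own identifier.
 */
#if 0
static void
example_wait_for_io(hammer_mount_t hmp)
{
	hammer_io_wait_all(hmp, "hmiowa", 1);
}
#endif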
223 
224 /*
225  * Clear a flagged error condition on an I/O buffer.  The caller must hold
226  * its own ref on the buffer.
227  */
228 void
229 hammer_io_clear_error(struct hammer_io *io)
230 {
231 	hammer_mount_t hmp = io->hmp;
232 
233 	lwkt_gettoken(&hmp->io_token);
234 	if (io->ioerror) {
235 		io->ioerror = 0;
236 		hammer_rel(&io->lock);
237 		KKASSERT(hammer_isactive(&io->lock));
238 	}
239 	lwkt_reltoken(&hmp->io_token);
240 }
241 
242 void
243 hammer_io_clear_error_noassert(struct hammer_io *io)
244 {
245 	hammer_mount_t hmp = io->hmp;
246 
247 	lwkt_gettoken(&hmp->io_token);
248 	if (io->ioerror) {
249 		io->ioerror = 0;
250 		hammer_rel(&io->lock);
251 	}
252 	lwkt_reltoken(&hmp->io_token);
253 }
254 
255 /*
256  * This is an advisory function only which tells the buffer cache
257  * the bp is not a meta-data buffer, even though it is backed by
258  * a block device.
259  *
260  * This is used by HAMMER's reblocking code to avoid trying to
261  * swapcache the filesystem's data when it is read or written
262  * by the reblocking code.
263  *
264  * The caller has a ref on the buffer preventing the bp from
265  * being disassociated from it.
266  */
267 void
268 hammer_io_notmeta(hammer_buffer_t buffer)
269 {
270 	if ((buffer->io.bp->b_flags & B_NOTMETA) == 0) {
271 		hammer_mount_t hmp = buffer->io.hmp;
272 
273 		lwkt_gettoken(&hmp->io_token);
274 		buffer->io.bp->b_flags |= B_NOTMETA;
275 		lwkt_reltoken(&hmp->io_token);
276 	}
277 }
278 
279 /*
280  * Load bp for a HAMMER structure.  The io must be exclusively locked by
281  * the caller.
282  *
283  * This routine is mostly used on meta-data and small-data blocks.  Generally
284  * speaking HAMMER assumes some locality of reference and will cluster.
285  *
286  * Note that the caller (hammer_ondisk.c) may place further restrictions
287  * on clusterability via the limit (in bytes).  Typically large-data
288  * zones cannot be clustered due to their mixed buffer sizes.  This is
289  * not an issue since such clustering occurs in hammer_vnops at the
290  * regular file layer, whereas this is the buffered block device layer.
291  *
292  * No I/O callbacks can occur while we hold the buffer locked.
293  */
294 int
295 hammer_io_read(struct vnode *devvp, struct hammer_io *io, int limit)
296 {
297 	struct buf *bp;
298 	int   error;
299 
300 	if ((bp = io->bp) == NULL) {
301 		atomic_add_long(&hammer_count_io_running_read, io->bytes);
302 		if (hammer_cluster_enable && limit > io->bytes) {
303 			error = cluster_read(devvp, io->offset + limit,
304 					     io->offset, io->bytes,
305 					     HAMMER_CLUSTER_SIZE,
306 					     HAMMER_CLUSTER_SIZE,
307 					     &io->bp);
308 		} else {
309 			error = bread(devvp, io->offset, io->bytes, &io->bp);
310 		}
311 		hammer_stats_disk_read += io->bytes;
312 		atomic_add_long(&hammer_count_io_running_read, -io->bytes);
313 
314 		/*
315 		 * The code generally assumes b_ops/b_dep has been set-up,
316 		 * even if we error out here.
317 		 */
318 		bp = io->bp;
319 		if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) {
320 			const char *metatype;
321 
322 			switch(io->type) {
323 			case HAMMER_STRUCTURE_VOLUME:
324 				metatype = "volume";
325 				break;
326 			case HAMMER_STRUCTURE_META_BUFFER:
327 				switch(HAMMER_ITOB(io)->zoneX_offset
328 					& HAMMER_OFF_ZONE_MASK) {
329 				case HAMMER_ZONE_BTREE:
330 					metatype = "btree";
331 					break;
332 				case HAMMER_ZONE_META:
333 					metatype = "meta";
334 					break;
335 				case HAMMER_ZONE_FREEMAP:
336 					metatype = "freemap";
337 					break;
338 				default:
339 					metatype = "meta?";
340 					break;
341 				}
342 				break;
343 			case HAMMER_STRUCTURE_DATA_BUFFER:
344 				metatype = "data";
345 				break;
346 			case HAMMER_STRUCTURE_UNDO_BUFFER:
347 				metatype = "undo";
348 				break;
349 			default:
350 				metatype = "unknown";
351 				break;
352 			}
353 			hdkprintf("doff %016jx %s\n",
354 				(intmax_t)bp->b_bio2.bio_offset,
355 				metatype);
356 		}
357 		bp->b_flags &= ~B_IODEBUG;
358 		bp->b_ops = &hammer_bioops;
359 		KKASSERT(LIST_FIRST(&bp->b_dep) == NULL);
360 
361 		/* io->worklist is locked by the io lock */
362 		LIST_INSERT_HEAD(&bp->b_dep, &io->worklist, node);
363 		BUF_KERNPROC(bp);
364 		KKASSERT(io->modified == 0);
365 		KKASSERT(io->running == 0);
366 		KKASSERT(io->waiting == 0);
367 		io->released = 0;	/* we hold an active lock on bp */
368 	} else {
369 		error = 0;
370 	}
371 	return(error);
372 }
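
/*
 * Illustrative sketch (not part of the original source): a caller such
 * as hammer_ondisk.c holds the io exclusively locked, issues the read,
 * and on success maps the buffer contents.  Using b_data directly for
 * the ondisk pointer here is an assumption made for illustration.
 */
#if 0
static int
example_load_buffer(hammer_volume_t volume, hammer_buffer_t buffer, int limit)
{
	int error;

	error = hammer_io_read(volume->devvp, &buffer->io, limit);
	if (error == 0)
		buffer->ondisk = (void *)buffer->io.bp->b_data;
	return(error);
}
#endif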
373 
374 /*
375  * Similar to hammer_io_read() but returns a zero'd out buffer instead.
376  * Must be called with the IO exclusively locked.
377  *
378  * vfs_bio_clrbuf() is kinda nasty, enforce serialization against background
379  * I/O by forcing the buffer to not be in a released state before calling
380  * it.
381  *
382  * This function will also mark the IO as modified but it will not
383  * increment the modify_refs count.
384  *
385  * No I/O callbacks can occur while we hold the buffer locked.
386  */
387 int
388 hammer_io_new(struct vnode *devvp, struct hammer_io *io)
389 {
390 	struct buf *bp;
391 
392 	if ((bp = io->bp) == NULL) {
393 		io->bp = getblk(devvp, io->offset, io->bytes, 0, 0);
394 		bp = io->bp;
395 		bp->b_ops = &hammer_bioops;
396 		KKASSERT(LIST_FIRST(&bp->b_dep) == NULL);
397 
398 		/* io->worklist is locked by the io lock */
399 		LIST_INSERT_HEAD(&bp->b_dep, &io->worklist, node);
400 		io->released = 0;
401 		KKASSERT(io->running == 0);
402 		io->waiting = 0;
403 		BUF_KERNPROC(bp);
404 	} else {
405 		if (io->released) {
406 			regetblk(bp);
407 			BUF_KERNPROC(bp);
408 			io->released = 0;
409 		}
410 	}
411 	hammer_io_modify(io, 0);
412 	vfs_bio_clrbuf(bp);
413 	return(0);
414 }
415 
416 /*
417  * Advance the activity count on the underlying buffer because
418  * HAMMER does not getblk/brelse on every access.
419  *
420  * The io->bp cannot go away while the buffer is referenced.
421  */
422 void
423 hammer_io_advance(struct hammer_io *io)
424 {
425 	if (io->bp)
426 		buf_act_advance(io->bp);
427 }
428 
429 /*
430  * Remove potential device level aliases against buffers managed by high level
431  * vnodes.  Aliases can also be created due to mixed buffer sizes or via
432  * direct access to the backing store device.
433  *
434  * This is nasty because the buffers are also VMIO-backed.  Even if a buffer
435  * does not exist its backing VM pages might, and we have to invalidate
436  * those as well or a getblk() will reinstate them.
437  *
438  * Buffer cache buffers associated with hammer_buffers cannot be
439  * invalidated.
440  */
441 int
442 hammer_io_inval(hammer_volume_t volume, hammer_off_t zone2_offset)
443 {
444 	hammer_io_structure_t iou;
445 	hammer_mount_t hmp;
446 	hammer_off_t phys_offset;
447 	struct buf *bp;
448 	int error;
449 
450 	hmp = volume->io.hmp;
451 	lwkt_gettoken(&hmp->io_token);
452 
453 	/*
454 	 * If a device buffer already exists for the specified physical
455 	 * offset use that, otherwise instantiate a buffer to cover any
456 	 * related VM pages, set BNOCACHE, and brelse().
457 	 */
458 	phys_offset = volume->ondisk->vol_buf_beg +
459 		      (zone2_offset & HAMMER_OFF_SHORT_MASK);
460 	if ((bp = findblk(volume->devvp, phys_offset, 0)) != NULL)
461 		bremfree(bp);
462 	else
463 		bp = getblk(volume->devvp, phys_offset, HAMMER_BUFSIZE, 0, 0);
464 
465 	if ((iou = (void *)LIST_FIRST(&bp->b_dep)) != NULL) {
466 #if 0
467 		hammer_ref(&iou->io.lock);
468 		hammer_io_clear_modify(&iou->io, 1);
469 		bundirty(bp);
470 		iou->io.released = 0;
471 		BUF_KERNPROC(bp);
472 		iou->io.reclaim = 1;
473 		iou->io.waitdep = 1;	/* XXX this is a fs_token field */
474 		KKASSERT(hammer_isactive(&iou->io.lock) == 1);
475 		hammer_rel_buffer(&iou->buffer, 0);
476 		/*hammer_io_deallocate(bp);*/
477 #endif
478 		bqrelse(bp);
479 		error = EAGAIN;
480 	} else {
481 		KKASSERT((bp->b_flags & B_LOCKED) == 0);
482 		bundirty(bp);
483 		bp->b_flags |= B_NOCACHE|B_RELBUF;
484 		brelse(bp);
485 		error = 0;
486 	}
487 	lwkt_reltoken(&hmp->io_token);
488 	return(error);
489 }
490 
491 /*
492  * This routine is called on the last reference to a hammer structure.
493  * The io must be interlocked with a refcount of zero.  The hammer structure
494  * will remain interlocked on return.
495  *
496  * This routine may return a non-NULL bp to the caller for disposal.
497  * The caller typically brelse()'s the bp.
498  *
499  * The bp may or may not still be passively associated with the IO.  It
500  * will remain passively associated if it is unreleasable (e.g. a modified
501  * meta-data buffer).
502  *
503  * The only requirement here is that modified meta-data and volume-header
504  * buffers may NOT be disassociated from the IO structure, and consequently
505  * we also leave such buffers actively associated with the IO if they already
506  * are (since the kernel can't do anything with them anyway).  Only the
507  * flusher is allowed to write such buffers out.  Modified pure-data and
508  * undo buffers are returned to the kernel but left passively associated
509  * so we can track when the kernel writes the bp out.
510  */
511 struct buf *
512 hammer_io_release(struct hammer_io *io, int flush)
513 {
514 	union hammer_io_structure *iou = (void *)io;
515 	struct buf *bp;
516 
517 	if ((bp = io->bp) == NULL)
518 		return(NULL);
519 
520 	/*
521 	 * Try to flush a dirty IO to disk if asked to by the
522 	 * caller or if the kernel tried to flush the buffer in the past.
523 	 *
524 	 * Kernel-initiated flushes are only allowed for pure-data buffers.
525 	 * meta-data and volume buffers can only be flushed explicitly
526 	 * Meta-data and volume buffers can only be flushed explicitly
527 	 */
528 	if (io->modified) {
529 		if (flush) {
530 			hammer_io_flush(io, 0);
531 		} else if (bp->b_flags & B_LOCKED) {
532 			switch(io->type) {
533 			case HAMMER_STRUCTURE_DATA_BUFFER:
534 				hammer_io_flush(io, 0);
535 				break;
536 			case HAMMER_STRUCTURE_UNDO_BUFFER:
537 				hammer_io_flush(io, hammer_undo_reclaim(io));
538 				break;
539 			default:
540 				break;
541 			}
542 		} /* else no explicit request to flush the buffer */
543 	}
544 
545 	/*
546 	 * Wait for the IO to complete if asked to.  This occurs when
547 	 * the buffer must be disposed of definitively during an umount
548 	 * or buffer invalidation.
549 	 */
550 	if (io->waitdep && io->running) {
551 		hammer_io_wait(io);
552 	}
553 
554 	/*
555 	 * Return control of the buffer to the kernel (with the proviso
556 	 * that our bioops can override kernel decisions with regard to
557 	 * the buffer).
558 	 */
559 	if ((flush || io->reclaim) && io->modified == 0 && io->running == 0) {
560 		/*
561 		 * Always disassociate the bp if an explicit flush
562 		 * was requested and the IO completed with no error
563 		 * (so unmount can really clean up the structure).
564 		 */
565 		if (io->released) {
566 			regetblk(bp);
567 			BUF_KERNPROC(bp);
568 		} else {
569 			io->released = 1;
570 		}
571 		hammer_io_disassociate((hammer_io_structure_t)io);
572 		/* return the bp */
573 	} else if (io->modified) {
574 		/*
575 		 * Only certain IO types can be released to the kernel if
576 		 * the buffer has been modified.
577 		 *
578 		 * volume and meta-data IO types may only be explicitly
579 		 * Volume and meta-data IO types may only be explicitly
580 		 */
581 		switch(io->type) {
582 		case HAMMER_STRUCTURE_DATA_BUFFER:
583 		case HAMMER_STRUCTURE_UNDO_BUFFER:
584 			if (io->released == 0) {
585 				io->released = 1;
586 				bp->b_flags |= B_CLUSTEROK;
587 				bdwrite(bp);
588 			}
589 			break;
590 		default:
591 			break;
592 		}
593 		bp = NULL;	/* bp left associated */
594 	} else if (io->released == 0) {
595 		/*
596 		 * Clean buffers can be generally released to the kernel.
597 		 * We leave the bp passively associated with the HAMMER
598 		 * structure and use bioops to disconnect it later on
599 		 * if the kernel wants to discard the buffer.
600 		 *
601 		 * We can steal the structure's ownership of the bp.
602 		 */
603 		io->released = 1;
604 		if (bp->b_flags & B_LOCKED) {
605 			hammer_io_disassociate(iou);
606 			/* return the bp */
607 		} else {
608 			if (io->reclaim) {
609 				hammer_io_disassociate(iou);
610 				/* return the bp */
611 			} else {
612 				/* return the bp (bp passively associated) */
613 			}
614 		}
615 	} else {
616 		/*
617 		 * A released buffer is passively associated with our
618 		 * hammer_io structure.  The kernel cannot destroy it
619 		 * without making a bioops call.  If the kernel (B_LOCKED)
620 		 * or we (reclaim) requested that the buffer be destroyed
621 		 * we destroy it, otherwise we do a quick get/release to
622 		 * reset its position in the kernel's LRU list.
623 		 *
624 		 * Leaving the buffer passively associated allows us to
625 		 * use the kernel's LRU buffer flushing mechanisms rather
626 		 * then rolling our own.
627 		 * than rolling our own.
628 		 * XXX there are two ways of doing this.  We can re-acquire
629 		 * and passively release to reset the LRU, or not.
630 		 */
631 		if (io->running == 0) {
632 			regetblk(bp);
633 			if ((bp->b_flags & B_LOCKED) || io->reclaim) {
634 				hammer_io_disassociate(iou);
635 				/* return the bp */
636 			} else {
637 				/* return the bp (bp passively associated) */
638 			}
639 		} else {
640 			/*
641 			 * bp is left passively associated but we do not
642 			 * try to reacquire it.  Interactions with the io
643 			 * structure will occur on completion of the bp's
644 			 * I/O.
645 			 */
646 			bp = NULL;
647 		}
648 	}
649 	return(bp);
650 }
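
/*
 * Illustrative sketch (not part of the original source): per the
 * comment above, a caller disposes of any bp handed back to it,
 * typically by brelse()ing it.
 */
#if 0
static void
example_release(hammer_io_t io, int flush)
{
	struct buf *bp;

	if ((bp = hammer_io_release(io, flush)) != NULL)
		brelse(bp);
}
#endif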
651 
652 /*
653  * This routine is called with a locked IO when a flush is desired and
654  * no other references to the structure exist other than ours.  This
655  * routine is ONLY called when HAMMER believes it is safe to flush a
656  * potentially modified buffer out.
657  *
658  * The locked io or io reference prevents a flush from being initiated
659  * by the kernel.
660  */
661 void
662 hammer_io_flush(struct hammer_io *io, int reclaim)
663 {
664 	struct buf *bp;
665 	hammer_mount_t hmp;
666 
667 	/*
668 	 * Degenerate case - nothing to flush if nothing is dirty.
669 	 */
670 	if (io->modified == 0)
671 		return;
672 
673 	KKASSERT(io->bp);
674 	KKASSERT(io->modify_refs <= 0);
675 
676 	/*
677 	 * Acquire ownership of the bp, particularly before we clear our
678 	 * modified flag.
679 	 *
680 	 * We are going to write this bp out (via cluster_awrite()).  Don't
681 	 * leave a window where io->released is set; we actually own the bp
682 	 * rather than our buffer.
683 	 *
684 	 * The io_token should not be required here as only
685 	 */
686 	hmp = io->hmp;
687 	bp = io->bp;
688 	if (io->released) {
689 		regetblk(bp);
690 		/* BUF_KERNPROC(io->bp); */
691 		/* io->released = 0; */
692 		KKASSERT(io->released);
693 		KKASSERT(io->bp == bp);
694 	} else {
695 		io->released = 1;
696 	}
697 
698 	if (reclaim) {
699 		io->reclaim = 1;
700 		if ((bp->b_flags & B_LOCKED) == 0) {
701 			bp->b_flags |= B_LOCKED;
702 			atomic_add_int(&hammer_count_io_locked, 1);
703 		}
704 	}
705 
706 	/*
707 	 * Acquire exclusive access to the bp and then clear the modified
708 	 * state of the buffer prior to issuing I/O to interlock any
709 	 * modifications made while the I/O is in progress.  This shouldn't
710 	 * happen anyway but losing data would be worse.  The modified bit
711 	 * will be rechecked after the IO completes.
712 	 *
713 	 * NOTE: This call also finalizes the buffer's content (inval == 0).
714 	 *
715 	 * This is only legal when lock.refs == 1 (otherwise we might clear
716 	 * the modified bit while there are still users of the cluster
717 	 * modifying the data).
718 	 *
719 	 * Do this before potentially blocking so any attempt to modify the
720 	 * ondisk while we are blocked blocks waiting for us.
721 	 */
722 	hammer_ref(&io->lock);
723 	hammer_io_clear_modify(io, 0);
724 	hammer_rel(&io->lock);
725 
726 	if (hammer_debug_io & 0x0002)
727 		hdkprintf("%016jx\n", bp->b_bio1.bio_offset);
728 
729 	/*
730 	 * Transfer ownership to the kernel and initiate I/O.
731 	 *
732 	 * NOTE: We do not hold io_token so an atomic op is required to
733 	 *	 update io_running_space.
734 	 */
735 	io->running = 1;
736 	atomic_add_long(&hmp->io_running_space, io->bytes);
737 	atomic_add_long(&hammer_count_io_running_write, io->bytes);
738 	lwkt_gettoken(&hmp->io_token);
739 	TAILQ_INSERT_TAIL(&hmp->iorun_list, io, iorun_entry);
740 	lwkt_reltoken(&hmp->io_token);
741 	cluster_awrite(bp);
742 	hammer_io_flush_mark(io->volume);
743 }
744 
745 /************************************************************************
746  *				BUFFER DIRTYING				*
747  ************************************************************************
748  *
749  * These routines deal with dependencies created when IO buffers get
750  * modified.  The caller must call hammer_modify_*() on a referenced
751  * HAMMER structure prior to modifying its on-disk data.
752  *
753  * Any intent to modify an IO buffer acquires the related bp and imposes
754  * various write ordering dependencies.
755  */
756 
757 /*
758  * Mark a HAMMER structure as undergoing modification.  Meta-data buffers
759  * are locked until the flusher can deal with them, pure data buffers
760  * can be written out.
761  *
762  * The referenced io prevents races.
763  */
764 static
765 void
766 hammer_io_modify(hammer_io_t io, int count)
767 {
768 	/*
769 	 * io->modify_refs must be >= 0
770 	 */
771 	while (io->modify_refs < 0) {
772 		io->waitmod = 1;
773 		tsleep(io, 0, "hmrmod", 0);
774 	}
775 
776 	/*
777 	 * Shortcut if nothing to do.
778 	 */
779 	KKASSERT(hammer_isactive(&io->lock) && io->bp != NULL);
780 	io->modify_refs += count;
781 	if (io->modified && io->released == 0)
782 		return;
783 
784 	/*
785 	 * NOTE: It is important not to set the modified bit
786 	 *	 until after we have acquired the bp or we risk
787 	 *	 racing against checkwrite.
788 	 */
789 	hammer_lock_ex(&io->lock);
790 	if (io->released) {
791 		regetblk(io->bp);
792 		BUF_KERNPROC(io->bp);
793 		io->released = 0;
794 	}
795 	if (io->modified == 0) {
796 		hammer_io_set_modlist(io);
797 		io->modified = 1;
798 	}
799 	hammer_unlock(&io->lock);
800 }
801 
802 static __inline
803 void
804 hammer_io_modify_done(hammer_io_t io)
805 {
806 	KKASSERT(io->modify_refs > 0);
807 	--io->modify_refs;
808 	if (io->modify_refs == 0 && io->waitmod) {
809 		io->waitmod = 0;
810 		wakeup(io);
811 	}
812 }
813 
814 /*
815  * The write interlock blocks other threads trying to modify a buffer
816  * (they block in hammer_io_modify()) after us, or blocks us while other
817  * threads are in the middle of modifying a buffer.
818  *
819  * The caller also has a ref on the io, however if we are not careful
820  * we will race bioops callbacks (checkwrite).  To deal with this
821  * we must at least acquire and release the io_token, and it is probably
822  * better to hold it through the setting of modify_refs.
823  */
824 void
825 hammer_io_write_interlock(hammer_io_t io)
826 {
827 	hammer_mount_t hmp = io->hmp;
828 
829 	lwkt_gettoken(&hmp->io_token);
830 	while (io->modify_refs != 0) {
831 		io->waitmod = 1;
832 		tsleep(io, 0, "hmrmod", 0);
833 	}
834 	io->modify_refs = -1;
835 	lwkt_reltoken(&hmp->io_token);
836 }
837 
838 void
839 hammer_io_done_interlock(hammer_io_t io)
840 {
841 	KKASSERT(io->modify_refs == -1);
842 	io->modify_refs = 0;
843 	if (io->waitmod) {
844 		io->waitmod = 0;
845 		wakeup(io);
846 	}
847 }
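
/*
 * Illustrative sketch (not part of the original source): the interlock
 * pair above is used as a bracket to exclude new modifiers while a
 * buffer's contents are being written out or finalized.
 */
#if 0
static void
example_write_interlocked(hammer_io_t io)
{
	hammer_io_write_interlock(io);
	/* ... operate on the buffer with modifiers excluded ... */
	hammer_io_done_interlock(io);
}
#endif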
848 
849 /*
850  * Caller intends to modify a volume's ondisk structure.
851  *
852  * This is only allowed if we are the flusher or we have a ref on the
853  * sync_lock.
854  */
855 void
856 hammer_modify_volume(hammer_transaction_t trans, hammer_volume_t volume,
857 		     void *base, int len)
858 {
859 	KKASSERT (trans == NULL || trans->sync_lock_refs > 0);
860 
861 	hammer_io_modify(&volume->io, 1);
862 	if (len) {
863 		intptr_t rel_offset = (intptr_t)base - (intptr_t)volume->ondisk;
864 		KKASSERT((rel_offset & ~(intptr_t)HAMMER_BUFMASK) == 0);
865 		hammer_generate_undo(trans,
866 			 HAMMER_ENCODE_RAW_VOLUME(volume->vol_no, rel_offset),
867 			 base, len);
868 	}
869 }
870 
871 /*
872  * Caller intends to modify a buffer's ondisk structure.
873  *
874  * This is only allowed if we are the flusher or we have a ref on the
875  * sync_lock.
876  */
877 void
878 hammer_modify_buffer(hammer_transaction_t trans, hammer_buffer_t buffer,
879 		     void *base, int len)
880 {
881 	KKASSERT (trans == NULL || trans->sync_lock_refs > 0);
882 
883 	hammer_io_modify(&buffer->io, 1);
884 	if (len) {
885 		intptr_t rel_offset = (intptr_t)base - (intptr_t)buffer->ondisk;
886 		KKASSERT((rel_offset & ~(intptr_t)HAMMER_BUFMASK) == 0);
887 		hammer_generate_undo(trans,
888 				     buffer->zone2_offset + rel_offset,
889 				     base, len);
890 	}
891 }
892 
893 void
894 hammer_modify_volume_done(hammer_volume_t volume)
895 {
896 	hammer_io_modify_done(&volume->io);
897 }
898 
899 void
900 hammer_modify_buffer_done(hammer_buffer_t buffer)
901 {
902 	hammer_io_modify_done(&buffer->io);
903 }
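
/*
 * Illustrative sketch (not part of the original source): per the
 * BUFFER DIRTYING comment above, on-disk data may only be changed
 * inside a hammer_modify_*() / hammer_modify_*_done() bracket so that
 * UNDO is generated before the modification is made.
 */
#if 0
static void
example_modify(hammer_transaction_t trans, hammer_buffer_t buffer,
	       void *field, int bytes)
{
	hammer_modify_buffer(trans, buffer, field, bytes);
	/* ... change the bytes at 'field' within buffer->ondisk ... */
	hammer_modify_buffer_done(buffer);
}
#endif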
904 
905 /*
906  * Mark an entity as not being dirty any more and finalize any
907  * delayed adjustments to the buffer.
908  *
909  * Delayed adjustments are an important performance enhancement, allowing
910  * us to avoid recalculating B-Tree node CRCs over and over again when
911  * making bulk-modifications to the B-Tree.
912  *
913  * If inval is non-zero delayed adjustments are ignored.
914  *
915  * This routine may dereference related btree nodes and cause the
916  * buffer to be dereferenced.  The caller must own a reference on io.
917  */
918 void
919 hammer_io_clear_modify(struct hammer_io *io, int inval)
920 {
921 	hammer_mount_t hmp;
922 
923 	/*
924 	 * io_token is needed to avoid races on mod_root
925 	 */
926 	if (io->modified == 0)
927 		return;
928 	hmp = io->hmp;
929 	lwkt_gettoken(&hmp->io_token);
930 	if (io->modified == 0) {
931 		lwkt_reltoken(&hmp->io_token);
932 		return;
933 	}
934 
935 	/*
936 	 * Take us off the mod-list and clear the modified bit.
937 	 */
938 	KKASSERT(io->mod_root != NULL);
939 	if (io->mod_root == &io->hmp->volu_root ||
940 	    io->mod_root == &io->hmp->meta_root) {
941 		io->hmp->locked_dirty_space -= io->bytes;
942 		atomic_add_long(&hammer_count_dirtybufspace, -io->bytes);
943 	}
944 	RB_REMOVE(hammer_mod_rb_tree, io->mod_root, io);
945 	io->mod_root = NULL;
946 	io->modified = 0;
947 
948 	lwkt_reltoken(&hmp->io_token);
949 
950 	/*
951 	 * If this bit is not set there are no delayed adjustments.
952 	 */
953 	if (io->gencrc == 0)
954 		return;
955 	io->gencrc = 0;
956 
957 	/*
958 	 * Finalize requested CRCs.  The NEEDSCRC flag also holds a reference
959 	 * on the node (& underlying buffer).  Release the node after clearing
960 	 * the flag.
961 	 */
962 	if (io->type == HAMMER_STRUCTURE_META_BUFFER) {
963 		hammer_buffer_t buffer = HAMMER_ITOB(io);
964 		hammer_node_t node;
965 
966 restart:
967 		TAILQ_FOREACH(node, &buffer->clist, entry) {
968 			if ((node->flags & HAMMER_NODE_NEEDSCRC) == 0)
969 				continue;
970 			node->flags &= ~HAMMER_NODE_NEEDSCRC;
971 			KKASSERT(node->ondisk);
972 			if (inval == 0)
973 				node->ondisk->crc = crc32(&node->ondisk->crc + 1, HAMMER_BTREE_CRCSIZE);
974 			hammer_rel_node(node);
975 			goto restart;
976 		}
977 	}
978 	/* caller must still have ref on io */
979 	KKASSERT(hammer_isactive(&io->lock));
980 }
981 
982 /*
983  * Clear the IO's modify list.  Even though the IO is no longer modified
984  * it may still be on the lose_root.  This routine is called just before
985  * the governing hammer_buffer is destroyed.
986  *
987  * mod_root requires io_token protection.
988  */
989 void
990 hammer_io_clear_modlist(struct hammer_io *io)
991 {
992 	hammer_mount_t hmp = io->hmp;
993 
994 	KKASSERT(io->modified == 0);
995 	if (io->mod_root) {
996 		lwkt_gettoken(&hmp->io_token);
997 		if (io->mod_root) {
998 			KKASSERT(io->mod_root == &io->hmp->lose_root);
999 			RB_REMOVE(hammer_mod_rb_tree, io->mod_root, io);
1000 			io->mod_root = NULL;
1001 		}
1002 		lwkt_reltoken(&hmp->io_token);
1003 	}
1004 }
1005 
1006 static void
1007 hammer_io_set_modlist(struct hammer_io *io)
1008 {
1009 	struct hammer_mount *hmp = io->hmp;
1010 
1011 	lwkt_gettoken(&hmp->io_token);
1012 	KKASSERT(io->mod_root == NULL);
1013 
1014 	switch(io->type) {
1015 	case HAMMER_STRUCTURE_VOLUME:
1016 		io->mod_root = &hmp->volu_root;
1017 		hmp->locked_dirty_space += io->bytes;
1018 		atomic_add_long(&hammer_count_dirtybufspace, io->bytes);
1019 		break;
1020 	case HAMMER_STRUCTURE_META_BUFFER:
1021 		io->mod_root = &hmp->meta_root;
1022 		hmp->locked_dirty_space += io->bytes;
1023 		atomic_add_long(&hammer_count_dirtybufspace, io->bytes);
1024 		break;
1025 	case HAMMER_STRUCTURE_UNDO_BUFFER:
1026 		io->mod_root = &hmp->undo_root;
1027 		break;
1028 	case HAMMER_STRUCTURE_DATA_BUFFER:
1029 		io->mod_root = &hmp->data_root;
1030 		break;
1031 	case HAMMER_STRUCTURE_DUMMY:
1032 		hpanic("bad io type");
1033 		break; /* NOT REACHED */
1034 	}
1035 	if (RB_INSERT(hammer_mod_rb_tree, io->mod_root, io)) {
1036 		hpanic("duplicate entry");
1037 		/* NOT REACHED */
1038 	}
1039 	lwkt_reltoken(&hmp->io_token);
1040 }
1041 
1042 /************************************************************************
1043  *				HAMMER_BIOOPS				*
1044  ************************************************************************
1045  *
1046  */
1047 
1048 /*
1049  * Pre-IO initiation kernel callback - cluster build only
1050  *
1051  * bioops callback - hold io_token
1052  */
1053 static void
1054 hammer_io_start(struct buf *bp)
1055 {
1056 	/* nothing to do, so io_token not needed */
1057 }
1058 
1059 /*
1060  * Post-IO completion kernel callback - MAY BE CALLED FROM INTERRUPT!
1061  *
1062  * NOTE: HAMMER may modify a data buffer after we have initiated write
1063  *	 I/O.
1064  *
1065  * NOTE: MPSAFE callback
1066  *
1067  * bioops callback - hold io_token
1068  */
1069 static void
1070 hammer_io_complete(struct buf *bp)
1071 {
1072 	union hammer_io_structure *iou = (void *)LIST_FIRST(&bp->b_dep);
1073 	struct hammer_mount *hmp = iou->io.hmp;
1074 	struct hammer_io *ionext;
1075 
1076 	lwkt_gettoken(&hmp->io_token);
1077 
1078 	KKASSERT(iou->io.released == 1);
1079 
1080 	/*
1081 	 * Deal with people waiting for I/O to drain
1082 	 */
1083 	if (iou->io.running) {
1084 		/*
1085 		 * Deal with critical write errors.  Once a critical error
1086 		 * has been flagged in hmp the UNDO FIFO will not be updated.
1087 		 * That way crash recovery will give us a consistent
1088 		 * filesystem.
1089 		 *
1090 		 * Because of this we can throw away failed UNDO buffers.  If
1091 		 * we throw away META or DATA buffers we risk corrupting
1092 		 * the now read-only version of the filesystem visible to
1093 		 * the user.  Clear B_ERROR so the buffer is not re-dirtied
1094 		 * by the kernel and ref the io so it doesn't get thrown
1095 		 * away.
1096 		 */
1097 		if (bp->b_flags & B_ERROR) {
1098 			lwkt_gettoken(&hmp->fs_token);
1099 			hammer_critical_error(hmp, NULL, bp->b_error,
1100 					      "while flushing meta-data");
1101 			lwkt_reltoken(&hmp->fs_token);
1102 
1103 			switch(iou->io.type) {
1104 			case HAMMER_STRUCTURE_UNDO_BUFFER:
1105 				break;
1106 			default:
1107 				if (iou->io.ioerror == 0) {
1108 					iou->io.ioerror = 1;
1109 					hammer_ref(&iou->io.lock);
1110 				}
1111 				break;
1112 			}
1113 			bp->b_flags &= ~B_ERROR;
1114 			bundirty(bp);
1115 #if 0
1116 			hammer_io_set_modlist(&iou->io);
1117 			iou->io.modified = 1;
1118 #endif
1119 		}
1120 		hammer_stats_disk_write += iou->io.bytes;
1121 		atomic_add_long(&hammer_count_io_running_write, -iou->io.bytes);
1122 		atomic_add_long(&hmp->io_running_space, -iou->io.bytes);
1123 		KKASSERT(hmp->io_running_space >= 0);
1124 		iou->io.running = 0;
1125 
1126 		/*
1127 		 * Remove from iorun list and wakeup any multi-io waiter(s).
1128 		 */
1129 		if (TAILQ_FIRST(&hmp->iorun_list) == &iou->io) {
1130 			ionext = TAILQ_NEXT(&iou->io, iorun_entry);
1131 			if (ionext && ionext->type == HAMMER_STRUCTURE_DUMMY)
1132 				wakeup(ionext);
1133 		}
1134 		TAILQ_REMOVE(&hmp->iorun_list, &iou->io, iorun_entry);
1135 	} else {
1136 		hammer_stats_disk_read += iou->io.bytes;
1137 	}
1138 
1139 	if (iou->io.waiting) {
1140 		iou->io.waiting = 0;
1141 		wakeup(iou);
1142 	}
1143 
1144 	/*
1145 	 * If B_LOCKED is set someone wanted to deallocate the bp at some
1146 	 * point, try to do it now.  The operation will fail if there are
1147 	 * refs or if hammer_io_deallocate() is unable to gain the
1148 	 * interlock.
1149 	 */
1150 	if (bp->b_flags & B_LOCKED) {
1151 		atomic_add_int(&hammer_count_io_locked, -1);
1152 		bp->b_flags &= ~B_LOCKED;
1153 		hammer_io_deallocate(bp);
1154 		/* structure may be dead now */
1155 	}
1156 	lwkt_reltoken(&hmp->io_token);
1157 }
1158 
1159 /*
1160  * Callback from kernel when it wishes to deallocate a passively
1161  * associated structure.  This mostly occurs with clean buffers
1162  * but it may be possible for a holding structure to be marked dirty
1163  * while its buffer is passively associated.  The caller owns the bp.
1164  *
1165  * If we cannot disassociate we set B_LOCKED to prevent the buffer
1166  * from getting reused.
1167  *
1168  * WARNING: Because this can be called directly by getnewbuf we cannot
1169  * recurse into the tree.  If a bp cannot be immediately disassociated
1170  * our only recourse is to set B_LOCKED.
1171  *
1172  * WARNING: This may be called from an interrupt via hammer_io_complete()
1173  *
1174  * bioops callback - hold io_token
1175  */
1176 static void
1177 hammer_io_deallocate(struct buf *bp)
1178 {
1179 	hammer_io_structure_t iou = (void *)LIST_FIRST(&bp->b_dep);
1180 	hammer_mount_t hmp;
1181 
1182 	hmp = iou->io.hmp;
1183 
1184 	lwkt_gettoken(&hmp->io_token);
1185 
1186 	KKASSERT((bp->b_flags & B_LOCKED) == 0 && iou->io.running == 0);
1187 	if (hammer_try_interlock_norefs(&iou->io.lock) == 0) {
1188 		/*
1189 		 * We cannot safely disassociate a bp from a referenced
1190 		 * or interlocked HAMMER structure.
1191 		 */
1192 		bp->b_flags |= B_LOCKED;
1193 		atomic_add_int(&hammer_count_io_locked, 1);
1194 	} else if (iou->io.modified) {
1195 		/*
1196 		 * It is not legal to disassociate a modified buffer.  This
1197 		 * case really shouldn't ever occur.
1198 		 */
1199 		bp->b_flags |= B_LOCKED;
1200 		atomic_add_int(&hammer_count_io_locked, 1);
1201 		hammer_put_interlock(&iou->io.lock, 0);
1202 	} else {
1203 		/*
1204 		 * Disassociate the BP.  If the io has no refs left we
1205 		 * have to add it to the loose list.  The kernel has
1206 		 * locked the buffer and therefore our io must be
1207 		 * in a released state.
1208 		 */
1209 		hammer_io_disassociate(iou);
1210 		if (iou->io.type != HAMMER_STRUCTURE_VOLUME) {
1211 			KKASSERT(iou->io.bp == NULL);
1212 			KKASSERT(iou->io.mod_root == NULL);
1213 			iou->io.mod_root = &hmp->lose_root;
1214 			if (RB_INSERT(hammer_mod_rb_tree, iou->io.mod_root,
1215 				      &iou->io)) {
1216 				hpanic("duplicate entry");
1217 			}
1218 		}
1219 		hammer_put_interlock(&iou->io.lock, 1);
1220 	}
1221 	lwkt_reltoken(&hmp->io_token);
1222 }
1223 
1224 /*
1225  * bioops callback - hold io_token
1226  */
1227 static int
1228 hammer_io_fsync(struct vnode *vp)
1229 {
1230 	/* nothing to do, so io_token not needed */
1231 	return(0);
1232 }
1233 
1234 /*
1235  * NOTE: will not be called unless we tell the kernel about the
1236  * bioops.  Unused... we use the mount's VFS_SYNC instead.
1237  *
1238  * bioops callback - hold io_token
1239  */
1240 static int
1241 hammer_io_sync(struct mount *mp)
1242 {
1243 	/* nothing to do, so io_token not needed */
1244 	return(0);
1245 }
1246 
1247 /*
1248  * bioops callback - hold io_token
1249  */
1250 static void
1251 hammer_io_movedeps(struct buf *bp1, struct buf *bp2)
1252 {
1253 	/* nothing to do, so io_token not needed */
1254 }
1255 
1256 /*
1257  * I/O pre-check for reading and writing.  HAMMER only uses this for
1258  * B_CACHE buffers so checkread just shouldn't happen, but if it does
1259  * allow it.
1260  *
1261  * Writing is a different case.  We don't want the kernel to try to write
1262  * out a buffer that HAMMER may be modifying passively or which has a
1263  * dependency.  In addition, kernel-demanded writes can only proceed for
1264  * certain types of buffers (i.e. UNDO and DATA types).  Other dirty
1265  * buffer types can only be explicitly written by the flusher.
1266  *
1267  * checkwrite will only be called for bdwrite()n buffers.  If we return
1268  * success the kernel is guaranteed to initiate the buffer write.
1269  *
1270  * bioops callback - hold io_token
1271  */
1272 static int
1273 hammer_io_checkread(struct buf *bp)
1274 {
1275 	/* nothing to do, so io_token not needed */
1276 	return(0);
1277 }
1278 
1279 /*
1280  * The kernel is asking us whether it can write out a dirty buffer or not.
1281  *
1282  * bioops callback - hold io_token
1283  */
1284 static int
1285 hammer_io_checkwrite(struct buf *bp)
1286 {
1287 	hammer_io_t io = (void *)LIST_FIRST(&bp->b_dep);
1288 	hammer_mount_t hmp = io->hmp;
1289 
1290 	/*
1291 	 * This shouldn't happen under normal operation.
1292 	 */
1293 	lwkt_gettoken(&hmp->io_token);
1294 	if (io->type == HAMMER_STRUCTURE_VOLUME ||
1295 	    io->type == HAMMER_STRUCTURE_META_BUFFER) {
1296 		if (!panicstr)
1297 			hpanic("illegal buffer");
1298 		if ((bp->b_flags & B_LOCKED) == 0) {
1299 			bp->b_flags |= B_LOCKED;
1300 			atomic_add_int(&hammer_count_io_locked, 1);
1301 		}
1302 		lwkt_reltoken(&hmp->io_token);
1303 		return(1);
1304 	}
1305 
1306 	/*
1307 	 * We have to be able to interlock the IO to safely modify any
1308 	 * of its fields without holding the fs_token.  If we can't lock
1309 	 * it then we are racing someone.
1310 	 *
1311 	 * Our ownership of the bp lock prevents the io from being ripped
1312 	 * out from under us.
1313 	 */
1314 	if (hammer_try_interlock_norefs(&io->lock) == 0) {
1315 		bp->b_flags |= B_LOCKED;
1316 		atomic_add_int(&hammer_count_io_locked, 1);
1317 		lwkt_reltoken(&hmp->io_token);
1318 		return(1);
1319 	}
1320 
1321 	/*
1322 	 * The modified bit must be cleared prior to the initiation of
1323 	 * any IO (returning 0 initiates the IO).  Because this is a
1324 	 * normal data buffer hammer_io_clear_modify() runs through a
1325 	 * simple degenerate case.
1326 	 *
1327 	 * Return 0 will cause the kernel to initiate the IO, and we
1328 	 * must normally clear the modified bit before we begin.  If
1329 	 * the io has modify_refs we do not clear the modified bit,
1330 	 * otherwise we may miss changes.
1331 	 *
1332 	 * Only data and undo buffers can reach here.  These buffers do
1333 	 * not have terminal crc functions but we temporarily reference
1334 	 * the IO anyway, just in case.
1335 	 */
1336 	if (io->modify_refs == 0 && io->modified) {
1337 		hammer_ref(&io->lock);
1338 		hammer_io_clear_modify(io, 0);
1339 		hammer_rel(&io->lock);
1340 	} else if (io->modified) {
1341 		KKASSERT(io->type == HAMMER_STRUCTURE_DATA_BUFFER);
1342 	}
1343 
1344 	/*
1345 	 * The kernel is going to start the IO, set io->running.
1346 	 */
1347 	KKASSERT(io->running == 0);
1348 	io->running = 1;
1349 	atomic_add_long(&io->hmp->io_running_space, io->bytes);
1350 	atomic_add_long(&hammer_count_io_running_write, io->bytes);
1351 	TAILQ_INSERT_TAIL(&io->hmp->iorun_list, io, iorun_entry);
1352 
1353 	hammer_put_interlock(&io->lock, 1);
1354 	lwkt_reltoken(&hmp->io_token);
1355 
1356 	return(0);
1357 }
1358 
1359 /*
1360  * Return non-zero if we wish to delay the kernel's attempt to flush
1361  * this buffer to disk.
1362  *
1363  * bioops callback - hold io_token
1364  */
1365 static int
1366 hammer_io_countdeps(struct buf *bp, int n)
1367 {
1368 	/* nothing to do, so io_token not needed */
1369 	return(0);
1370 }
1371 
1372 struct bio_ops hammer_bioops = {
1373 	.io_start	= hammer_io_start,
1374 	.io_complete	= hammer_io_complete,
1375 	.io_deallocate	= hammer_io_deallocate,
1376 	.io_fsync	= hammer_io_fsync,
1377 	.io_sync	= hammer_io_sync,
1378 	.io_movedeps	= hammer_io_movedeps,
1379 	.io_countdeps	= hammer_io_countdeps,
1380 	.io_checkread	= hammer_io_checkread,
1381 	.io_checkwrite	= hammer_io_checkwrite,
1382 };
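
/*
 * Illustrative sketch (not part of the original source): the callbacks
 * above fire because each managed bp is passively associated with its
 * hammer_io, following the pattern used in hammer_io_read() and
 * hammer_io_new().
 */
#if 0
static void
example_associate(hammer_io_t io, struct buf *bp)
{
	bp->b_ops = &hammer_bioops;
	LIST_INSERT_HEAD(&bp->b_dep, &io->worklist, node);
}
#endif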
1383 
1384 /************************************************************************
1385  *				DIRECT IO OPS 				*
1386  ************************************************************************
1387  *
1388  * These functions operate directly on the buffer cache buffer associated
1389  * with a front-end vnode rather than a back-end device vnode.
1390  */
1391 
1392 /*
1393  * Read a buffer associated with a front-end vnode directly from the
1394  * disk media.  The bio may be issued asynchronously.  If leaf is non-NULL
1395  * we validate the CRC.
1396  *
1397  * We must check for the presence of a HAMMER buffer to handle the case
1398  * where the reblocker has rewritten the data (which it does via the HAMMER
1399  * buffer system, not via the high-level vnode buffer cache), but not yet
1400  * committed the buffer to the media.
1401  */
1402 int
1403 hammer_io_direct_read(hammer_mount_t hmp, struct bio *bio,
1404 		      hammer_btree_leaf_elm_t leaf)
1405 {
1406 	hammer_off_t buf_offset;
1407 	hammer_off_t zone2_offset;
1408 	hammer_volume_t volume;
1409 	struct buf *bp;
1410 	struct bio *nbio;
1411 	int vol_no;
1412 	int error;
1413 
1414 	buf_offset = bio->bio_offset;
1415 	KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) ==
1416 		 HAMMER_ZONE_LARGE_DATA);
1417 
1418 	/*
1419 	 * The buffer cache may have an aliased buffer (the reblocker can
1420 	 * write them).  If it does we have to sync any dirty data before
1421 	 * we can build our direct-read.  This is a non-critical code path.
1422 	 */
1423 	bp = bio->bio_buf;
1424 	hammer_sync_buffers(hmp, buf_offset, bp->b_bufsize);
1425 
1426 	/*
1427 	 * Resolve to a zone-2 offset.  The conversion just requires
1428 	 * munging the top 4 bits but we want to abstract it anyway
1429 	 * so the blockmap code can verify the zone assignment.
1430 	 */
1431 	zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, &error);
1432 	if (error)
1433 		goto done;
1434 	KKASSERT((zone2_offset & HAMMER_OFF_ZONE_MASK) ==
1435 		 HAMMER_ZONE_RAW_BUFFER);
1436 
1437 	/*
1438 	 * Resolve volume and raw-offset for 3rd level bio.  The
1439 	 * offset will be specific to the volume.
1440 	 */
1441 	vol_no = HAMMER_VOL_DECODE(zone2_offset);
1442 	volume = hammer_get_volume(hmp, vol_no, &error);
1443 	if (error == 0 && zone2_offset >= volume->maxbuf_off)
1444 		error = EIO;
1445 
1446 	if (error == 0) {
1447 		/*
1448 		 * 3rd level bio
1449 		 */
1450 		nbio = push_bio(bio);
1451 		nbio->bio_offset = volume->ondisk->vol_buf_beg +
1452 				   (zone2_offset & HAMMER_OFF_SHORT_MASK);
1453 		hammer_stats_disk_read += bp->b_bufsize;
1454 		vn_strategy(volume->devvp, nbio);
1455 	}
1456 	hammer_rel_volume(volume, 0);
1457 done:
1458 	if (error) {
1459 		hdkprintf("failed @ %016llx\n", (long long)zone2_offset);
1460 		bp->b_error = error;
1461 		bp->b_flags |= B_ERROR;
1462 		biodone(bio);
1463 	}
1464 	return(error);
1465 }
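
/*
 * Illustrative sketch (not part of the original source): the offset
 * translation performed above, condensed.  A large-data zone-X offset
 * is resolved to a zone-2 raw-buffer offset via the blockmap and then
 * to a volume-relative byte offset for the third-level bio.
 */
#if 0
static hammer_off_t
example_translate(hammer_mount_t hmp, hammer_off_t zoneX_offset,
		  hammer_volume_t *volumep, int *errorp)
{
	hammer_off_t zone2_offset;

	zone2_offset = hammer_blockmap_lookup(hmp, zoneX_offset, errorp);
	if (*errorp)
		return(0);
	*volumep = hammer_get_volume(hmp, HAMMER_VOL_DECODE(zone2_offset),
				     errorp);
	if (*errorp)
		return(0);
	return((*volumep)->ondisk->vol_buf_beg +
	       (zone2_offset & HAMMER_OFF_SHORT_MASK));
}
#endif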
1466 
1467 /*
1468  * This works similarly to hammer_io_direct_read() except instead of
1469  * directly reading from the device into the bio we instead indirectly
1470  * read through the device's buffer cache and then copy the data into
1471  * the bio.
1472  *
1473  * If leaf is non-NULL and validation is enabled, the CRC will be checked.
1474  *
1475  * This routine also executes asynchronously.  It allows hammer strategy
1476  * calls to operate asynchronously when in double_buffer mode (in addition
1477  * to operating asynchronously when in normal mode).
1478  */
1479 int
1480 hammer_io_indirect_read(hammer_mount_t hmp, struct bio *bio,
1481 			hammer_btree_leaf_elm_t leaf)
1482 {
1483 	hammer_off_t buf_offset;
1484 	hammer_off_t zone2_offset;
1485 	hammer_volume_t volume;
1486 	struct buf *bp;
1487 	int vol_no;
1488 	int error;
1489 
1490 	buf_offset = bio->bio_offset;
1491 	KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) ==
1492 		 HAMMER_ZONE_LARGE_DATA);
1493 
1494 	/*
1495 	 * The buffer cache may have an aliased buffer (the reblocker can
1496 	 * write them).  If it does we have to sync any dirty data before
1497 	 * we can build our direct-read.  This is a non-critical code path.
1498 	 */
1499 	bp = bio->bio_buf;
1500 	hammer_sync_buffers(hmp, buf_offset, bp->b_bufsize);
1501 
1502 	/*
1503 	 * Resolve to a zone-2 offset.  The conversion just requires
1504 	 * munging the top 4 bits but we want to abstract it anyway
1505 	 * so the blockmap code can verify the zone assignment.
1506 	 */
1507 	zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, &error);
1508 	if (error)
1509 		goto done;
1510 	KKASSERT((zone2_offset & HAMMER_OFF_ZONE_MASK) ==
1511 		 HAMMER_ZONE_RAW_BUFFER);
1512 
1513 	/*
1514 	 * Resolve volume and raw-offset for 3rd level bio.  The
1515 	 * offset will be specific to the volume.
1516 	 */
1517 	vol_no = HAMMER_VOL_DECODE(zone2_offset);
1518 	volume = hammer_get_volume(hmp, vol_no, &error);
1519 	if (error == 0 && zone2_offset >= volume->maxbuf_off)
1520 		error = EIO;
1521 
1522 	if (error == 0) {
1523 		/*
1524 		 * Convert to the raw volume->devvp offset and acquire
1525 		 * the buf, issuing async I/O if necessary.
1526 		 */
1527 		buf_offset = volume->ondisk->vol_buf_beg +
1528 			     (zone2_offset & HAMMER_OFF_SHORT_MASK);
1529 
1530 		if (leaf && hammer_verify_data) {
1531 			bio->bio_caller_info1.uvalue32 = leaf->data_crc;
1532 			bio->bio_caller_info2.index = 1;
1533 		} else {
1534 			bio->bio_caller_info2.index = 0;
1535 		}
1536 		breadcb(volume->devvp, buf_offset, bp->b_bufsize,
1537 			hammer_indirect_callback, bio);
1538 	}
1539 	hammer_rel_volume(volume, 0);
1540 done:
1541 	if (error) {
1542 		hdkprintf("failed @ %016llx\n", (long long)zone2_offset);
1543 		bp->b_error = error;
1544 		bp->b_flags |= B_ERROR;
1545 		biodone(bio);
1546 	}
1547 	return(error);
1548 }
1549 
1550 /*
1551  * Indirect callback on completion.  bio/bp specify the device-backed
1552  * buffer.  bio->bio_caller_info1.ptr holds obio.
1553  *
1554  * obio/obp is the original regular file buffer.  obio->bio_caller_info*
1555  * contains the crc specification.
1556  *
1557  * We are responsible for calling bpdone() and bqrelse() on bio/bp, and
1558  * for calling biodone() on obio.
1559  */
1560 static void
1561 hammer_indirect_callback(struct bio *bio)
1562 {
1563 	struct buf *bp = bio->bio_buf;
1564 	struct buf *obp;
1565 	struct bio *obio;
1566 
1567 	/*
1568 	 * If BIO_DONE is already set the device buffer was already
1569 	 * fully valid (B_CACHE).  If it is not set then I/O was issued
1570 	 * and we have to run I/O completion as the last bio.
1571 	 *
1572 	 * Nobody is waiting for our device I/O to complete, we are
1573 	 * responsible for bqrelse()ing it which means we also have to do
1574 	 * the equivalent of biowait() and clear BIO_DONE (which breadcb()
1575 	 * may have set).
1576 	 *
1577 	 * Any preexisting device buffer should match the requested size,
1578 	 * but due to big-block recycling and other factors there is some
1579 	 * fragility there, so we assert that the device buffer covers
1580 	 * the request.
1581 	 */
1582 	if ((bio->bio_flags & BIO_DONE) == 0)
1583 		bpdone(bp, 0);
1584 	bio->bio_flags &= ~(BIO_DONE | BIO_SYNC);
1585 
1586 	obio = bio->bio_caller_info1.ptr;
1587 	obp = obio->bio_buf;
1588 
1589 	if (bp->b_flags & B_ERROR) {
1590 		obp->b_flags |= B_ERROR;
1591 		obp->b_error = bp->b_error;
1592 	} else if (obio->bio_caller_info2.index &&
1593 		   obio->bio_caller_info1.uvalue32 !=
1594 		    crc32(bp->b_data, bp->b_bufsize)) {
1595 		obp->b_flags |= B_ERROR;
1596 		obp->b_error = EIO;
1597 	} else {
1598 		KKASSERT(bp->b_bufsize >= obp->b_bufsize);
1599 		bcopy(bp->b_data, obp->b_data, obp->b_bufsize);
1600 		obp->b_resid = 0;
1601 		obp->b_flags |= B_AGE;
1602 	}
1603 	biodone(obio);
1604 	bqrelse(bp);
1605 }
1606 
1607 /*
1608  * Write a buffer associated with a front-end vnode directly to the
1609  * disk media.  The bio may be issued asynchronously.
1610  *
1611  * The BIO is associated with the specified record and RECG_DIRECT_IO
1612  * is set.  The record is added to its object.
1613  */
1614 int
1615 hammer_io_direct_write(hammer_mount_t hmp, struct bio *bio,
1616 		       hammer_record_t record)
1617 {
1618 	hammer_btree_leaf_elm_t leaf = &record->leaf;
1619 	hammer_off_t buf_offset;
1620 	hammer_off_t zone2_offset;
1621 	hammer_volume_t volume;
1622 	hammer_buffer_t buffer;
1623 	struct buf *bp;
1624 	struct bio *nbio;
1625 	char *ptr;
1626 	int vol_no;
1627 	int error;
1628 
1629 	buf_offset = leaf->data_offset;
1630 
1631 	KKASSERT(buf_offset > HAMMER_ZONE_BTREE);
1632 	KKASSERT(bio->bio_buf->b_cmd == BUF_CMD_WRITE);
1633 
1634 	/*
1635 	 * Issue or execute the I/O.  The new memory record must replace
1636 	 * the old one before the I/O completes, otherwise a reacquisition of
1637 	 * the buffer will load the old media data instead of the new.
1638 	 */
1639 	if ((buf_offset & HAMMER_BUFMASK) == 0 &&
1640 	    leaf->data_len >= HAMMER_BUFSIZE) {
1641 		/*
1642 		 * We are using the vnode's bio to write directly to the
1643 		 * media, any hammer_buffer at the same zone-X offset will
1644 		 * now have stale data.
1645 		 */
1646 		zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, &error);
1647 		vol_no = HAMMER_VOL_DECODE(zone2_offset);
1648 		volume = hammer_get_volume(hmp, vol_no, &error);
1649 
1650 		if (error == 0 && zone2_offset >= volume->maxbuf_off)
1651 			error = EIO;
1652 		if (error == 0) {
1653 			bp = bio->bio_buf;
1654 			KKASSERT((bp->b_bufsize & HAMMER_BUFMASK) == 0);
1655 			/*
1656 			hammer_del_buffers(hmp, buf_offset,
1657 					   zone2_offset, bp->b_bufsize);
1658 			*/
1659 
1660 			/*
1661 			 * Second level bio - cached zone2 offset.
1662 			 *
1663 			 * (We can put our bio_done function in either the
1664 			 *  2nd or 3rd level).
1665 			 */
1666 			nbio = push_bio(bio);
1667 			nbio->bio_offset = zone2_offset;
1668 			nbio->bio_done = hammer_io_direct_write_complete;
1669 			nbio->bio_caller_info1.ptr = record;
1670 			record->zone2_offset = zone2_offset;
1671 			record->gflags |= HAMMER_RECG_DIRECT_IO |
1672 					 HAMMER_RECG_DIRECT_INVAL;
1673 
1674 			/*
1675 			 * Third level bio - raw offset specific to the
1676 			 * correct volume.
1677 			 */
1678 			zone2_offset &= HAMMER_OFF_SHORT_MASK;
1679 			nbio = push_bio(nbio);
1680 			nbio->bio_offset = volume->ondisk->vol_buf_beg +
1681 					   zone2_offset;
1682 			hammer_stats_disk_write += bp->b_bufsize;
1683 			hammer_ip_replace_bulk(hmp, record);
1684 			vn_strategy(volume->devvp, nbio);
1685 			hammer_io_flush_mark(volume);
1686 		}
1687 		hammer_rel_volume(volume, 0);
1688 	} else {
1689 		/*
1690 		 * Must fit in a standard HAMMER buffer.  In this case all
1691 		 * consumers use the HAMMER buffer system and RECG_DIRECT_IO
1692 		 * does not need to be set-up.
1693 		 */
1694 		KKASSERT(((buf_offset ^ (buf_offset + leaf->data_len - 1)) & ~HAMMER_BUFMASK64) == 0);
1695 		buffer = NULL;
1696 		ptr = hammer_bread(hmp, buf_offset, &error, &buffer);
1697 		if (error == 0) {
1698 			bp = bio->bio_buf;
1699 			bp->b_flags |= B_AGE;
1700 			hammer_io_modify(&buffer->io, 1);
1701 			bcopy(bp->b_data, ptr, leaf->data_len);
1702 			hammer_io_modify_done(&buffer->io);
1703 			hammer_rel_buffer(buffer, 0);
1704 			bp->b_resid = 0;
1705 			hammer_ip_replace_bulk(hmp, record);
1706 			biodone(bio);
1707 		}
1708 	}
1709 	if (error) {
1710 		/*
1711 		 * Major suckage occurred.  Also note:  The record was
1712 		 * never added to the tree so we do not have to worry
1713 		 * about the backend.
1714 		 */
1715 		hdkprintf("failed @ %016llx\n", (long long)leaf->data_offset);
1716 		bp = bio->bio_buf;
1717 		bp->b_resid = 0;
1718 		bp->b_error = EIO;
1719 		bp->b_flags |= B_ERROR;
1720 		biodone(bio);
1721 		record->flags |= HAMMER_RECF_DELETED_FE;
1722 		hammer_rel_mem_record(record);
1723 	}
1724 	return(error);
1725 }
1726 
1727 /*
1728  * On completion of the BIO this callback must disconnect
1729  * it from the hammer_record and chain to the previous bio.
1730  *
1731  * An I/O error forces the mount to read-only.  Data buffers
1732  * are not B_LOCKED like meta-data buffers are, so we have to
1733  * throw the buffer away to prevent the kernel from retrying.
1734  *
1735  * NOTE: MPSAFE callback, only modify fields we have explicit
1736  *	 access to (the bp and the record->gflags).
1737  */
1738 static
1739 void
1740 hammer_io_direct_write_complete(struct bio *nbio)
1741 {
1742 	struct bio *obio;
1743 	struct buf *bp;
1744 	hammer_record_t record;
1745 	hammer_mount_t hmp;
1746 
1747 	record = nbio->bio_caller_info1.ptr;
1748 	KKASSERT(record != NULL);
1749 	hmp = record->ip->hmp;
1750 
1751 	lwkt_gettoken(&hmp->io_token);
1752 
1753 	bp = nbio->bio_buf;
1754 	obio = pop_bio(nbio);
1755 	if (bp->b_flags & B_ERROR) {
1756 		lwkt_gettoken(&hmp->fs_token);
1757 		hammer_critical_error(hmp, record->ip, bp->b_error,
1758 				      "while writing bulk data");
1759 		lwkt_reltoken(&hmp->fs_token);
1760 		bp->b_flags |= B_INVAL;
1761 	}
1762 	biodone(obio);
1763 
1764 	KKASSERT(record->gflags & HAMMER_RECG_DIRECT_IO);
1765 	if (record->gflags & HAMMER_RECG_DIRECT_WAIT) {
1766 		record->gflags &= ~(HAMMER_RECG_DIRECT_IO |
1767 				    HAMMER_RECG_DIRECT_WAIT);
1768 		/* record can disappear once DIRECT_IO flag is cleared */
1769 		wakeup(&record->flags);
1770 	} else {
1771 		record->gflags &= ~HAMMER_RECG_DIRECT_IO;
1772 		/* record can disappear once DIRECT_IO flag is cleared */
1773 	}
1774 	lwkt_reltoken(&hmp->io_token);
1775 }
1776 
1777 
1778 /*
1779  * This is called before a record is either committed to the B-Tree
1780  * or destroyed, to resolve any associated direct-IO.
1781  *
1782  * (1) We must wait for any direct-IO related to the record to complete.
1783  *
1784  * (2) We must remove any buffer cache aliases for data accessed via
1785  *     leaf->data_offset or zone2_offset so non-direct-IO consumers
1786  *     (the mirroring and reblocking code) do not see stale data.
1787  */
1788 void
1789 hammer_io_direct_wait(hammer_record_t record)
1790 {
1791 	hammer_mount_t hmp = record->ip->hmp;
1792 
1793 	/*
1794 	 * Wait for I/O to complete
1795 	 */
1796 	if (record->gflags & HAMMER_RECG_DIRECT_IO) {
1797 		lwkt_gettoken(&hmp->io_token);
1798 		while (record->gflags & HAMMER_RECG_DIRECT_IO) {
1799 			record->gflags |= HAMMER_RECG_DIRECT_WAIT;
1800 			tsleep(&record->flags, 0, "hmdiow", 0);
1801 		}
1802 		lwkt_reltoken(&hmp->io_token);
1803 	}
1804 
1805 	/*
1806 	 * Invalidate any related buffer cache aliases associated with the
1807 	 * backing device.  This is needed because the buffer cache buffer
1808 	 * for file data is associated with the file vnode, not the backing
1809 	 * device vnode.
1810 	 *
1811 	 * XXX I do not think this case can occur any more now that
1812 	 * reservations ensure that all such buffers are removed before
1813 	 * an area can be reused.
1814 	 */
1815 	if (record->gflags & HAMMER_RECG_DIRECT_INVAL) {
1816 		KKASSERT(record->leaf.data_offset);
1817 		hammer_del_buffers(hmp, record->leaf.data_offset,
1818 				   record->zone2_offset, record->leaf.data_len,
1819 				   1);
1820 		record->gflags &= ~HAMMER_RECG_DIRECT_INVAL;
1821 	}
1822 }
1823 
1824 /*
1825  * This is called to remove the second-level cached zone-2 offset from
1826  * frontend buffer cache buffers, now stale due to a data relocation.
1827  * These offsets are generated by cluster_read() via VOP_BMAP, or directly
1828  * by hammer_vop_strategy_read().
1829  *
1830  * This is rather nasty because here we have something like the reblocker
1831  * scanning the raw B-Tree with no held references on anything, really,
1832  * other than a shared lock on the B-Tree node, and we have to access the
1833  * frontend's buffer cache to check for and clean out the association.
1834  * Specifically, if the reblocker is moving data on the disk, these cached
1835  * offsets will become invalid.
1836  *
1837  * Only data record types associated with the large-data zone are subject
1838  * to direct-io and need to be checked.
1839  *
1840  */
1841 void
1842 hammer_io_direct_uncache(hammer_mount_t hmp, hammer_btree_leaf_elm_t leaf)
1843 {
1844 	struct hammer_inode_info iinfo;
1845 	int zone;
1846 
1847 	if (leaf->base.rec_type != HAMMER_RECTYPE_DATA)
1848 		return;
1849 	zone = HAMMER_ZONE_DECODE(leaf->data_offset);
1850 	if (zone != HAMMER_ZONE_LARGE_DATA_INDEX)
1851 		return;
1852 	iinfo.obj_id = leaf->base.obj_id;
1853 	iinfo.obj_asof = 0;	/* unused */
1854 	iinfo.obj_localization = leaf->base.localization &
1855 				 HAMMER_LOCALIZE_PSEUDOFS_MASK;
1856 	iinfo.u.leaf = leaf;
1857 	hammer_scan_inode_snapshots(hmp, &iinfo,
1858 				    hammer_io_direct_uncache_callback,
1859 				    leaf);
1860 }
1861 
1862 static int
1863 hammer_io_direct_uncache_callback(hammer_inode_t ip, void *data)
1864 {
1865 	hammer_inode_info_t iinfo = data;
1866 	hammer_off_t file_offset;
1867 	struct vnode *vp;
1868 	struct buf *bp;
1869 	int blksize;
1870 
1871 	if (ip->vp == NULL)
1872 		return(0);
1873 	file_offset = iinfo->u.leaf->base.key - iinfo->u.leaf->data_len;
1874 	blksize = iinfo->u.leaf->data_len;
1875 	KKASSERT((blksize & HAMMER_BUFMASK) == 0);
1876 
1877 	/*
1878 	 * Warning: FINDBLK_TEST return stable storage but not stable
1879 	 * Warning: FINDBLK_TEST returns stable storage but not stable
1880 	 */
1881 	hammer_ref(&ip->lock);
1882 	if (hammer_get_vnode(ip, &vp) == 0) {
1883 		if ((bp = findblk(ip->vp, file_offset, FINDBLK_TEST)) != NULL &&
1884 		    bp->b_bio2.bio_offset != NOOFFSET) {
1885 			bp = getblk(ip->vp, file_offset, blksize, 0, 0);
1886 			bp->b_bio2.bio_offset = NOOFFSET;
1887 			brelse(bp);
1888 		}
1889 		vput(vp);
1890 	}
1891 	hammer_rel_inode(ip, 0);
1892 	return(0);
1893 }
1894 
1895 
1896 /*
1897  * This function is called when writes may have occurred on the volume,
1898  * indicating that the device may be holding cached writes.
1899  */
1900 static void
1901 hammer_io_flush_mark(hammer_volume_t volume)
1902 {
1903 	atomic_set_int(&volume->vol_flags, HAMMER_VOLF_NEEDFLUSH);
1904 }
1905 
1906 /*
1907  * This function ensures that the device has flushed any cached writes out.
1908  */
1909 void
1910 hammer_io_flush_sync(hammer_mount_t hmp)
1911 {
1912 	hammer_volume_t volume;
1913 	struct buf *bp_base = NULL;
1914 	struct buf *bp;
1915 
1916 	RB_FOREACH(volume, hammer_vol_rb_tree, &hmp->rb_vols_root) {
1917 		if (volume->vol_flags & HAMMER_VOLF_NEEDFLUSH) {
1918 			atomic_clear_int(&volume->vol_flags,
1919 					 HAMMER_VOLF_NEEDFLUSH);
1920 			bp = getpbuf(NULL);
1921 			bp->b_bio1.bio_offset = 0;
1922 			bp->b_bufsize = 0;
1923 			bp->b_bcount = 0;
1924 			bp->b_cmd = BUF_CMD_FLUSH;
1925 			bp->b_bio1.bio_caller_info1.cluster_head = bp_base;
1926 			bp->b_bio1.bio_done = biodone_sync;
1927 			bp->b_bio1.bio_flags |= BIO_SYNC;
1928 			bp_base = bp;
1929 			vn_strategy(volume->devvp, &bp->b_bio1);
1930 		}
1931 	}
1932 	while ((bp = bp_base) != NULL) {
1933 		bp_base = bp->b_bio1.bio_caller_info1.cluster_head;
1934 		biowait(&bp->b_bio1, "hmrFLS");
1935 		relpbuf(bp, NULL);
1936 	}
1937 }
1938 
1939 /*
1940  * Limit the amount of backlog which we allow to build up
1941  */
1942 void
1943 hammer_io_limit_backlog(hammer_mount_t hmp)
1944 {
1945 	waitrunningbufspace();
1946 }
1947