xref: /dragonfly/sys/vfs/hammer/hammer_io.c (revision 10cbe914)
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.55 2008/09/15 17:02:49 dillon Exp $
35  */
36 /*
37  * IO Primitives and buffer cache management
38  *
39  * All major data-tracking structures in HAMMER contain a struct hammer_io
40  * which is used to manage their backing store.  We use filesystem buffers
41  * for backing store and we leave them passively associated with their
42  * HAMMER structures.
43  *
44  * If the kernel tries to destroy a passively associated buf which we cannot
45  * yet let go we set B_LOCKED in the buffer and then actively release it
46  * later when we can.
47  *
48  * The io_token is required for anything which might race bioops and bio_done
49  * callbacks, with one exception: A successful hammer_try_interlock_norefs().
50  * The fs_token will be held in all other cases.
51  */
52 
53 #include "hammer.h"
54 #include <sys/fcntl.h>
55 #include <sys/nlookup.h>
56 #include <sys/buf.h>
57 #include <sys/buf2.h>
58 
59 static void hammer_io_modify(hammer_io_t io, int count);
60 static void hammer_io_deallocate(struct buf *bp);
61 #if 0
62 static void hammer_io_direct_read_complete(struct bio *nbio);
63 #endif
64 static void hammer_io_direct_write_complete(struct bio *nbio);
65 static int hammer_io_direct_uncache_callback(hammer_inode_t ip, void *data);
66 static void hammer_io_set_modlist(struct hammer_io *io);
67 static void hammer_io_flush_mark(hammer_volume_t volume);
68 
69 /*
70  * Initialize a new, already-zero'd hammer_io structure, or reinitialize
71  * an existing hammer_io structure which may have switched to another type.
72  */
73 void
74 hammer_io_init(hammer_io_t io, hammer_volume_t volume, enum hammer_io_type type)
75 {
76 	io->volume = volume;
77 	io->hmp = volume->io.hmp;
78 	io->type = type;
79 }
80 
81 /*
82  * Helper routine to disassociate a buffer cache buffer from an I/O
83  * structure.  The io must be interlocked and marked appropriately for
84  * reclamation.
85  *
86  * The io must be in a released state with the io->bp owned and
87  * locked by the caller of this function.  When not called from an
88  * io_deallocate() this cannot race an io_deallocate() since the
89  * kernel would be unable to get the buffer lock in that case.
90  * (The released state in this case means we own the bp, not the
91  * hammer_io structure).
92  *
93  * The io may have 0 or 1 references depending on who called us.  The
94  * caller is responsible for dealing with the refs.
95  *
96  * This call can only be made when no action is required on the buffer.
97  *
98  * This function is guaranteed not to race against anything because we
99  * own both the io lock and the bp lock and are interlocked with no
100  * references.
101  */
102 static void
103 hammer_io_disassociate(hammer_io_structure_t iou)
104 {
105 	struct buf *bp = iou->io.bp;
106 
107 	KKASSERT(iou->io.released);
108 	KKASSERT(iou->io.modified == 0);
109 	KKASSERT(LIST_FIRST(&bp->b_dep) == (void *)iou);
110 	buf_dep_init(bp);
111 	iou->io.bp = NULL;
112 
113 	/*
114 	 * If the buffer was locked someone wanted to get rid of it.
115 	 */
116 	if (bp->b_flags & B_LOCKED) {
117 		atomic_add_int(&hammer_count_io_locked, -1);
118 		bp->b_flags &= ~B_LOCKED;
119 	}
120 	if (iou->io.reclaim) {
121 		bp->b_flags |= B_NOCACHE|B_RELBUF;
122 		iou->io.reclaim = 0;
123 	}
124 
125 	switch(iou->io.type) {
126 	case HAMMER_STRUCTURE_VOLUME:
127 		iou->volume.ondisk = NULL;
128 		break;
129 	case HAMMER_STRUCTURE_DATA_BUFFER:
130 	case HAMMER_STRUCTURE_META_BUFFER:
131 	case HAMMER_STRUCTURE_UNDO_BUFFER:
132 		iou->buffer.ondisk = NULL;
133 		break;
134 	case HAMMER_STRUCTURE_DUMMY:
135 		panic("hammer_io_disassociate: bad io type");
136 		break;
137 	}
138 }
139 
140 /*
141  * Wait for any physical IO to complete
142  *
143  * XXX we aren't interlocked against a spinlock or anything so there
144  *     is a small window in the interlock / io->running == 0 test.
145  */
146 void
147 hammer_io_wait(hammer_io_t io)
148 {
149 	if (io->running) {
150 		hammer_mount_t hmp = io->hmp;
151 
152 		lwkt_gettoken(&hmp->io_token);
153 		while (io->running) {
154 			io->waiting = 1;
155 			tsleep_interlock(io, 0);
156 			if (io->running)
157 				tsleep(io, PINTERLOCKED, "hmrflw", hz);
158 		}
159 		lwkt_reltoken(&hmp->io_token);
160 	}
161 }
162 
163 /*
164  * Wait for all currently queued HAMMER-initiated I/Os to complete.
165  *
166  * This is not supposed to count direct I/Os but some can leak
167  * through (for non-full-sized direct I/Os).
168  */
169 void
170 hammer_io_wait_all(hammer_mount_t hmp, const char *ident, int doflush)
171 {
172 	struct hammer_io iodummy;
173 	hammer_io_t io;
174 
175 	/*
176 	 * Degenerate case, no I/O is running
177 	 */
178 	lwkt_gettoken(&hmp->io_token);
179 	if (TAILQ_EMPTY(&hmp->iorun_list)) {
180 		lwkt_reltoken(&hmp->io_token);
181 		if (doflush)
182 			hammer_io_flush_sync(hmp);
183 		return;
184 	}
185 	bzero(&iodummy, sizeof(iodummy));
186 	iodummy.type = HAMMER_STRUCTURE_DUMMY;
187 
188 	/*
189 	 * Add placemarker and then wait until it becomes the head of
190 	 * the list.
191 	 */
192 	TAILQ_INSERT_TAIL(&hmp->iorun_list, &iodummy, iorun_entry);
193 	while (TAILQ_FIRST(&hmp->iorun_list) != &iodummy) {
194 		tsleep(&iodummy, 0, ident, 0);
195 	}
196 
197 	/*
198 	 * Chain in case several placemarkers are present.
199 	 */
200 	TAILQ_REMOVE(&hmp->iorun_list, &iodummy, iorun_entry);
201 	io = TAILQ_FIRST(&hmp->iorun_list);
202 	if (io && io->type == HAMMER_STRUCTURE_DUMMY)
203 		wakeup(io);
204 	lwkt_reltoken(&hmp->io_token);
205 
206 	if (doflush)
207 		hammer_io_flush_sync(hmp);
208 }
209 
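/*
 * Example (illustrative sketch only, compiled out): a hypothetical
 * teardown-style caller draining all queued HAMMER I/O and then forcing
 * the volumes to flush their write caches via the doflush argument.
 * The helper name and the "hmrwai" wmesg are not part of HAMMER.
 */
#if 0
static void
hammer_drain_io_example(hammer_mount_t hmp)
{
	/*
	 * Blocks until our placemarker reaches the head of iorun_list,
	 * then calls hammer_io_flush_sync() because doflush != 0.
	 */
	hammer_io_wait_all(hmp, "hmrwai", 1);
}
#endif
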
210 /*
211  * Clear a flagged error condition on an I/O buffer.  The caller must hold
212  * its own ref on the buffer.
213  */
214 void
215 hammer_io_clear_error(struct hammer_io *io)
216 {
217 	hammer_mount_t hmp = io->hmp;
218 
219 	lwkt_gettoken(&hmp->io_token);
220 	if (io->ioerror) {
221 		io->ioerror = 0;
222 		hammer_rel(&io->lock);
223 		KKASSERT(hammer_isactive(&io->lock));
224 	}
225 	lwkt_reltoken(&hmp->io_token);
226 }
227 
228 void
229 hammer_io_clear_error_noassert(struct hammer_io *io)
230 {
231 	hammer_mount_t hmp = io->hmp;
232 
233 	lwkt_gettoken(&hmp->io_token);
234 	if (io->ioerror) {
235 		io->ioerror = 0;
236 		hammer_rel(&io->lock);
237 	}
238 	lwkt_reltoken(&hmp->io_token);
239 }
240 
241 /*
242  * This is an advisory function only which tells the buffer cache
243  * the bp is not a meta-data buffer, even though it is backed by
244  * a block device.
245  *
246  * This is used by HAMMER's reblocking code to avoid trying to
247  * swapcache the filesystem's data when it is read or written
248  * by the reblocking code.
249  *
250  * The caller has a ref on the buffer preventing the bp from
251  * being disassociated from it.
252  */
253 void
254 hammer_io_notmeta(hammer_buffer_t buffer)
255 {
256 	if ((buffer->io.bp->b_flags & B_NOTMETA) == 0) {
257 		hammer_mount_t hmp = buffer->io.hmp;
258 
259 		lwkt_gettoken(&hmp->io_token);
260 		buffer->io.bp->b_flags |= B_NOTMETA;
261 		lwkt_reltoken(&hmp->io_token);
262 	}
263 }
264 
265 /*
266  * Load bp for a HAMMER structure.  The io must be exclusively locked by
267  * the caller.
268  *
269  * This routine is mostly used on meta-data and small-data blocks.  Generally
270  * speaking HAMMER assumes some locality of reference and will cluster.
271  *
272  * Note that the caller (hammer_ondisk.c) may place further restrictions
273  * on clusterability via the limit (in bytes).  Typically large-data
274  * zones cannot be clustered due to their mixed buffer sizes.  This is
275  * not an issue since such clustering occurs in hammer_vnops at the
276  * regular file layer, whereas this is the buffered block device layer.
277  *
278  * No I/O callbacks can occur while we hold the buffer locked.
279  */
280 int
281 hammer_io_read(struct vnode *devvp, struct hammer_io *io, int limit)
282 {
283 	struct buf *bp;
284 	int   error;
285 
286 	if ((bp = io->bp) == NULL) {
287 		atomic_add_int(&hammer_count_io_running_read, io->bytes);
288 		if (hammer_cluster_enable && limit > io->bytes) {
289 			error = cluster_read(devvp, io->offset + limit,
290 					     io->offset, io->bytes,
291 					     HAMMER_CLUSTER_SIZE,
292 					     HAMMER_CLUSTER_SIZE,
293 					     &io->bp);
294 		} else {
295 			error = bread(devvp, io->offset, io->bytes, &io->bp);
296 		}
297 		hammer_stats_disk_read += io->bytes;
298 		atomic_add_int(&hammer_count_io_running_read, -io->bytes);
299 
300 		/*
301 		 * The code generally assumes b_ops/b_dep has been set-up,
302 		 * even if we error out here.
303 		 */
304 		bp = io->bp;
305 		if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) {
306 			const char *metatype;
307 
308 			switch(io->type) {
309 			case HAMMER_STRUCTURE_VOLUME:
310 				metatype = "volume";
311 				break;
312 			case HAMMER_STRUCTURE_META_BUFFER:
313 				switch(((struct hammer_buffer *)io)->
314 					zoneX_offset & HAMMER_OFF_ZONE_MASK) {
315 				case HAMMER_ZONE_BTREE:
316 					metatype = "btree";
317 					break;
318 				case HAMMER_ZONE_META:
319 					metatype = "meta";
320 					break;
321 				case HAMMER_ZONE_FREEMAP:
322 					metatype = "freemap";
323 					break;
324 				default:
325 					metatype = "meta?";
326 					break;
327 				}
328 				break;
329 			case HAMMER_STRUCTURE_DATA_BUFFER:
330 				metatype = "data";
331 				break;
332 			case HAMMER_STRUCTURE_UNDO_BUFFER:
333 				metatype = "undo";
334 				break;
335 			default:
336 				metatype = "unknown";
337 				break;
338 			}
339 			kprintf("doff %016jx %s\n",
340 				(intmax_t)bp->b_bio2.bio_offset,
341 				metatype);
342 		}
343 		bp->b_flags &= ~B_IODEBUG;
344 		bp->b_ops = &hammer_bioops;
345 		KKASSERT(LIST_FIRST(&bp->b_dep) == NULL);
346 
347 		/* io->worklist is locked by the io lock */
348 		LIST_INSERT_HEAD(&bp->b_dep, &io->worklist, node);
349 		BUF_KERNPROC(bp);
350 		KKASSERT(io->modified == 0);
351 		KKASSERT(io->running == 0);
352 		KKASSERT(io->waiting == 0);
353 		io->released = 0;	/* we hold an active lock on bp */
354 	} else {
355 		error = 0;
356 	}
357 	return(error);
358 }
359 
360 /*
361  * Similar to hammer_io_read() but returns a zero'd out buffer instead.
362  * Must be called with the IO exclusively locked.
363  *
364  * vfs_bio_clrbuf() is kinda nasty; enforce serialization against background
365  * I/O by forcing the buffer to not be in a released state before calling
366  * it.
367  *
368  * This function will also mark the IO as modified but it will not
369  * increment the modify_refs count.
370  *
371  * No I/O callbacks can occur while we hold the buffer locked.
372  */
373 int
374 hammer_io_new(struct vnode *devvp, struct hammer_io *io)
375 {
376 	struct buf *bp;
377 
378 	if ((bp = io->bp) == NULL) {
379 		io->bp = getblk(devvp, io->offset, io->bytes, 0, 0);
380 		bp = io->bp;
381 		bp->b_ops = &hammer_bioops;
382 		KKASSERT(LIST_FIRST(&bp->b_dep) == NULL);
383 
384 		/* io->worklist is locked by the io lock */
385 		LIST_INSERT_HEAD(&bp->b_dep, &io->worklist, node);
386 		io->released = 0;
387 		KKASSERT(io->running == 0);
388 		io->waiting = 0;
389 		BUF_KERNPROC(bp);
390 	} else {
391 		if (io->released) {
392 			regetblk(bp);
393 			BUF_KERNPROC(bp);
394 			io->released = 0;
395 		}
396 	}
397 	hammer_io_modify(io, 0);
398 	vfs_bio_clrbuf(bp);
399 	return(0);
400 }
401 
402 /*
403  * Advance the activity count on the underlying buffer because
404  * HAMMER does not getblk/brelse on every access.
405  *
406  * The io->bp cannot go away while the buffer is referenced.
407  */
408 void
409 hammer_io_advance(struct hammer_io *io)
410 {
411 	if (io->bp)
412 		buf_act_advance(io->bp);
413 }
414 
415 /*
416  * Remove potential device level aliases against buffers managed by high level
417  * vnodes.  Aliases can also be created due to mixed buffer sizes or via
418  * direct access to the backing store device.
419  *
420  * This is nasty because the buffers are also VMIO-backed.  Even if a buffer
421  * does not exist its backing VM pages might, and we have to invalidate
422  * those as well or a getblk() will reinstate them.
423  *
424  * Buffer cache buffers associated with hammer_buffers cannot be
425  * invalidated.
426  */
427 int
428 hammer_io_inval(hammer_volume_t volume, hammer_off_t zone2_offset)
429 {
430 	hammer_io_structure_t iou;
431 	hammer_mount_t hmp;
432 	hammer_off_t phys_offset;
433 	struct buf *bp;
434 	int error;
435 
436 	hmp = volume->io.hmp;
437 	lwkt_gettoken(&hmp->io_token);
438 
439 	/*
440 	 * If a device buffer already exists for the specified physical
441 	 * offset use that, otherwise instantiate a buffer to cover any
442 	 * related VM pages, set BNOCACHE, and brelse().
443 	 * related VM pages, set B_NOCACHE, and brelse().
444 	phys_offset = volume->ondisk->vol_buf_beg +
445 		      (zone2_offset & HAMMER_OFF_SHORT_MASK);
446 	if ((bp = findblk(volume->devvp, phys_offset, 0)) != NULL)
447 		bremfree(bp);
448 	else
449 		bp = getblk(volume->devvp, phys_offset, HAMMER_BUFSIZE, 0, 0);
450 
451 	if ((iou = (void *)LIST_FIRST(&bp->b_dep)) != NULL) {
452 #if 0
453 		hammer_ref(&iou->io.lock);
454 		hammer_io_clear_modify(&iou->io, 1);
455 		bundirty(bp);
456 		iou->io.released = 0;
457 		BUF_KERNPROC(bp);
458 		iou->io.reclaim = 1;
459 		iou->io.waitdep = 1;	/* XXX this is a fs_token field */
460 		KKASSERT(hammer_isactive(&iou->io.lock) == 1);
461 		hammer_rel_buffer(&iou->buffer, 0);
462 		/*hammer_io_deallocate(bp);*/
463 #endif
464 		bqrelse(bp);
465 		error = EAGAIN;
466 	} else {
467 		KKASSERT((bp->b_flags & B_LOCKED) == 0);
468 		bundirty(bp);
469 		bp->b_flags |= B_NOCACHE|B_RELBUF;
470 		brelse(bp);
471 		error = 0;
472 	}
473 	lwkt_reltoken(&hmp->io_token);
474 	return(error);
475 }
476 
477 /*
478  * This routine is called on the last reference to a hammer structure.
479  * The io must be interlocked with a refcount of zero.  The hammer structure
480  * will remain interlocked on return.
481  *
482  * This routine may return a non-NULL bp to the caller for disposal.
483  * The caller typically brelse()'s the bp.
484  *
485  * The bp may or may not still be passively associated with the IO.  It
486  * will remain passively associated if it is unreleasable (e.g. a modified
487  * meta-data buffer).
488  *
489  * The only requirement here is that modified meta-data and volume-header
490  * buffers may NOT be disassociated from the IO structure, and consequently
491  * we also leave such buffers actively associated with the IO if they already
492  * are (since the kernel can't do anything with them anyway).  Only the
493  * flusher is allowed to write such buffers out.  Modified pure-data and
494  * undo buffers are returned to the kernel but left passively associated
495  * so we can track when the kernel writes the bp out.
496  */
497 struct buf *
498 hammer_io_release(struct hammer_io *io, int flush)
499 {
500 	union hammer_io_structure *iou = (void *)io;
501 	struct buf *bp;
502 
503 	if ((bp = io->bp) == NULL)
504 		return(NULL);
505 
506 	/*
507 	 * Try to flush a dirty IO to disk if asked to by the
508 	 * caller or if the kernel tried to flush the buffer in the past.
509 	 *
510 	 * Kernel-initiated flushes are only allowed for pure-data buffers.
511 	 * Meta-data and volume buffers can only be flushed explicitly
512 	 * by HAMMER.
513 	 */
514 	if (io->modified) {
515 		if (flush) {
516 			hammer_io_flush(io, 0);
517 		} else if (bp->b_flags & B_LOCKED) {
518 			switch(io->type) {
519 			case HAMMER_STRUCTURE_DATA_BUFFER:
520 				hammer_io_flush(io, 0);
521 				break;
522 			case HAMMER_STRUCTURE_UNDO_BUFFER:
523 				hammer_io_flush(io, hammer_undo_reclaim(io));
524 				break;
525 			default:
526 				break;
527 			}
528 		} /* else no explicit request to flush the buffer */
529 	}
530 
531 	/*
532 	 * Wait for the IO to complete if asked to.  This occurs when
533 	 * the buffer must be disposed of definitively during an umount
534 	 * or buffer invalidation.
535 	 */
536 	if (io->waitdep && io->running) {
537 		hammer_io_wait(io);
538 	}
539 
540 	/*
541 	 * Return control of the buffer to the kernel (with the proviso
542 	 * that our bioops can override kernel decisions with regard to
543 	 * the buffer).
544 	 */
545 	if ((flush || io->reclaim) && io->modified == 0 && io->running == 0) {
546 		/*
547 		 * Always disassociate the bp if an explicit flush
548 		 * was requested and the IO completed with no error
549 		 * (so unmount can really clean up the structure).
550 		 */
551 		if (io->released) {
552 			regetblk(bp);
553 			BUF_KERNPROC(bp);
554 		} else {
555 			io->released = 1;
556 		}
557 		hammer_io_disassociate((hammer_io_structure_t)io);
558 		/* return the bp */
559 	} else if (io->modified) {
560 		/*
561 		 * Only certain IO types can be released to the kernel if
562 		 * the buffer has been modified.
563 		 *
564 		 * volume and meta-data IO types may only be explicitly
565 		 * flushed by HAMMER.
566 		 */
567 		switch(io->type) {
568 		case HAMMER_STRUCTURE_DATA_BUFFER:
569 		case HAMMER_STRUCTURE_UNDO_BUFFER:
570 			if (io->released == 0) {
571 				io->released = 1;
572 				bdwrite(bp);
573 			}
574 			break;
575 		default:
576 			break;
577 		}
578 		bp = NULL;	/* bp left associated */
579 	} else if (io->released == 0) {
580 		/*
581 		 * Clean buffers can be generally released to the kernel.
582 		 * We leave the bp passively associated with the HAMMER
583 		 * structure and use bioops to disconnect it later on
584 		 * if the kernel wants to discard the buffer.
585 		 *
586 		 * We can steal the structure's ownership of the bp.
587 		 */
588 		io->released = 1;
589 		if (bp->b_flags & B_LOCKED) {
590 			hammer_io_disassociate(iou);
591 			/* return the bp */
592 		} else {
593 			if (io->reclaim) {
594 				hammer_io_disassociate(iou);
595 				/* return the bp */
596 			} else {
597 				/* return the bp (bp passively associated) */
598 			}
599 		}
600 	} else {
601 		/*
602 	 * A released buffer is passively associated with our
603 		 * hammer_io structure.  The kernel cannot destroy it
604 		 * without making a bioops call.  If the kernel (B_LOCKED)
605 		 * or we (reclaim) requested that the buffer be destroyed
606 		 * we destroy it, otherwise we do a quick get/release to
607 		 * reset its position in the kernel's LRU list.
608 		 *
609 		 * Leaving the buffer passively associated allows us to
610 		 * use the kernel's LRU buffer flushing mechanisms rather
611 	 * than rolling our own.
612 		 *
613 		 * XXX there are two ways of doing this.  We can re-acquire
614 		 * and passively release to reset the LRU, or not.
615 		 */
616 		if (io->running == 0) {
617 			regetblk(bp);
618 			if ((bp->b_flags & B_LOCKED) || io->reclaim) {
619 				hammer_io_disassociate(iou);
620 				/* return the bp */
621 			} else {
622 				/* return the bp (bp passively associated) */
623 			}
624 		} else {
625 			/*
626 			 * bp is left passively associated but we do not
627 			 * try to reacquire it.  Interactions with the io
628 			 * structure will occur on completion of the bp's
629 			 * I/O.
630 			 */
631 			bp = NULL;
632 		}
633 	}
634 	return(bp);
635 }
636 
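/*
 * Example (illustrative sketch only, compiled out): the load/access/release
 * cycle a hypothetical caller such as hammer_ondisk.c follows.  The helper
 * name is made up and error handling plus reference counting are greatly
 * simplified; this only demonstrates the hammer_io_read()/hammer_io_release()
 * pairing described above.
 */
#if 0
static int
hammer_buffer_cycle_example(hammer_volume_t volume, hammer_buffer_t buffer)
{
	struct buf *bp;
	int error;

	/*
	 * hammer_io_read() must be called with the io exclusively locked
	 * and leaves io->released clear (we hold the bp) on success.
	 */
	hammer_lock_ex(&buffer->io.lock);
	error = hammer_io_read(volume->devvp, &buffer->io, HAMMER_BUFSIZE);
	hammer_unlock(&buffer->io.lock);
	if (error)
		return(error);

	/* ... access the buffer's ondisk contents here ... */

	/*
	 * hammer_io_release() may hand the bp back for disposal; the
	 * typical disposition is a brelse().
	 */
	bp = hammer_io_release(&buffer->io, 0);
	if (bp)
		brelse(bp);
	return(0);
}
#endif
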
637 /*
638  * This routine is called with a locked IO when a flush is desired and
639  * no other references to the structure exist other than ours.  This
640  * routine is ONLY called when HAMMER believes it is safe to flush a
641  * potentially modified buffer out.
642  *
643  * The locked io or io reference prevents a flush from being initiated
644  * by the kernel.
645  */
646 void
647 hammer_io_flush(struct hammer_io *io, int reclaim)
648 {
649 	struct buf *bp;
650 	hammer_mount_t hmp;
651 
652 	/*
653 	 * Degenerate case - nothing to flush if nothing is dirty.
654 	 */
655 	if (io->modified == 0)
656 		return;
657 
658 	KKASSERT(io->bp);
659 	KKASSERT(io->modify_refs <= 0);
660 
661 	/*
662 	 * Acquire ownership of the bp, particularly before we clear our
663 	 * modified flag.
664 	 *
665 	 * We are going to bawrite() this bp.  Don't leave a window where
666 	 * io->released is set, we actually own the bp rather than our
667 	 * buffer.
668 	 *
669 	 * The io_token should not be required here as the locked io or io ref prevents a kernel-initiated flush from racing us.
670 	 */
671 	hmp = io->hmp;
672 	bp = io->bp;
673 	if (io->released) {
674 		regetblk(bp);
675 		/* BUF_KERNPROC(io->bp); */
676 		/* io->released = 0; */
677 		KKASSERT(io->released);
678 		KKASSERT(io->bp == bp);
679 	} else {
680 		io->released = 1;
681 	}
682 
683 	if (reclaim) {
684 		io->reclaim = 1;
685 		if ((bp->b_flags & B_LOCKED) == 0) {
686 			bp->b_flags |= B_LOCKED;
687 			atomic_add_int(&hammer_count_io_locked, 1);
688 		}
689 	}
690 
691 	/*
692 	 * Acquire exclusive access to the bp and then clear the modified
693 	 * state of the buffer prior to issuing I/O to interlock any
694 	 * modifications made while the I/O is in progress.  This shouldn't
695 	 * happen anyway but losing data would be worse.  The modified bit
696 	 * will be rechecked after the IO completes.
697 	 *
698 	 * NOTE: This call also finalizes the buffer's content (inval == 0).
699 	 *
700 	 * This is only legal when lock.refs == 1 (otherwise we might clear
701 	 * the modified bit while there are still users of the cluster
702 	 * modifying the data).
703 	 *
704 	 * Do this before potentially blocking so any attempt to modify the
705 	 * ondisk while we are blocked blocks waiting for us.
706 	 */
707 	hammer_ref(&io->lock);
708 	hammer_io_clear_modify(io, 0);
709 	hammer_rel(&io->lock);
710 
711 	if (hammer_debug_io & 0x0002)
712 		kprintf("hammer io_write %016jx\n", bp->b_bio1.bio_offset);
713 
714 	/*
715 	 * Transfer ownership to the kernel and initiate I/O.
716 	 *
717 	 * NOTE: We do not hold io_token so an atomic op is required to
718 	 *	 update io_running_space.
719 	 */
720 	io->running = 1;
721 	atomic_add_int(&hmp->io_running_space, io->bytes);
722 	atomic_add_int(&hammer_count_io_running_write, io->bytes);
723 	lwkt_gettoken(&hmp->io_token);
724 	TAILQ_INSERT_TAIL(&hmp->iorun_list, io, iorun_entry);
725 	lwkt_reltoken(&hmp->io_token);
726 	bawrite(bp);
727 	hammer_io_flush_mark(io->volume);
728 }
729 
730 /************************************************************************
731  *				BUFFER DIRTYING				*
732  ************************************************************************
733  *
734  * These routines deal with dependencies created when IO buffers get
735  * modified.  The caller must call hammer_modify_*() on a referenced
736  * HAMMER structure prior to modifying its on-disk data.
737  *
738  * Any intent to modify an IO buffer acquires the related bp and imposes
739  * various write ordering dependancies.
740  */
741 
742 /*
743  * Mark a HAMMER structure as undergoing modification.  Meta-data buffers
744  * are locked until the flusher can deal with them; pure data buffers
745  * can be written out.
746  *
747  * The referenced io prevents races.
748  */
749 static
750 void
751 hammer_io_modify(hammer_io_t io, int count)
752 {
753 	/*
754 	 * io->modify_refs must be >= 0
755 	 */
756 	while (io->modify_refs < 0) {
757 		io->waitmod = 1;
758 		tsleep(io, 0, "hmrmod", 0);
759 	}
760 
761 	/*
762 	 * Shortcut if nothing to do.
763 	 */
764 	KKASSERT(hammer_isactive(&io->lock) && io->bp != NULL);
765 	io->modify_refs += count;
766 	if (io->modified && io->released == 0)
767 		return;
768 
769 	/*
770 	 * NOTE: It is important not to set the modified bit
771 	 *	 until after we have acquired the bp or we risk
772 	 *	 racing against checkwrite.
773 	 */
774 	hammer_lock_ex(&io->lock);
775 	if (io->released) {
776 		regetblk(io->bp);
777 		BUF_KERNPROC(io->bp);
778 		io->released = 0;
779 	}
780 	if (io->modified == 0) {
781 		hammer_io_set_modlist(io);
782 		io->modified = 1;
783 	}
784 	hammer_unlock(&io->lock);
785 }
786 
787 static __inline
788 void
789 hammer_io_modify_done(hammer_io_t io)
790 {
791 	KKASSERT(io->modify_refs > 0);
792 	--io->modify_refs;
793 	if (io->modify_refs == 0 && io->waitmod) {
794 		io->waitmod = 0;
795 		wakeup(io);
796 	}
797 }
798 
799 /*
800  * The write interlock blocks other threads trying to modify a buffer
801  * (they block in hammer_io_modify()) after us, or blocks us while other
802  * threads are in the middle of modifying a buffer.
803  *
804  * The caller also has a ref on the io; however, if we are not careful
805  * we will race bioops callbacks (checkwrite).  To deal with this
806  * we must at least acquire and release the io_token, and it is probably
807  * better to hold it through the setting of modify_refs.
808  */
809 void
810 hammer_io_write_interlock(hammer_io_t io)
811 {
812 	hammer_mount_t hmp = io->hmp;
813 
814 	lwkt_gettoken(&hmp->io_token);
815 	while (io->modify_refs != 0) {
816 		io->waitmod = 1;
817 		tsleep(io, 0, "hmrmod", 0);
818 	}
819 	io->modify_refs = -1;
820 	lwkt_reltoken(&hmp->io_token);
821 }
822 
823 void
824 hammer_io_done_interlock(hammer_io_t io)
825 {
826 	KKASSERT(io->modify_refs == -1);
827 	io->modify_refs = 0;
828 	if (io->waitmod) {
829 		io->waitmod = 0;
830 		wakeup(io);
831 	}
832 }
833 
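/*
 * Example (illustrative sketch only, compiled out): how a hypothetical
 * bulk updater brackets its work with the write interlock so that other
 * modifiers stall in hammer_io_modify() and checkwrite cannot race the
 * update.  The helper name is made up.
 */
#if 0
static void
hammer_write_interlock_example(hammer_buffer_t buffer)
{
	hammer_io_write_interlock(&buffer->io);
	/* ... regenerate or patch buffer->ondisk contents here ... */
	hammer_io_done_interlock(&buffer->io);
}
#endif
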
834 /*
835  * Caller intends to modify a volume's ondisk structure.
836  *
837  * This is only allowed if we are the flusher or we have a ref on the
838  * sync_lock.
839  */
840 void
841 hammer_modify_volume(hammer_transaction_t trans, hammer_volume_t volume,
842 		     void *base, int len)
843 {
844 	KKASSERT (trans == NULL || trans->sync_lock_refs > 0);
845 
846 	hammer_io_modify(&volume->io, 1);
847 	if (len) {
848 		intptr_t rel_offset = (intptr_t)base - (intptr_t)volume->ondisk;
849 		KKASSERT((rel_offset & ~(intptr_t)HAMMER_BUFMASK) == 0);
850 		hammer_generate_undo(trans,
851 			 HAMMER_ENCODE_RAW_VOLUME(volume->vol_no, rel_offset),
852 			 base, len);
853 	}
854 }
855 
856 /*
857  * Caller intends to modify a buffer's ondisk structure.
858  *
859  * This is only allowed if we are the flusher or we have a ref on the
860  * sync_lock.
861  */
862 void
863 hammer_modify_buffer(hammer_transaction_t trans, hammer_buffer_t buffer,
864 		     void *base, int len)
865 {
866 	KKASSERT (trans == NULL || trans->sync_lock_refs > 0);
867 
868 	hammer_io_modify(&buffer->io, 1);
869 	if (len) {
870 		intptr_t rel_offset = (intptr_t)base - (intptr_t)buffer->ondisk;
871 		KKASSERT((rel_offset & ~(intptr_t)HAMMER_BUFMASK) == 0);
872 		hammer_generate_undo(trans,
873 				     buffer->zone2_offset + rel_offset,
874 				     base, len);
875 	}
876 }
877 
878 void
879 hammer_modify_volume_done(hammer_volume_t volume)
880 {
881 	hammer_io_modify_done(&volume->io);
882 }
883 
884 void
885 hammer_modify_buffer_done(hammer_buffer_t buffer)
886 {
887 	hammer_io_modify_done(&buffer->io);
888 }
889 
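/*
 * Example (illustrative sketch only, compiled out): the modify bracket a
 * hypothetical caller uses before touching a buffer's on-disk data.  The
 * undo covering the field is generated by hammer_modify_buffer() and the
 * modify_refs count is dropped again by hammer_modify_buffer_done() once
 * the in-memory update is complete.  Only the hammer_modify_*() calls are
 * real; the helper name and field are made up.
 */
#if 0
static void
hammer_modify_field_example(hammer_transaction_t trans,
			    hammer_buffer_t buffer, u_int32_t *fieldp)
{
	/* fieldp must point into buffer->ondisk */
	hammer_modify_buffer(trans, buffer, fieldp, sizeof(*fieldp));
	*fieldp = 0;
	hammer_modify_buffer_done(buffer);
}
#endif
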
890 /*
891  * Mark an entity as not being dirty any more and finalize any
892  * delayed adjustments to the buffer.
893  *
894  * Delayed adjustments are an important performance enhancement, allowing
895  * us to avoid recalculating B-Tree node CRCs over and over again when
896  * making bulk-modifications to the B-Tree.
897  *
898  * If inval is non-zero delayed adjustments are ignored.
899  *
900  * This routine may dereference related btree nodes and cause the
901  * buffer to be dereferenced.  The caller must own a reference on io.
902  */
903 void
904 hammer_io_clear_modify(struct hammer_io *io, int inval)
905 {
906 	hammer_mount_t hmp;
907 
908 	/*
909 	 * io_token is needed to avoid races on mod_list
910 	 */
911 	if (io->modified == 0)
912 		return;
913 	hmp = io->hmp;
914 	lwkt_gettoken(&hmp->io_token);
915 	if (io->modified == 0) {
916 		lwkt_reltoken(&hmp->io_token);
917 		return;
918 	}
919 
920 	/*
921 	 * Take us off the mod-list and clear the modified bit.
922 	 */
923 	KKASSERT(io->mod_list != NULL);
924 	if (io->mod_list == &io->hmp->volu_list ||
925 	    io->mod_list == &io->hmp->meta_list) {
926 		io->hmp->locked_dirty_space -= io->bytes;
927 		atomic_add_int(&hammer_count_dirtybufspace, -io->bytes);
928 	}
929 	TAILQ_REMOVE(io->mod_list, io, mod_entry);
930 	io->mod_list = NULL;
931 	io->modified = 0;
932 
933 	lwkt_reltoken(&hmp->io_token);
934 
935 	/*
936 	 * If this bit is not set there are no delayed adjustments.
937 	 */
938 	if (io->gencrc == 0)
939 		return;
940 	io->gencrc = 0;
941 
942 	/*
943 	 * Finalize requested CRCs.  The NEEDSCRC flag also holds a reference
944 	 * on the node (& underlying buffer).  Release the node after clearing
945 	 * the flag.
946 	 */
947 	if (io->type == HAMMER_STRUCTURE_META_BUFFER) {
948 		hammer_buffer_t buffer = (void *)io;
949 		hammer_node_t node;
950 
951 restart:
952 		TAILQ_FOREACH(node, &buffer->clist, entry) {
953 			if ((node->flags & HAMMER_NODE_NEEDSCRC) == 0)
954 				continue;
955 			node->flags &= ~HAMMER_NODE_NEEDSCRC;
956 			KKASSERT(node->ondisk);
957 			if (inval == 0)
958 				node->ondisk->crc = crc32(&node->ondisk->crc + 1, HAMMER_BTREE_CRCSIZE);
959 			hammer_rel_node(node);
960 			goto restart;
961 		}
962 	}
963 	/* caller must still have ref on io */
964 	KKASSERT(hammer_isactive(&io->lock));
965 }
966 
967 /*
968  * Clear the IO's modify list.  Even though the IO is no longer modified
969  * it may still be on the lose_list.  This routine is called just before
970  * the governing hammer_buffer is destroyed.
971  *
972  * mod_list requires io_token protection.
973  */
974 void
975 hammer_io_clear_modlist(struct hammer_io *io)
976 {
977 	hammer_mount_t hmp = io->hmp;
978 
979 	KKASSERT(io->modified == 0);
980 	if (io->mod_list) {
981 		lwkt_gettoken(&hmp->io_token);
982 		if (io->mod_list) {
983 			KKASSERT(io->mod_list == &io->hmp->lose_list);
984 			TAILQ_REMOVE(io->mod_list, io, mod_entry);
985 			io->mod_list = NULL;
986 		}
987 		lwkt_reltoken(&hmp->io_token);
988 	}
989 }
990 
991 static void
992 hammer_io_set_modlist(struct hammer_io *io)
993 {
994 	struct hammer_mount *hmp = io->hmp;
995 
996 	lwkt_gettoken(&hmp->io_token);
997 	KKASSERT(io->mod_list == NULL);
998 
999 	switch(io->type) {
1000 	case HAMMER_STRUCTURE_VOLUME:
1001 		io->mod_list = &hmp->volu_list;
1002 		hmp->locked_dirty_space += io->bytes;
1003 		atomic_add_int(&hammer_count_dirtybufspace, io->bytes);
1004 		break;
1005 	case HAMMER_STRUCTURE_META_BUFFER:
1006 		io->mod_list = &hmp->meta_list;
1007 		hmp->locked_dirty_space += io->bytes;
1008 		atomic_add_int(&hammer_count_dirtybufspace, io->bytes);
1009 		break;
1010 	case HAMMER_STRUCTURE_UNDO_BUFFER:
1011 		io->mod_list = &hmp->undo_list;
1012 		break;
1013 	case HAMMER_STRUCTURE_DATA_BUFFER:
1014 		io->mod_list = &hmp->data_list;
1015 		break;
1016 	case HAMMER_STRUCTURE_DUMMY:
1017 		panic("hammer_io_set_modlist: bad io type");
1018 		break;
1019 	}
1020 	TAILQ_INSERT_TAIL(io->mod_list, io, mod_entry);
1021 	lwkt_reltoken(&hmp->io_token);
1022 }
1023 
1024 /************************************************************************
1025  *				HAMMER_BIOOPS				*
1026  ************************************************************************
1027  *
1028  */
1029 
1030 /*
1031  * Pre-IO initiation kernel callback - cluster build only
1032  *
1033  * bioops callback - hold io_token
1034  */
1035 static void
1036 hammer_io_start(struct buf *bp)
1037 {
1038 	/* nothing to do, so io_token not needed */
1039 }
1040 
1041 /*
1042  * Post-IO completion kernel callback - MAY BE CALLED FROM INTERRUPT!
1043  *
1044  * NOTE: HAMMER may modify a data buffer after we have initiated write
1045  *	 I/O.
1046  *
1047  * NOTE: MPSAFE callback
1048  *
1049  * bioops callback - hold io_token
1050  */
1051 static void
1052 hammer_io_complete(struct buf *bp)
1053 {
1054 	union hammer_io_structure *iou = (void *)LIST_FIRST(&bp->b_dep);
1055 	struct hammer_mount *hmp = iou->io.hmp;
1056 	struct hammer_io *ionext;
1057 
1058 	lwkt_gettoken(&hmp->io_token);
1059 
1060 	KKASSERT(iou->io.released == 1);
1061 
1062 	/*
1063 	 * Deal with people waiting for I/O to drain
1064 	 */
1065 	if (iou->io.running) {
1066 		/*
1067 		 * Deal with critical write errors.  Once a critical error
1068 		 * has been flagged in hmp the UNDO FIFO will not be updated.
1069 		 * That way crash recovery will give us a consistent
1070 		 * filesystem.
1071 		 *
1072 		 * Because of this we can throw away failed UNDO buffers.  If
1073 		 * we throw away META or DATA buffers we risk corrupting
1074 		 * the now read-only version of the filesystem visible to
1075 		 * the user.  Clear B_ERROR so the buffer is not re-dirtied
1076 		 * by the kernel and ref the io so it doesn't get thrown
1077 		 * away.
1078 		 */
1079 		if (bp->b_flags & B_ERROR) {
1080 			lwkt_gettoken(&hmp->fs_token);
1081 			hammer_critical_error(hmp, NULL, bp->b_error,
1082 					      "while flushing meta-data");
1083 			lwkt_reltoken(&hmp->fs_token);
1084 
1085 			switch(iou->io.type) {
1086 			case HAMMER_STRUCTURE_UNDO_BUFFER:
1087 				break;
1088 			default:
1089 				if (iou->io.ioerror == 0) {
1090 					iou->io.ioerror = 1;
1091 					hammer_ref(&iou->io.lock);
1092 				}
1093 				break;
1094 			}
1095 			bp->b_flags &= ~B_ERROR;
1096 			bundirty(bp);
1097 #if 0
1098 			hammer_io_set_modlist(&iou->io);
1099 			iou->io.modified = 1;
1100 #endif
1101 		}
1102 		hammer_stats_disk_write += iou->io.bytes;
1103 		atomic_add_int(&hammer_count_io_running_write, -iou->io.bytes);
1104 		atomic_add_int(&hmp->io_running_space, -iou->io.bytes);
1105 		if (hmp->io_running_wakeup &&
1106 		    hmp->io_running_space < hammer_limit_running_io / 2) {
1107 		    hmp->io_running_wakeup = 0;
1108 		    wakeup(&hmp->io_running_wakeup);
1109 		}
1110 		KKASSERT(hmp->io_running_space >= 0);
1111 		iou->io.running = 0;
1112 
1113 		/*
1114 		 * Remove from iorun list and wakeup any multi-io waiter(s).
1115 		 */
1116 		if (TAILQ_FIRST(&hmp->iorun_list) == &iou->io) {
1117 			ionext = TAILQ_NEXT(&iou->io, iorun_entry);
1118 			if (ionext && ionext->type == HAMMER_STRUCTURE_DUMMY)
1119 				wakeup(ionext);
1120 		}
1121 		TAILQ_REMOVE(&hmp->iorun_list, &iou->io, iorun_entry);
1122 	} else {
1123 		hammer_stats_disk_read += iou->io.bytes;
1124 	}
1125 
1126 	if (iou->io.waiting) {
1127 		iou->io.waiting = 0;
1128 		wakeup(iou);
1129 	}
1130 
1131 	/*
1132 	 * If B_LOCKED is set someone wanted to deallocate the bp at some
1133 	 * point, try to do it now.  The operation will fail if there are
1134 	 * refs or if hammer_io_deallocate() is unable to gain the
1135 	 * interlock.
1136 	 */
1137 	if (bp->b_flags & B_LOCKED) {
1138 		atomic_add_int(&hammer_count_io_locked, -1);
1139 		bp->b_flags &= ~B_LOCKED;
1140 		hammer_io_deallocate(bp);
1141 		/* structure may be dead now */
1142 	}
1143 	lwkt_reltoken(&hmp->io_token);
1144 }
1145 
1146 /*
1147  * Callback from kernel when it wishes to deallocate a passively
1148  * associated structure.  This mostly occurs with clean buffers
1149  * but it may be possible for a holding structure to be marked dirty
1150  * while its buffer is passively associated.  The caller owns the bp.
1151  *
1152  * If we cannot disassociate we set B_LOCKED to prevent the buffer
1153  * from getting reused.
1154  *
1155  * WARNING: Because this can be called directly by getnewbuf we cannot
1156  * recurse into the tree.  If a bp cannot be immediately disassociated
1157  * our only recourse is to set B_LOCKED.
1158  *
1159  * WARNING: This may be called from an interrupt via hammer_io_complete()
1160  *
1161  * bioops callback - hold io_token
1162  */
1163 static void
1164 hammer_io_deallocate(struct buf *bp)
1165 {
1166 	hammer_io_structure_t iou = (void *)LIST_FIRST(&bp->b_dep);
1167 	hammer_mount_t hmp;
1168 
1169 	hmp = iou->io.hmp;
1170 
1171 	lwkt_gettoken(&hmp->io_token);
1172 
1173 	KKASSERT((bp->b_flags & B_LOCKED) == 0 && iou->io.running == 0);
1174 	if (hammer_try_interlock_norefs(&iou->io.lock) == 0) {
1175 		/*
1176 		 * We cannot safely disassociate a bp from a referenced
1177 		 * or interlocked HAMMER structure.
1178 		 */
1179 		bp->b_flags |= B_LOCKED;
1180 		atomic_add_int(&hammer_count_io_locked, 1);
1181 	} else if (iou->io.modified) {
1182 		/*
1183 		 * It is not legal to disassociate a modified buffer.  This
1184 		 * case really shouldn't ever occur.
1185 		 */
1186 		bp->b_flags |= B_LOCKED;
1187 		atomic_add_int(&hammer_count_io_locked, 1);
1188 		hammer_put_interlock(&iou->io.lock, 0);
1189 	} else {
1190 		/*
1191 		 * Disassociate the BP.  If the io has no refs left we
1192 		 * have to add it to the loose list.  The kernel has
1193 		 * locked the buffer and therefore our io must be
1194 		 * in a released state.
1195 		 */
1196 		hammer_io_disassociate(iou);
1197 		if (iou->io.type != HAMMER_STRUCTURE_VOLUME) {
1198 			KKASSERT(iou->io.bp == NULL);
1199 			KKASSERT(iou->io.mod_list == NULL);
1200 			iou->io.mod_list = &hmp->lose_list;
1201 			TAILQ_INSERT_TAIL(iou->io.mod_list, &iou->io, mod_entry);
1202 		}
1203 		hammer_put_interlock(&iou->io.lock, 1);
1204 	}
1205 	lwkt_reltoken(&hmp->io_token);
1206 }
1207 
1208 /*
1209  * bioops callback - hold io_token
1210  */
1211 static int
1212 hammer_io_fsync(struct vnode *vp)
1213 {
1214 	/* nothing to do, so io_token not needed */
1215 	return(0);
1216 }
1217 
1218 /*
1219  * NOTE: will not be called unless we tell the kernel about the
1220  * bioops.  Unused... we use the mount's VFS_SYNC instead.
1221  *
1222  * bioops callback - hold io_token
1223  */
1224 static int
1225 hammer_io_sync(struct mount *mp)
1226 {
1227 	/* nothing to do, so io_token not needed */
1228 	return(0);
1229 }
1230 
1231 /*
1232  * bioops callback - hold io_token
1233  */
1234 static void
1235 hammer_io_movedeps(struct buf *bp1, struct buf *bp2)
1236 {
1237 	/* nothing to do, so io_token not needed */
1238 }
1239 
1240 /*
1241  * I/O pre-check for reading and writing.  HAMMER only uses this for
1242  * B_CACHE buffers so checkread just shouldn't happen, but if it does
1243  * allow it.
1244  *
1245  * Writing is a different case.  We don't want the kernel to try to write
1246  * out a buffer that HAMMER may be modifying passively or which has a
1247  * dependency.  In addition, kernel-demanded writes can only proceed for
1248  * certain types of buffers (i.e. UNDO and DATA types).  Other dirty
1249  * buffer types can only be explicitly written by the flusher.
1250  *
1251  * checkwrite will only be called for bdwrite()n buffers.  If we return
1252  * success the kernel is guaranteed to initiate the buffer write.
1253  *
1254  * bioops callback - hold io_token
1255  */
1256 static int
1257 hammer_io_checkread(struct buf *bp)
1258 {
1259 	/* nothing to do, so io_token not needed */
1260 	return(0);
1261 }
1262 
1263 /*
1264  * The kernel is asking us whether it can write out a dirty buffer or not.
1265  *
1266  * bioops callback - hold io_token
1267  */
1268 static int
1269 hammer_io_checkwrite(struct buf *bp)
1270 {
1271 	hammer_io_t io = (void *)LIST_FIRST(&bp->b_dep);
1272 	hammer_mount_t hmp = io->hmp;
1273 
1274 	/*
1275 	 * This shouldn't happen under normal operation.
1276 	 */
1277 	lwkt_gettoken(&hmp->io_token);
1278 	if (io->type == HAMMER_STRUCTURE_VOLUME ||
1279 	    io->type == HAMMER_STRUCTURE_META_BUFFER) {
1280 		if (!panicstr)
1281 			panic("hammer_io_checkwrite: illegal buffer");
1282 		if ((bp->b_flags & B_LOCKED) == 0) {
1283 			bp->b_flags |= B_LOCKED;
1284 			atomic_add_int(&hammer_count_io_locked, 1);
1285 		}
1286 		lwkt_reltoken(&hmp->io_token);
1287 		return(1);
1288 	}
1289 
1290 	/*
1291 	 * We have to be able to interlock the IO to safely modify any
1292 	 * of its fields without holding the fs_token.  If we can't lock
1293 	 * it then we are racing someone.
1294 	 *
1295 	 * Our ownership of the bp lock prevents the io from being ripped
1296 	 * out from under us.
1297 	 */
1298 	if (hammer_try_interlock_norefs(&io->lock) == 0) {
1299 		bp->b_flags |= B_LOCKED;
1300 		atomic_add_int(&hammer_count_io_locked, 1);
1301 		lwkt_reltoken(&hmp->io_token);
1302 		return(1);
1303 	}
1304 
1305 	/*
1306 	 * The modified bit must be cleared prior to the initiation of
1307 	 * any IO (returning 0 initiates the IO).  Because this is a
1308 	 * normal data buffer hammer_io_clear_modify() runs through a
1309 	 * simple degenerate case.
1310 	 *
1311 	 * Return 0 will cause the kernel to initiate the IO, and we
1312 	 * must normally clear the modified bit before we begin.  If
1313 	 * the io has modify_refs we do not clear the modified bit,
1314 	 * otherwise we may miss changes.
1315 	 *
1316 	 * Only data and undo buffers can reach here.  These buffers do
1317 	 * not have terminal crc functions but we temporarily reference
1318 	 * the IO anyway, just in case.
1319 	 */
1320 	if (io->modify_refs == 0 && io->modified) {
1321 		hammer_ref(&io->lock);
1322 		hammer_io_clear_modify(io, 0);
1323 		hammer_rel(&io->lock);
1324 	} else if (io->modified) {
1325 		KKASSERT(io->type == HAMMER_STRUCTURE_DATA_BUFFER);
1326 	}
1327 
1328 	/*
1329 	 * The kernel is going to start the IO, set io->running.
1330 	 */
1331 	KKASSERT(io->running == 0);
1332 	io->running = 1;
1333 	atomic_add_int(&io->hmp->io_running_space, io->bytes);
1334 	atomic_add_int(&hammer_count_io_running_write, io->bytes);
1335 	TAILQ_INSERT_TAIL(&io->hmp->iorun_list, io, iorun_entry);
1336 
1337 	hammer_put_interlock(&io->lock, 1);
1338 	lwkt_reltoken(&hmp->io_token);
1339 
1340 	return(0);
1341 }
1342 
1343 /*
1344  * Return non-zero if we wish to delay the kernel's attempt to flush
1345  * this buffer to disk.
1346  *
1347  * bioops callback - hold io_token
1348  */
1349 static int
1350 hammer_io_countdeps(struct buf *bp, int n)
1351 {
1352 	/* nothing to do, so io_token not needed */
1353 	return(0);
1354 }
1355 
1356 struct bio_ops hammer_bioops = {
1357 	.io_start	= hammer_io_start,
1358 	.io_complete	= hammer_io_complete,
1359 	.io_deallocate	= hammer_io_deallocate,
1360 	.io_fsync	= hammer_io_fsync,
1361 	.io_sync	= hammer_io_sync,
1362 	.io_movedeps	= hammer_io_movedeps,
1363 	.io_countdeps	= hammer_io_countdeps,
1364 	.io_checkread	= hammer_io_checkread,
1365 	.io_checkwrite	= hammer_io_checkwrite,
1366 };
1367 
1368 /************************************************************************
1369  *				DIRECT IO OPS 				*
1370  ************************************************************************
1371  *
1372  * These functions operate directly on the buffer cache buffer associated
1373  * with a front-end vnode rather than a back-end device vnode.
1374  */
1375 
1376 /*
1377  * Read a buffer associated with a front-end vnode directly from the
1378  * disk media.  The bio may be issued asynchronously.  If leaf is non-NULL
1379  * we validate the CRC.
1380  *
1381  * We must check for the presence of a HAMMER buffer to handle the case
1382  * where the reblocker has rewritten the data (which it does via the HAMMER
1383  * buffer system, not via the high-level vnode buffer cache), but not yet
1384  * committed the buffer to the media.
1385  */
1386 int
1387 hammer_io_direct_read(hammer_mount_t hmp, struct bio *bio,
1388 		      hammer_btree_leaf_elm_t leaf)
1389 {
1390 	hammer_off_t buf_offset;
1391 	hammer_off_t zone2_offset;
1392 	hammer_volume_t volume;
1393 	struct buf *bp;
1394 	struct bio *nbio;
1395 	int vol_no;
1396 	int error;
1397 
1398 	buf_offset = bio->bio_offset;
1399 	KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) ==
1400 		 HAMMER_ZONE_LARGE_DATA);
1401 
1402 	/*
1403 	 * The buffer cache may have an aliased buffer (the reblocker can
1404 	 * write them).  If it does we have to sync any dirty data before
1405 	 * we can build our direct-read.  This is a non-critical code path.
1406 	 */
1407 	bp = bio->bio_buf;
1408 	hammer_sync_buffers(hmp, buf_offset, bp->b_bufsize);
1409 
1410 	/*
1411 	 * Resolve to a zone-2 offset.  The conversion just requires
1412 	 * munging the top 4 bits but we want to abstract it anyway
1413 	 * so the blockmap code can verify the zone assignment.
1414 	 */
1415 	zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, &error);
1416 	if (error)
1417 		goto done;
1418 	KKASSERT((zone2_offset & HAMMER_OFF_ZONE_MASK) ==
1419 		 HAMMER_ZONE_RAW_BUFFER);
1420 
1421 	/*
1422 	 * Resolve volume and raw-offset for 3rd level bio.  The
1423 	 * offset will be specific to the volume.
1424 	 */
1425 	vol_no = HAMMER_VOL_DECODE(zone2_offset);
1426 	volume = hammer_get_volume(hmp, vol_no, &error);
1427 	if (error == 0 && zone2_offset >= volume->maxbuf_off)
1428 		error = EIO;
1429 
1430 	if (error == 0) {
1431 		/*
1432 		 * 3rd level bio
1433 		 */
1434 		nbio = push_bio(bio);
1435 		nbio->bio_offset = volume->ondisk->vol_buf_beg +
1436 				   (zone2_offset & HAMMER_OFF_SHORT_MASK);
1437 #if 0
1438 		/*
1439 		 * XXX disabled - our CRC check doesn't work if the OS
1440 		 * does bogus_page replacement on the direct-read.
1441 		 */
1442 		if (leaf && hammer_verify_data) {
1443 			nbio->bio_done = hammer_io_direct_read_complete;
1444 			nbio->bio_caller_info1.uvalue32 = leaf->data_crc;
1445 		}
1446 #endif
1447 		hammer_stats_disk_read += bp->b_bufsize;
1448 		vn_strategy(volume->devvp, nbio);
1449 	}
1450 	hammer_rel_volume(volume, 0);
1451 done:
1452 	if (error) {
1453 		kprintf("hammer_direct_read: failed @ %016llx\n",
1454 			(long long)zone2_offset);
1455 		bp->b_error = error;
1456 		bp->b_flags |= B_ERROR;
1457 		biodone(bio);
1458 	}
1459 	return(error);
1460 }
1461 
1462 #if 0
1463 /*
1464  * On completion of the BIO this callback must check the data CRC
1465  * and chain to the previous bio.
1466  *
1467  * MPSAFE - since we do not modify and hammer_records we do not need
1468  *	    io_token.
1469  *
1470  * NOTE: MPSAFE callback
1471  */
1472 static
1473 void
1474 hammer_io_direct_read_complete(struct bio *nbio)
1475 {
1476 	struct bio *obio;
1477 	struct buf *bp;
1478 	u_int32_t rec_crc = nbio->bio_caller_info1.uvalue32;
1479 
1480 	bp = nbio->bio_buf;
1481 	if (crc32(bp->b_data, bp->b_bufsize) != rec_crc) {
1482 		kprintf("HAMMER: data_crc error @%016llx/%d\n",
1483 			nbio->bio_offset, bp->b_bufsize);
1484 		if (hammer_debug_critical)
1485 			Debugger("data_crc on read");
1486 		bp->b_flags |= B_ERROR;
1487 		bp->b_error = EIO;
1488 	}
1489 	obio = pop_bio(nbio);
1490 	biodone(obio);
1491 }
1492 #endif
1493 
1494 /*
1495  * Write a buffer associated with a front-end vnode directly to the
1496  * disk media.  The bio may be issued asynchronously.
1497  *
1498  * The BIO is associated with the specified record and RECG_DIRECT_IO
1499  * is set.  The record is added to its object.
1500  */
1501 int
1502 hammer_io_direct_write(hammer_mount_t hmp, struct bio *bio,
1503 		       hammer_record_t record)
1504 {
1505 	hammer_btree_leaf_elm_t leaf = &record->leaf;
1506 	hammer_off_t buf_offset;
1507 	hammer_off_t zone2_offset;
1508 	hammer_volume_t volume;
1509 	hammer_buffer_t buffer;
1510 	struct buf *bp;
1511 	struct bio *nbio;
1512 	char *ptr;
1513 	int vol_no;
1514 	int error;
1515 
1516 	buf_offset = leaf->data_offset;
1517 
1518 	KKASSERT(buf_offset > HAMMER_ZONE_BTREE);
1519 	KKASSERT(bio->bio_buf->b_cmd == BUF_CMD_WRITE);
1520 
1521 	/*
1522 	 * Issue or execute the I/O.  The new memory record must replace
1523 	 * the old one before the I/O completes; otherwise a reacquisition of
1524 	 * the buffer will load the old media data instead of the new.
1525 	 */
1526 	if ((buf_offset & HAMMER_BUFMASK) == 0 &&
1527 	    leaf->data_len >= HAMMER_BUFSIZE) {
1528 		/*
1529 		 * We are using the vnode's bio to write directly to the
1530 		 * media; any hammer_buffer at the same zone-X offset will
1531 		 * now have stale data.
1532 		 */
1533 		zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, &error);
1534 		vol_no = HAMMER_VOL_DECODE(zone2_offset);
1535 		volume = hammer_get_volume(hmp, vol_no, &error);
1536 
1537 		if (error == 0 && zone2_offset >= volume->maxbuf_off)
1538 			error = EIO;
1539 		if (error == 0) {
1540 			bp = bio->bio_buf;
1541 			KKASSERT((bp->b_bufsize & HAMMER_BUFMASK) == 0);
1542 			/*
1543 			hammer_del_buffers(hmp, buf_offset,
1544 					   zone2_offset, bp->b_bufsize);
1545 			*/
1546 
1547 			/*
1548 			 * Second level bio - cached zone2 offset.
1549 			 *
1550 			 * (We can put our bio_done function in either the
1551 			 *  2nd or 3rd level).
1552 			 */
1553 			nbio = push_bio(bio);
1554 			nbio->bio_offset = zone2_offset;
1555 			nbio->bio_done = hammer_io_direct_write_complete;
1556 			nbio->bio_caller_info1.ptr = record;
1557 			record->zone2_offset = zone2_offset;
1558 			record->gflags |= HAMMER_RECG_DIRECT_IO |
1559 					 HAMMER_RECG_DIRECT_INVAL;
1560 
1561 			/*
1562 			 * Third level bio - raw offset specific to the
1563 			 * correct volume.
1564 			 */
1565 			zone2_offset &= HAMMER_OFF_SHORT_MASK;
1566 			nbio = push_bio(nbio);
1567 			nbio->bio_offset = volume->ondisk->vol_buf_beg +
1568 					   zone2_offset;
1569 			hammer_stats_disk_write += bp->b_bufsize;
1570 			hammer_ip_replace_bulk(hmp, record);
1571 			vn_strategy(volume->devvp, nbio);
1572 			hammer_io_flush_mark(volume);
1573 		}
1574 		hammer_rel_volume(volume, 0);
1575 	} else {
1576 		/*
1577 		 * Must fit in a standard HAMMER buffer.  In this case all
1578 		 * consumers use the HAMMER buffer system and RECG_DIRECT_IO
1579 		 * does not need to be set up.
1580 		 */
1581 		KKASSERT(((buf_offset ^ (buf_offset + leaf->data_len - 1)) & ~HAMMER_BUFMASK64) == 0);
1582 		buffer = NULL;
1583 		ptr = hammer_bread(hmp, buf_offset, &error, &buffer);
1584 		if (error == 0) {
1585 			bp = bio->bio_buf;
1586 			bp->b_flags |= B_AGE;
1587 			hammer_io_modify(&buffer->io, 1);
1588 			bcopy(bp->b_data, ptr, leaf->data_len);
1589 			hammer_io_modify_done(&buffer->io);
1590 			hammer_rel_buffer(buffer, 0);
1591 			bp->b_resid = 0;
1592 			hammer_ip_replace_bulk(hmp, record);
1593 			biodone(bio);
1594 		}
1595 	}
1596 	if (error) {
1597 		/*
1598 		 * Major suckage occurred.  Also note:  The record was
1599 		 * never added to the tree so we do not have to worry
1600 		 * about the backend.
1601 		 */
1602 		kprintf("hammer_direct_write: failed @ %016llx\n",
1603 			(long long)leaf->data_offset);
1604 		bp = bio->bio_buf;
1605 		bp->b_resid = 0;
1606 		bp->b_error = EIO;
1607 		bp->b_flags |= B_ERROR;
1608 		biodone(bio);
1609 		record->flags |= HAMMER_RECF_DELETED_FE;
1610 		hammer_rel_mem_record(record);
1611 	}
1612 	return(error);
1613 }
1614 
1615 /*
1616  * On completion of the BIO this callback must disconnect
1617  * it from the hammer_record and chain to the previous bio.
1618  *
1619  * An I/O error forces the mount to read-only.  Data buffers
1620  * are not B_LOCKED like meta-data buffers are, so we have to
1621  * throw the buffer away to prevent the kernel from retrying.
1622  *
1623  * NOTE: MPSAFE callback, only modify fields we have explicit
1624  *	 access to (the bp and the record->gflags).
1625  */
1626 static
1627 void
1628 hammer_io_direct_write_complete(struct bio *nbio)
1629 {
1630 	struct bio *obio;
1631 	struct buf *bp;
1632 	hammer_record_t record;
1633 	hammer_mount_t hmp;
1634 
1635 	record = nbio->bio_caller_info1.ptr;
1636 	KKASSERT(record != NULL);
1637 	hmp = record->ip->hmp;
1638 
1639 	lwkt_gettoken(&hmp->io_token);
1640 
1641 	bp = nbio->bio_buf;
1642 	obio = pop_bio(nbio);
1643 	if (bp->b_flags & B_ERROR) {
1644 		lwkt_gettoken(&hmp->fs_token);
1645 		hammer_critical_error(hmp, record->ip,
1646 				      bp->b_error,
1647 				      "while writing bulk data");
1648 		lwkt_reltoken(&hmp->fs_token);
1649 		bp->b_flags |= B_INVAL;
1650 	}
1651 	biodone(obio);
1652 
1653 	KKASSERT(record->gflags & HAMMER_RECG_DIRECT_IO);
1654 	if (record->gflags & HAMMER_RECG_DIRECT_WAIT) {
1655 		record->gflags &= ~(HAMMER_RECG_DIRECT_IO |
1656 				    HAMMER_RECG_DIRECT_WAIT);
1657 		/* record can disappear once DIRECT_IO flag is cleared */
1658 		wakeup(&record->flags);
1659 	} else {
1660 		record->gflags &= ~HAMMER_RECG_DIRECT_IO;
1661 		/* record can disappear once DIRECT_IO flag is cleared */
1662 	}
1663 	lwkt_reltoken(&hmp->io_token);
1664 }
1665 
1666 
1667 /*
1668  * This is called before a record is either committed to the B-Tree
1669  * or destroyed, to resolve any associated direct-IO.
1670  *
1671  * (1) We must wait for any direct-IO related to the record to complete.
1672  *
1673  * (2) We must remove any buffer cache aliases for data accessed via
1674  *     leaf->data_offset or zone2_offset so non-direct-IO consumers
1675  *     (the mirroring and reblocking code) do not see stale data.
1676  */
1677 void
1678 hammer_io_direct_wait(hammer_record_t record)
1679 {
1680 	hammer_mount_t hmp = record->ip->hmp;
1681 
1682 	/*
1683 	 * Wait for I/O to complete
1684 	 */
1685 	if (record->gflags & HAMMER_RECG_DIRECT_IO) {
1686 		lwkt_gettoken(&hmp->io_token);
1687 		while (record->gflags & HAMMER_RECG_DIRECT_IO) {
1688 			record->gflags |= HAMMER_RECG_DIRECT_WAIT;
1689 			tsleep(&record->flags, 0, "hmdiow", 0);
1690 		}
1691 		lwkt_reltoken(&hmp->io_token);
1692 	}
1693 
1694 	/*
1695 	 * Invalidate any related buffer cache aliases associated with the
1696 	 * backing device.  This is needed because the buffer cache buffer
1697 	 * for file data is associated with the file vnode, not the backing
1698 	 * device vnode.
1699 	 *
1700 	 * XXX I do not think this case can occur any more now that
1701 	 * reservations ensure that all such buffers are removed before
1702 	 * an area can be reused.
1703 	 */
1704 	if (record->gflags & HAMMER_RECG_DIRECT_INVAL) {
1705 		KKASSERT(record->leaf.data_offset);
1706 		hammer_del_buffers(hmp, record->leaf.data_offset,
1707 				   record->zone2_offset, record->leaf.data_len,
1708 				   1);
1709 		record->gflags &= ~HAMMER_RECG_DIRECT_INVAL;
1710 	}
1711 }
1712 
1713 /*
1714  * This is called to remove the second-level cached zone-2 offset from
1715  * frontend buffer cache buffers, now stale due to a data relocation.
1716  * These offsets are generated by cluster_read() via VOP_BMAP, or directly
1717  * by hammer_vop_strategy_read().
1718  *
1719  * This is rather nasty because here we have something like the reblocker
1720  * scanning the raw B-Tree with no held references on anything, really,
1721  * other than a shared lock on the B-Tree node, and we have to access the
1722  * frontend's buffer cache to check for and clean out the association.
1723  * Specifically, if the reblocker is moving data on the disk, these cached
1724  * offsets will become invalid.
1725  *
1726  * Only data record types associated with the large-data zone are subject
1727  * to direct-io and need to be checked.
1728  *
1729  */
1730 void
1731 hammer_io_direct_uncache(hammer_mount_t hmp, hammer_btree_leaf_elm_t leaf)
1732 {
1733 	struct hammer_inode_info iinfo;
1734 	int zone;
1735 
1736 	if (leaf->base.rec_type != HAMMER_RECTYPE_DATA)
1737 		return;
1738 	zone = HAMMER_ZONE_DECODE(leaf->data_offset);
1739 	if (zone != HAMMER_ZONE_LARGE_DATA_INDEX)
1740 		return;
1741 	iinfo.obj_id = leaf->base.obj_id;
1742 	iinfo.obj_asof = 0;	/* unused */
1743 	iinfo.obj_localization = leaf->base.localization &
1744 				 HAMMER_LOCALIZE_PSEUDOFS_MASK;
1745 	iinfo.u.leaf = leaf;
1746 	hammer_scan_inode_snapshots(hmp, &iinfo,
1747 				    hammer_io_direct_uncache_callback,
1748 				    leaf);
1749 }
1750 
1751 static int
1752 hammer_io_direct_uncache_callback(hammer_inode_t ip, void *data)
1753 {
1754 	hammer_inode_info_t iinfo = data;
1755 	hammer_off_t data_offset;
1756 	hammer_off_t file_offset;
1757 	struct vnode *vp;
1758 	struct buf *bp;
1759 	int blksize;
1760 
1761 	if (ip->vp == NULL)
1762 		return(0);
1763 	data_offset = iinfo->u.leaf->data_offset;
1764 	file_offset = iinfo->u.leaf->base.key - iinfo->u.leaf->data_len;
1765 	blksize = iinfo->u.leaf->data_len;
1766 	KKASSERT((blksize & HAMMER_BUFMASK) == 0);
1767 
1768 	/*
1769 	 * Warning: FINDBLK_TEST returns stable storage but not stable
1770 	 *	    contents.  It happens to be ok in this case.
1771 	 */
1772 	hammer_ref(&ip->lock);
1773 	if (hammer_get_vnode(ip, &vp) == 0) {
1774 		if ((bp = findblk(ip->vp, file_offset, FINDBLK_TEST)) != NULL &&
1775 		    bp->b_bio2.bio_offset != NOOFFSET) {
1776 			bp = getblk(ip->vp, file_offset, blksize, 0, 0);
1777 			bp->b_bio2.bio_offset = NOOFFSET;
1778 			brelse(bp);
1779 		}
1780 		vput(vp);
1781 	}
1782 	hammer_rel_inode(ip, 0);
1783 	return(0);
1784 }
1785 
1786 
1787 /*
1788  * This function is called when writes may have occurred on the volume,
1789  * indicating that the device may be holding cached writes.
1790  */
1791 static void
1792 hammer_io_flush_mark(hammer_volume_t volume)
1793 {
1794 	atomic_set_int(&volume->vol_flags, HAMMER_VOLF_NEEDFLUSH);
1795 }
1796 
1797 /*
1798  * This function ensures that the device has flushed any cached writes out.
1799  */
1800 void
1801 hammer_io_flush_sync(hammer_mount_t hmp)
1802 {
1803 	hammer_volume_t volume;
1804 	struct buf *bp_base = NULL;
1805 	struct buf *bp;
1806 
1807 	RB_FOREACH(volume, hammer_vol_rb_tree, &hmp->rb_vols_root) {
1808 		if (volume->vol_flags & HAMMER_VOLF_NEEDFLUSH) {
1809 			atomic_clear_int(&volume->vol_flags,
1810 					 HAMMER_VOLF_NEEDFLUSH);
1811 			bp = getpbuf(NULL);
1812 			bp->b_bio1.bio_offset = 0;
1813 			bp->b_bufsize = 0;
1814 			bp->b_bcount = 0;
1815 			bp->b_cmd = BUF_CMD_FLUSH;
1816 			bp->b_bio1.bio_caller_info1.cluster_head = bp_base;
1817 			bp->b_bio1.bio_done = biodone_sync;
1818 			bp->b_bio1.bio_flags |= BIO_SYNC;
1819 			bp_base = bp;
1820 			vn_strategy(volume->devvp, &bp->b_bio1);
1821 		}
1822 	}
1823 	while ((bp = bp_base) != NULL) {
1824 		bp_base = bp->b_bio1.bio_caller_info1.cluster_head;
1825 		biowait(&bp->b_bio1, "hmrFLS");
1826 		relpbuf(bp, NULL);
1827 	}
1828 }
1829 
1830 /*
1831  * Limit the amount of backlog which we allow to build up
1832  */
1833 void
1834 hammer_io_limit_backlog(hammer_mount_t hmp)
1835 {
1836 	while (hmp->io_running_space > hammer_limit_running_io) {
1837 		hmp->io_running_wakeup = 1;
1838 		tsleep(&hmp->io_running_wakeup, 0, "hmiolm", hz / 10);
1839 	}
1840 }
1841
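/*
 * Example (illustrative sketch only, compiled out): a hypothetical flush
 * path throttling itself against the running-I/O backlog before queueing
 * a modified buffer with hammer_io_flush().  The helper name is made up
 * and the io locking/refcount requirements documented above
 * hammer_io_flush() are assumed to be satisfied by the caller.
 */
#if 0
static void
hammer_throttled_flush_example(hammer_mount_t hmp, hammer_buffer_t buffer)
{
	/* sleep while io_running_space exceeds hammer_limit_running_io */
	hammer_io_limit_backlog(hmp);
	hammer_io_flush(&buffer->io, 0);
}
#endif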