xref: /dragonfly/sys/vfs/hammer2/hammer2_io.c (revision 8835adf8)
1 /*
2  * Copyright (c) 2013-2014 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include "hammer2.h"
36 
37 /*
38  * Implements an abstraction layer for synchronous and asynchronous
39  * buffered device I/O.  It can be used for OS abstraction, but its main
40  * purpose is to allow larger device buffers to back hammer2_chains that
41  * use smaller allocations, without causing deadlocks.
42  *
43  */
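
/*
 * Typical synchronous usage, as an illustrative sketch only (error
 * handling trimmed; the iocb-based helpers below show the real flow):
 *
 *	hammer2_io_t *dio;
 *	char *data;
 *
 *	if (hammer2_io_bread(hmp, btype, lbase, lsize, &dio) == 0) {
 *		data = hammer2_io_data(dio, lbase);
 *		(read or modify the data, calling hammer2_io_setdirty(dio)
 *		 if it was modified)
 *	}
 *	if (dio)
 *		hammer2_io_bqrelse(&dio);
 */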
44 static int hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg);
45 static void dio_write_stats_update(hammer2_io_t *dio);
46 
47 static int
48 hammer2_io_cmp(hammer2_io_t *io1, hammer2_io_t *io2)
49 {
50 	if (io1->pbase < io2->pbase)
51 		return(-1);
52 	if (io1->pbase > io2->pbase)
53 		return(1);
54 	return(0);
55 }
56 
57 RB_PROTOTYPE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp, off_t);
58 RB_GENERATE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp,
59 		off_t, pbase);
60 
61 struct hammer2_cleanupcb_info {
62 	struct hammer2_io_tree tmptree;
63 	int	count;
64 };
65 
66 static __inline
67 uint64_t
68 hammer2_io_mask(hammer2_io_t *dio, hammer2_off_t off, u_int bytes)
69 {
70 	uint64_t mask;
71 	int i;
72 
73 	if (bytes < 1024)	/* smaller chunks not supported */
74 		return 0;
75 
76 	/*
77 	 * Calculate crc check mask for larger chunks
78 	 */
79 	i = (((off & ~HAMMER2_OFF_MASK_RADIX) - dio->pbase) &
80 	     HAMMER2_PBUFMASK) >> 10;
81 	if (i == 0 && bytes == HAMMER2_PBUFSIZE)
82 		return((uint64_t)-1);
83 	mask = ((uint64_t)1U << (bytes >> 10)) - 1;
84 	mask <<= i;
85 
86 	return mask;
87 }
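
/*
 * Worked example for the mask calculation above (assuming
 * HAMMER2_PBUFSIZE is 64KB; each mask bit covers 1KB per the >> 10):
 * a 16KB chunk starting 32KB into its 64KB device buffer gives i = 32
 * and mask = ((1 << 16) - 1) << 32 = 0x0000ffff00000000.
 */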
88 
89 #define HAMMER2_GETBLK_GOOD	0
90 #define HAMMER2_GETBLK_QUEUED	1
91 #define HAMMER2_GETBLK_OWNED	2
92 
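/*
 * dio->refs packs a reference count (the HAMMER2_DIO_MASK bits) together
 * with state flags (see the HAMMER2_DIO_GOOD, INPROG, WAITING and DIRTY
 * definitions in the headers).  All state transitions are made with
 * atomic_cmpset_int() loops, so the hot paths need no lock; dio->spin
 * mainly interlocks the iocb queue.
 */
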
93 /*
94  * Allocate/Locate the requested dio, reference it, issue or queue iocb.
95  */
96 void
97 hammer2_io_getblk(hammer2_dev_t *hmp, off_t lbase, int lsize,
98 		  hammer2_iocb_t *iocb)
99 {
100 	hammer2_io_t *dio;
101 	hammer2_io_t *xio;
102 	off_t pbase;
103 	off_t pmask;
104 	/*
105 	 * XXX after a free, buffer reuse with a different size can clash
106 	 * with the dio cache.  Avoid that for now.  Ultimately we need to
107 	 * invalidate the dio cache when freeing blocks to allow a mix
108 	 * of 16KB and 64KB block sizes.
109 	 */
110 	/*int psize = hammer2_devblksize(lsize);*/
111 	int psize = HAMMER2_PBUFSIZE;
112 	int refs;
113 
114 	pmask = ~(hammer2_off_t)(psize - 1);
115 
116 	KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize);
117 	lbase &= ~HAMMER2_OFF_MASK_RADIX;
118 	pbase = lbase & pmask;
119 	KKASSERT(pbase != 0 && ((lbase + lsize - 1) & pmask) == pbase);
120 
121 	/*
122 	 * Access/Allocate the DIO, bump dio->refs to prevent destruction.
123 	 */
124 	hammer2_spin_sh(&hmp->io_spin);
125 	dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, pbase);
126 	if (dio) {
127 		if ((atomic_fetchadd_int(&dio->refs, 1) &
128 		     HAMMER2_DIO_MASK) == 0) {
129 			atomic_add_int(&dio->hmp->iofree_count, -1);
130 		}
131 		hammer2_spin_unsh(&hmp->io_spin);
132 	} else {
133 		hammer2_spin_unsh(&hmp->io_spin);
134 		dio = kmalloc(sizeof(*dio), M_HAMMER2, M_INTWAIT | M_ZERO);
135 		dio->hmp = hmp;
136 		dio->pbase = pbase;
137 		dio->psize = psize;
138 		dio->btype = iocb->btype;
139 		dio->refs = 1;
140 		hammer2_spin_init(&dio->spin, "h2dio");
141 		TAILQ_INIT(&dio->iocbq);
142 		hammer2_spin_ex(&hmp->io_spin);
143 		xio = RB_INSERT(hammer2_io_tree, &hmp->iotree, dio);
144 		if (xio == NULL) {
145 			atomic_add_int(&hammer2_dio_count, 1);
146 			hammer2_spin_unex(&hmp->io_spin);
147 		} else {
148 			if ((atomic_fetchadd_int(&xio->refs, 1) &
149 			     HAMMER2_DIO_MASK) == 0) {
150 				atomic_add_int(&xio->hmp->iofree_count, -1);
151 			}
152 			hammer2_spin_unex(&hmp->io_spin);
153 			kfree(dio, M_HAMMER2);
154 			dio = xio;
155 		}
156 	}
157 
158 	/*
159 	 * Obtain/Validate the buffer.
160 	 */
161 	iocb->dio = dio;
162 
163 	if (dio->act < 5)	/* SMP race ok */
164 		++dio->act;
165 
166 	for (;;) {
167 		refs = dio->refs;
168 		cpu_ccfence();
169 
170 		/*
171 		 * Issue the iocb immediately if the buffer is already good.
172 		 * Once set GOOD cannot be cleared until refs drops to 0.
173 		 *
174 		 * lfence required because dio's are not interlocked for
175 		 * the DIO_GOOD test.
176 		 */
177 		if (refs & HAMMER2_DIO_GOOD) {
178 			cpu_lfence();
179 			iocb->callback(iocb);
180 			break;
181 		}
182 
183 		/*
184 		 * Try to own the DIO by setting INPROG so we can issue
185 		 * I/O on it.
186 		 */
187 		if (refs & HAMMER2_DIO_INPROG) {
188 			/*
189 			 * If DIO_INPROG is already set then set WAITING and
190 			 * queue the iocb.
191 			 */
192 			hammer2_spin_ex(&dio->spin);
193 			if (atomic_cmpset_int(&dio->refs, refs,
194 					      refs | HAMMER2_DIO_WAITING)) {
195 				iocb->flags |= HAMMER2_IOCB_ONQ |
196 					       HAMMER2_IOCB_INPROG;
197 				TAILQ_INSERT_TAIL(&dio->iocbq, iocb, entry);
198 				hammer2_spin_unex(&dio->spin);
199 				break;
200 			}
201 			hammer2_spin_unex(&dio->spin);
202 			/* retry */
203 		} else {
204 			/*
205 			 * If DIO_INPROG is not set then set it and issue the
206 			 * callback immediately to start I/O.
207 			 */
208 			if (atomic_cmpset_int(&dio->refs, refs,
209 					      refs | HAMMER2_DIO_INPROG)) {
210 				iocb->flags |= HAMMER2_IOCB_INPROG;
211 				iocb->callback(iocb);
212 				break;
213 			}
214 			/* retry */
215 		}
216 		/* retry */
217 	}
218 }
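
/*
 * Note that hammer2_io_getblk() may return before the iocb has completed
 * (the iocb may have been queued behind an in-progress I/O).  Callers
 * needing synchronous completion test HAMMER2_IOCB_DONE and call
 * hammer2_iocb_wait(), as the bread/new helpers below do.
 */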
219 
220 /*
221  * Quickly obtain a good DIO buffer, return NULL if the system no longer
222  * caches the data.
223  */
224 hammer2_io_t *
225 hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase, int lsize)
226 {
227 	hammer2_iocb_t iocb;
228 	hammer2_io_t *dio;
229 	struct buf *bp;
230 	off_t pbase;
231 	off_t pmask;
232 	int psize = HAMMER2_PBUFSIZE;
233 	int orefs;
234 	int nrefs;
235 
236 	pmask = ~(hammer2_off_t)(psize - 1);
237 
238 	KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize);
239 	lbase &= ~HAMMER2_OFF_MASK_RADIX;
240 	pbase = lbase & pmask;
241 	KKASSERT(pbase != 0 && ((lbase + lsize - 1) & pmask) == pbase);
242 
243 	/*
244 	 * Access/Allocate the DIO, bump dio->refs to prevent destruction.
245 	 */
246 	hammer2_spin_sh(&hmp->io_spin);
247 	dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, pbase);
248 	if (dio == NULL) {
249 		hammer2_spin_unsh(&hmp->io_spin);
250 		return NULL;
251 	}
252 
253 	if ((atomic_fetchadd_int(&dio->refs, 1) & HAMMER2_DIO_MASK) == 0)
254 		atomic_add_int(&dio->hmp->iofree_count, -1);
255 	hammer2_spin_unsh(&hmp->io_spin);
256 
257 	if (dio->act < 5)	/* SMP race ok */
258 		++dio->act;
259 
260 	/*
261 	 * Obtain/validate the buffer.  Do NOT issue I/O.  Discard if
262 	 * the system does not have the data already cached.
263 	 */
264 	nrefs = -1;
265 	for (;;) {
266 		orefs = dio->refs;
267 		cpu_ccfence();
268 
269 		/*
270 		 * The dio can be used as-is if the buffer is already good.
271 		 * Once set, GOOD cannot be cleared until refs drops to 0.
272 		 *
273 		 * lfence required because dio is not interlocked for
274 		 * the DIO_GOOD test.
275 		 */
276 		if (orefs & HAMMER2_DIO_GOOD) {
277 			cpu_lfence();
278 			break;
279 		}
280 
281 		/*
282 		 * Try to own the DIO by setting INPROG so we can issue
283 		 * I/O on it.  INPROG might already be set, in which case
284 		 * there is no way we can do this non-blocking so we punt.
285 		 */
286 		if ((orefs & HAMMER2_DIO_INPROG))
287 			break;
288 		nrefs = orefs | HAMMER2_DIO_INPROG;
289 		if (atomic_cmpset_int(&dio->refs, orefs, nrefs) == 0)
290 			continue;
291 
292 		/*
293 		 * We own DIO_INPROG; try to set DIO_GOOD.
294 		 *
295 		 * For now do not use GETBLK_NOWAIT; use a blocking bread().
296 		 */
297 		bp = dio->bp;
298 		dio->bp = NULL;
299 		if (bp == NULL) {
300 #if 0
301 			bp = getblk(hmp->devvp, dio->pbase, dio->psize, 0, 0);
302 #endif
303 			bread(hmp->devvp, dio->pbase, dio->psize, &bp);
304 		}
305 		if (bp) {
306 			if ((bp->b_flags & B_ERROR) == 0 &&
307 			    (bp->b_flags & B_CACHE)) {
308 				dio->bp = bp;	/* assign BEFORE setting flag */
309 				atomic_set_int(&dio->refs, HAMMER2_DIO_GOOD);
310 			} else {
311 				bqrelse(bp);
312 				bp = NULL;
313 			}
314 		}
315 
316 		/*
317 		 * Clear DIO_INPROG.
318 		 *
319 		 * This is actually a bit complicated, see
320 		 * hammer2_io_complete() for more information.
321 		 */
322 		iocb.dio = dio;
323 		iocb.flags = HAMMER2_IOCB_INPROG;
324 		hammer2_io_complete(&iocb);
325 		break;
326 	}
327 
328 	/*
329 	 * Only return the dio if its buffer is good.
330 	 */
331 	if ((dio->refs & HAMMER2_DIO_GOOD) == 0) {
332 		hammer2_io_putblk(&dio);
333 	}
334 	return dio;
335 }
336 
337 /*
338  * The originator of the iocb is finished with it.
339  */
340 void
341 hammer2_io_complete(hammer2_iocb_t *iocb)
342 {
343 	hammer2_io_t *dio = iocb->dio;
344 	hammer2_iocb_t *cbtmp;
345 	uint32_t orefs;
346 	uint32_t nrefs;
347 	uint32_t oflags;
348 	uint32_t nflags;
349 
350 	/*
351 	 * If IOCB_INPROG was not set, completion is synchronous due to the
352 	 * buffer already being good.  We can simply set IOCB_DONE and return.
353 	 * In this situation DIO_INPROG is not set and we have no visibility
354 	 * on dio->bp.
355 	 */
356 	if ((iocb->flags & HAMMER2_IOCB_INPROG) == 0) {
357 		atomic_set_int(&iocb->flags, HAMMER2_IOCB_DONE);
358 		return;
359 	}
360 
361 	/*
362 	 * The iocb was queued, obtained DIO_INPROG, and its callback was
363 	 * made.  The callback is now complete.  We still own DIO_INPROG.
364 	 *
365 	 * We can set DIO_GOOD if no error occurred, which gives certain
366 	 * stability guarantees to dio->bp and allows other accessors to
367 	 * short-cut access.  DIO_GOOD cannot be cleared until the last
368 	 * ref is dropped.
369 	 */
370 	KKASSERT(dio->refs & HAMMER2_DIO_INPROG);
371 	if (dio->bp) {
372 		BUF_KERNPROC(dio->bp);
373 		if ((dio->bp->b_flags & B_ERROR) == 0) {
374 			KKASSERT(dio->bp->b_flags & B_CACHE);
375 			atomic_set_int(&dio->refs, HAMMER2_DIO_GOOD);
376 		}
377 	}
378 
379 	/*
380 	 * Clean up the dio before marking the iocb as being done.  If another
381 	 * iocb is pending we chain to it while leaving DIO_INPROG set (it
382 	 * will call io completion and presumably clear DIO_INPROG).
383 	 *
384 	 * Otherwise if no other iocbs are pending we clear DIO_INPROG before
385 	 * finishing up the cbio.  This means that DIO_INPROG is cleared at
386 	 * the end of the chain before ANY of the cbios are marked done.
387 	 *
388 	 * NOTE: The TAILQ is not stable until the spin-lock is held.
389 	 */
390 	for (;;) {
391 		orefs = dio->refs;
392 		nrefs = orefs & ~(HAMMER2_DIO_WAITING | HAMMER2_DIO_INPROG);
393 
394 		if (orefs & HAMMER2_DIO_WAITING) {
395 			hammer2_spin_ex(&dio->spin);
396 			cbtmp = TAILQ_FIRST(&dio->iocbq);
397 			if (cbtmp) {
398 				/*
399 				 * NOTE: flags not adjusted in this case.
400 				 *	 Flags will be adjusted by the last
401 				 *	 iocb.
402 				 */
403 				TAILQ_REMOVE(&dio->iocbq, cbtmp, entry);
404 				hammer2_spin_unex(&dio->spin);
405 				cbtmp->callback(cbtmp);	/* chained */
406 				break;
407 			} else if (atomic_cmpset_int(&dio->refs,
408 						     orefs, nrefs)) {
409 				hammer2_spin_unex(&dio->spin);
410 				break;
411 			}
412 			hammer2_spin_unex(&dio->spin);
413 			/* retry */
414 		} else if (atomic_cmpset_int(&dio->refs, orefs, nrefs)) {
415 			break;
416 		} /* else retry */
417 		/* retry */
418 	}
419 
420 	/*
421 	 * Mark the iocb as done and wakeup any waiters.  This is done after
422 	 * all iocb chains have been called back and after DIO_INPROG has been
423 	 * cleared.  This avoids races against ref count drops by the waiting
424 	 * threads (a hard but not impossible SMP race) which might result in
425 	 * a 1->0 transition of the refs while DIO_INPROG is still set.
426 	 */
427 	for (;;) {
428 		oflags = iocb->flags;
429 		cpu_ccfence();
430 		nflags = oflags;
431 		nflags &= ~(HAMMER2_IOCB_WAKEUP | HAMMER2_IOCB_INPROG);
432 		nflags |= HAMMER2_IOCB_DONE;
433 
434 		if (atomic_cmpset_int(&iocb->flags, oflags, nflags)) {
435 			if (oflags & HAMMER2_IOCB_WAKEUP)
436 				wakeup(iocb);
437 			/* SMP: iocb is now stale */
438 			break;
439 		}
440 		/* retry */
441 	}
442 	iocb = NULL;
443 
444 }
445 
446 /*
447  * Wait for an iocb's I/O to finish.
448  */
449 void
450 hammer2_iocb_wait(hammer2_iocb_t *iocb)
451 {
452 	uint32_t oflags;
453 	uint32_t nflags;
454 
455 	for (;;) {
456 		oflags = iocb->flags;
457 		cpu_ccfence();
458 		nflags = oflags | HAMMER2_IOCB_WAKEUP;
459 		if (oflags & HAMMER2_IOCB_DONE)
460 			break;
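		/*
		 * Interlock the sleep before publishing IOCB_WAKEUP so a
		 * wakeup() issued by hammer2_io_complete() between the
		 * cmpset below and the tsleep() is not lost.
		 */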
461 		tsleep_interlock(iocb, 0);
462 		if (atomic_cmpset_int(&iocb->flags, oflags, nflags)) {
463 			tsleep(iocb, PINTERLOCKED, "h2iocb", hz);
464 		}
465 	}
466 
467 }
468 
469 /*
470  * Release our ref on *diop.
471  *
472  * On the last ref we must atomically clear DIO_GOOD and set DIO_INPROG,
473  * then dispose of the underlying buffer.
474  */
475 void
476 hammer2_io_putblk(hammer2_io_t **diop)
477 {
478 	hammer2_dev_t *hmp;
479 	hammer2_io_t *dio;
480 	hammer2_iocb_t iocb;
481 	struct buf *bp;
482 	off_t peof;
483 	off_t pbase;
484 	int psize;
485 	int orefs;
486 	int nrefs;
487 
488 	dio = *diop;
489 	*diop = NULL;
490 	hmp = dio->hmp;
491 
492 	/*
493 	 * Drop refs.
494 	 *
495 	 * On the 1->0 transition clear flags and set INPROG.
496 	 *
497 	 * On the 1->0 transition if INPROG is already set, another thread
498 	 * is in lastdrop and we can just return after the transition.
499 	 *
500 	 * On any other transition we can generally just return.
501 	 */
502 	for (;;) {
503 		orefs = dio->refs;
504 		cpu_ccfence();
505 		nrefs = orefs - 1;
506 
507 		if ((orefs & HAMMER2_DIO_MASK) == 1 &&
508 		    (orefs & HAMMER2_DIO_INPROG) == 0) {
509 			/*
510 			 * Lastdrop case, INPROG can be set.
511 			 */
512 			nrefs &= ~(HAMMER2_DIO_GOOD | HAMMER2_DIO_DIRTY);
513 			nrefs |= HAMMER2_DIO_INPROG;
514 			if (atomic_cmpset_int(&dio->refs, orefs, nrefs))
515 				break;
516 		} else if ((orefs & HAMMER2_DIO_MASK) == 1) {
517 			/*
518 			 * Lastdrop case, INPROG already set.
519 			 */
520 			if (atomic_cmpset_int(&dio->refs, orefs, nrefs)) {
521 				atomic_add_int(&hmp->iofree_count, 1);
522 				return;
523 			}
524 		} else {
525 			/*
526 			 * Normal drop case.
527 			 */
528 			if (atomic_cmpset_int(&dio->refs, orefs, nrefs))
529 				return;
530 		}
531 		cpu_pause();
532 		/* retry */
533 	}
534 
535 	/*
536 	 * Lastdrop (1->0 transition).  INPROG has been set, GOOD and DIRTY
537 	 * have been cleared.
538 	 *
539 	 * We can now dispose of the buffer, and should do it before calling
540 	 * io_complete() in case there's a race against a new reference
541 	 * which causes io_complete() to chain and instantiate the bp again.
542 	 */
543 	pbase = dio->pbase;
544 	psize = dio->psize;
545 	bp = dio->bp;
546 	dio->bp = NULL;
547 
548 	if (orefs & HAMMER2_DIO_GOOD) {
549 		KKASSERT(bp != NULL);
550 		if (orefs & HAMMER2_DIO_DIRTY) {
551 			int hce;
552 
553 			dio_write_stats_update(dio);
554 			if ((hce = hammer2_cluster_enable) > 0) {
555 				peof = (pbase + HAMMER2_SEGMASK64) &
556 				       ~HAMMER2_SEGMASK64;
557 				cluster_write(bp, peof, psize, hce);
558 			} else {
559 				bp->b_flags |= B_CLUSTEROK;
560 				bdwrite(bp);
561 			}
562 		} else if (bp->b_flags & (B_ERROR | B_INVAL | B_RELBUF)) {
563 			brelse(bp);
564 		} else {
565 			bqrelse(bp);
566 		}
567 	} else if (bp) {
568 		if (orefs & HAMMER2_DIO_DIRTY) {
569 			dio_write_stats_update(dio);
570 			bdwrite(bp);
571 		} else {
572 			brelse(bp);
573 		}
574 	}
575 
576 	/*
577 	 * The instant we call io_complete() the dio is a free agent again
578 	 * and can be ripped out from under us.
579 	 *
580 	 * We clean up our final DIO_INPROG by simulating an iocb
581 	 * completion.
582 	 */
583 	hmp = dio->hmp;				/* extract fields */
584 	atomic_add_int(&hmp->iofree_count, 1);
585 	cpu_ccfence();
586 
587 	iocb.dio = dio;
588 	iocb.flags = HAMMER2_IOCB_INPROG;
589 	hammer2_io_complete(&iocb);
590 	dio = NULL;				/* dio stale */
591 
592 	/*
593 	 * We cache free buffers so re-use cases can use a shared lock, but
594 	 * if too many build up we have to clean them out.
595 	 */
596 	if (hmp->iofree_count > 65536) {
597 		struct hammer2_cleanupcb_info info;
598 
599 		RB_INIT(&info.tmptree);
600 		hammer2_spin_ex(&hmp->io_spin);
601 		if (hmp->iofree_count > 65536) {
602 			info.count = hmp->iofree_count / 4;
603 			RB_SCAN(hammer2_io_tree, &hmp->iotree, NULL,
604 				hammer2_io_cleanup_callback, &info);
605 		}
606 		hammer2_spin_unex(&hmp->io_spin);
607 		hammer2_io_cleanup(hmp, &info.tmptree);
608 	}
609 }
610 
611 /*
612  * Clean up dios with (INPROG | refs) == 0 and no recent activity.
613  *
614  * Used by hammer2_io_putblk() when too many free dios accumulate, and
615  * by the unmount path after all activity has been flushed.
616  */
617 static
618 int
619 hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg)
620 {
621 	struct hammer2_cleanupcb_info *info = arg;
622 	hammer2_io_t *xio;
623 
624 	if ((dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0) {
625 		if (dio->act > 0) {
626 			--dio->act;
627 			return 0;
628 		}
629 		KKASSERT(dio->bp == NULL);
630 		RB_REMOVE(hammer2_io_tree, &dio->hmp->iotree, dio);
631 		xio = RB_INSERT(hammer2_io_tree, &info->tmptree, dio);
632 		KKASSERT(xio == NULL);
633 		if (--info->count <= 0)	/* limit scan */
634 			return(-1);
635 	}
636 	return 0;
637 }
638 
639 void
640 hammer2_io_cleanup(hammer2_dev_t *hmp, struct hammer2_io_tree *tree)
641 {
642 	hammer2_io_t *dio;
643 
644 	while ((dio = RB_ROOT(tree)) != NULL) {
645 		RB_REMOVE(hammer2_io_tree, tree, dio);
646 		KKASSERT(dio->bp == NULL &&
647 		    (dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0);
648 		kfree(dio, M_HAMMER2);
649 		atomic_add_int(&hammer2_dio_count, -1);
650 		atomic_add_int(&hmp->iofree_count, -1);
651 	}
652 }
653 
654 /*
655  * Returns a pointer to the requested data.
656  */
657 char *
658 hammer2_io_data(hammer2_io_t *dio, off_t lbase)
659 {
660 	struct buf *bp;
661 	int off;
662 
663 	bp = dio->bp;
664 	KKASSERT(bp != NULL);
665 	off = (lbase & ~HAMMER2_OFF_MASK_RADIX) - bp->b_loffset;
666 	KKASSERT(off >= 0 && off < bp->b_bufsize);
667 	return(bp->b_data + off);
668 }
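
/*
 * Illustrative example: lbase carries the size radix in its low bits
 * (masked off via HAMMER2_OFF_MASK_RADIX), so a 16KB chain at lbase
 * 0x1400e (0x14000 | radix 14) inside a buffer whose b_loffset is
 * 0x10000 resolves to bp->b_data + 0x4000.
 */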
669 
670 /*
671  * Keep track of good CRCs in dio->crc_good_mask.  XXX this should be
672  * tracked in the chain structure, but the chain structure would also
673  * need to persist at refs=0 and it does not.
674  */
675 int
676 hammer2_io_crc_good(hammer2_chain_t *chain, uint64_t *maskp)
677 {
678 	hammer2_io_t *dio;
679 	uint64_t mask;
680 
681 	if ((dio = chain->dio) != NULL && chain->bytes >= 1024) {
682 		mask = hammer2_io_mask(dio, chain->bref.data_off, chain->bytes);
683 		*maskp = mask;
684 		if ((dio->crc_good_mask & mask) == mask)
685 			return 1;
686 		return 0;
687 	}
688 	*maskp = 0;
689 
690 	return 0;
691 }
692 
693 void
694 hammer2_io_crc_setmask(hammer2_io_t *dio, uint64_t mask)
695 {
696 	if (dio) {
697 		if (sizeof(long) == 8) {
698 			atomic_set_long(&dio->crc_good_mask, mask);
699 		} else {
700 #if _BYTE_ORDER == _LITTLE_ENDIAN
701 			atomic_set_int(&((int *)&dio->crc_good_mask)[0],
702 					(uint32_t)mask);
703 			atomic_set_int(&((int *)&dio->crc_good_mask)[1],
704 					(uint32_t)(mask >> 32));
705 #else
706 			atomic_set_int(&((int *)&dio->crc_good_mask)[0],
707 					(uint32_t)(mask >> 32));
708 			atomic_set_int(&((int *)&dio->crc_good_mask)[1],
709 					(uint32_t)mask);
710 #endif
711 		}
712 	}
713 }
714 
715 void
716 hammer2_io_crc_clrmask(hammer2_io_t *dio, uint64_t mask)
717 {
718 	if (dio) {
719 		if (sizeof(long) == 8) {
720 			atomic_clear_long(&dio->crc_good_mask, mask);
721 		} else {
722 #if _BYTE_ORDER == _LITTLE_ENDIAN
723 			atomic_clear_int(&((int *)&dio->crc_good_mask)[0],
724 					(uint32_t)mask);
725 			atomic_clear_int(&((int *)&dio->crc_good_mask)[1],
726 					(uint32_t)(mask >> 32));
727 #else
728 			atomic_clear_int(&((int *)&dio->crc_good_mask)[0],
729 					(uint32_t)(mask >> 32));
730 			atomic_clear_int(&((int *)&dio->crc_good_mask)[1],
731 					(uint32_t)mask);
732 #endif
733 		}
734 	}
735 }
736 
737 /*
738  * Helpers for hammer2_io_new*() functions
739  */
740 static
741 void
742 hammer2_iocb_new_callback(hammer2_iocb_t *iocb)
743 {
744 	hammer2_io_t *dio = iocb->dio;
745 	int gbctl = (iocb->flags & HAMMER2_IOCB_QUICK) ? GETBLK_NOWAIT : 0;
746 
747 	/*
748 	 * If IOCB_INPROG is not set, the dio already has a good buffer and we
749 	 * can't mess with it other than to zero the requested range.
750 	 *
751 	 * If IOCB_INPROG is set we also own DIO_INPROG at this time and can
752 	 * do what needs to be done with dio->bp.
753 	 */
754 	if (iocb->flags & HAMMER2_IOCB_INPROG) {
755 		if ((iocb->flags & HAMMER2_IOCB_READ) == 0) {
756 			if (iocb->lsize == dio->psize) {
757 				/*
758 				 * Fully covered buffer, try to optimize to
759 				 * avoid any I/O.  We might already have the
760 				 * buffer due to iocb chaining.
761 				 */
762 				if (dio->bp == NULL) {
763 					dio->bp = getblk(dio->hmp->devvp,
764 							 dio->pbase, dio->psize,
765 							 gbctl, 0);
766 				}
767 				if (dio->bp) {
768 					vfs_bio_clrbuf(dio->bp);
769 					dio->bp->b_flags |= B_CACHE;
770 				}
771 			} else if (iocb->flags & HAMMER2_IOCB_QUICK) {
772 				/*
773 				 * Partial buffer, quick mode.  Do nothing.
774 				 * Do not instantiate the buffer or try to
775 				 * mark it B_CACHE because other portions of
776 				 * the buffer might have to be read by other
777 				 * accessors.
778 				 */
779 			} else if (dio->bp == NULL ||
780 				   (dio->bp->b_flags & B_CACHE) == 0) {
781 				/*
782 				 * Partial buffer, normal mode, requires
783 				 * read-before-write.  Chain the read.
784 				 *
785 				 * We might already have the buffer due to
786 				 * iocb chaining.  XXX unclear if we really
787 				 * need to write/release it and reacquire
788 				 * in that case.
789 				 *
790 				 * QUEUE ASYNC I/O, IOCB IS NOT YET COMPLETE.
791 				 */
792 				if (dio->bp) {
793 					if (dio->refs & HAMMER2_DIO_DIRTY) {
794 						dio_write_stats_update(dio);
795 						bdwrite(dio->bp);
796 					} else {
797 						bqrelse(dio->bp);
798 					}
799 					dio->bp = NULL;
800 				}
801 				atomic_set_int(&iocb->flags, HAMMER2_IOCB_READ);
802 				breadcb(dio->hmp->devvp,
803 					dio->pbase, dio->psize,
804 					hammer2_io_callback, iocb);
805 				return;
806 			} /* else buffer is good */
807 		} /* else callback from breadcb is complete */
808 	}
809 	if (dio->bp) {
810 		if (iocb->flags & HAMMER2_IOCB_ZERO)
811 			bzero(hammer2_io_data(dio, iocb->lbase), iocb->lsize);
812 		atomic_set_int(&dio->refs, HAMMER2_DIO_DIRTY);
813 	}
814 	hammer2_io_complete(iocb);
815 }
816 
817 static
818 int
819 _hammer2_io_new(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
820 	        hammer2_io_t **diop, int flags)
821 {
822 	hammer2_iocb_t iocb;
823 	hammer2_io_t *dio;
824 
825 	iocb.callback = hammer2_iocb_new_callback;
826 	iocb.cluster = NULL;
827 	iocb.chain = NULL;
828 	iocb.ptr = NULL;
829 	iocb.lbase = lbase;
830 	iocb.lsize = lsize;
831 	iocb.flags = flags;
832 	iocb.btype = btype;
833 	iocb.error = 0;
834 	hammer2_io_getblk(hmp, lbase, lsize, &iocb);
835 	if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
836 		hammer2_iocb_wait(&iocb);
837 	dio = *diop = iocb.dio;
838 
839 	return (iocb.error);
840 }
841 
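/*
 * The wrappers below differ only in the iocb flags they pass:
 * hammer2_io_new() zeroes the requested range (HAMMER2_IOCB_ZERO),
 * hammer2_io_newnz() does not zero it, and hammer2_io_newq() passes
 * HAMMER2_IOCB_QUICK to avoid blocking on or instantiating partial
 * buffers.  All three wait for the iocb to complete before returning.
 */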
842 int
843 hammer2_io_new(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
844 	       hammer2_io_t **diop)
845 {
846 	return(_hammer2_io_new(hmp, btype, lbase, lsize,
847 			       diop, HAMMER2_IOCB_ZERO));
848 }
849 
850 int
851 hammer2_io_newnz(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
852 		 hammer2_io_t **diop)
853 {
854 	return(_hammer2_io_new(hmp, btype, lbase, lsize, diop, 0));
855 }
856 
857 int
858 hammer2_io_newq(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
859 		hammer2_io_t **diop)
860 {
861 	return(_hammer2_io_new(hmp, btype, lbase, lsize,
862 			       diop, HAMMER2_IOCB_QUICK));
863 }
864 
865 static
866 void
867 hammer2_iocb_bread_callback(hammer2_iocb_t *iocb)
868 {
869 	hammer2_io_t *dio = iocb->dio;
870 	off_t peof;
871 	int error;
872 
873 	/*
874 	 * If IOCB_INPROG is not set, the dio already has a good buffer and we
875 	 * can't mess with it other than to zero the requested range.
876 	 *
877 	 * If IOCB_INPROG is set we also own DIO_INPROG at this time and can
878 	 * do what needs to be done with dio->bp.
879 	 */
880 	if (iocb->flags & HAMMER2_IOCB_INPROG) {
881 		int hce;
882 
883 		if (dio->bp && (dio->bp->b_flags & B_CACHE)) {
884 			/*
885 			 * Already good, likely due to being chained from
886 			 * another iocb.
887 			 */
888 			error = 0;
889 		} else if ((hce = hammer2_cluster_enable) > 0) {
890 			/*
891 			 * Synchronous cluster I/O for now.
892 			 */
893 			if (dio->bp) {
894 				bqrelse(dio->bp);
895 				dio->bp = NULL;
896 			}
897 			peof = (dio->pbase + HAMMER2_SEGMASK64) &
898 			       ~HAMMER2_SEGMASK64;
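			/*
			 * Read-ahead is bounded by both the segment-aligned
			 * peof and the maximum request size
			 * (HAMMER2_PBUFSIZE * hce).
			 */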
899 			error = cluster_read(dio->hmp->devvp, peof, dio->pbase,
900 					     dio->psize,
901 					     dio->psize, HAMMER2_PBUFSIZE*hce,
902 					     &dio->bp);
903 		} else {
904 			/*
905 			 * Synchronous I/O for now.
906 			 */
907 			if (dio->bp) {
908 				bqrelse(dio->bp);
909 				dio->bp = NULL;
910 			}
911 			error = bread(dio->hmp->devvp, dio->pbase,
912 				      dio->psize, &dio->bp);
913 		}
914 		if (error) {
915 			brelse(dio->bp);
916 			dio->bp = NULL;
917 		}
918 	}
919 	hammer2_io_complete(iocb);
920 }
921 
922 int
923 hammer2_io_bread(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
924 		hammer2_io_t **diop)
925 {
926 	hammer2_iocb_t iocb;
927 	hammer2_io_t *dio;
928 
929 	iocb.callback = hammer2_iocb_bread_callback;
930 	iocb.cluster = NULL;
931 	iocb.chain = NULL;
932 	iocb.ptr = NULL;
933 	iocb.lbase = lbase;
934 	iocb.lsize = lsize;
935 	iocb.btype = btype;
936 	iocb.flags = 0;
937 	iocb.error = 0;
938 	hammer2_io_getblk(hmp, lbase, lsize, &iocb);
939 	if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
940 		hammer2_iocb_wait(&iocb);
941 	dio = *diop = iocb.dio;
942 
943 	return (iocb.error);
944 }
945 
946 /*
947  * System buf/bio async callback extracts the iocb and chains
948  * to the iocb callback.
949  */
950 void
951 hammer2_io_callback(struct bio *bio)
952 {
953 	struct buf *dbp = bio->bio_buf;
954 	hammer2_iocb_t *iocb = bio->bio_caller_info1.ptr;
955 	hammer2_io_t *dio;
956 
957 	dio = iocb->dio;
958 	if ((bio->bio_flags & BIO_DONE) == 0)
959 		bpdone(dbp, 0);
960 	bio->bio_flags &= ~(BIO_DONE | BIO_SYNC);
961 	dio->bp = bio->bio_buf;
962 	iocb->callback(iocb);
963 }
964 
965 void
966 hammer2_io_bawrite(hammer2_io_t **diop)
967 {
968 	atomic_set_int(&(*diop)->refs, HAMMER2_DIO_DIRTY);
969 	hammer2_io_putblk(diop);
970 }
971 
972 void
973 hammer2_io_bdwrite(hammer2_io_t **diop)
974 {
975 	atomic_set_int(&(*diop)->refs, HAMMER2_DIO_DIRTY);
976 	hammer2_io_putblk(diop);
977 }
978 
979 int
980 hammer2_io_bwrite(hammer2_io_t **diop)
981 {
982 	atomic_set_int(&(*diop)->refs, HAMMER2_DIO_DIRTY);
983 	hammer2_io_putblk(diop);
984 	return (0);	/* XXX */
985 }
986 
987 void
988 hammer2_io_setdirty(hammer2_io_t *dio)
989 {
990 	atomic_set_int(&dio->refs, HAMMER2_DIO_DIRTY);
991 }
992 
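/*
 * Invalidate part or all of a dio's buffer.  The CRC-good bits covering
 * the range are cleared; the underlying buf is marked B_INVAL|B_RELBUF
 * only when the entire buffer is being invalidated.
 */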
993 void
994 hammer2_io_setinval(hammer2_io_t *dio, hammer2_off_t off, u_int bytes)
995 {
996 	uint64_t mask = hammer2_io_mask(dio, off, bytes);
997 
998 	hammer2_io_crc_clrmask(dio, mask);
999 	if ((u_int)dio->psize == bytes)
1000 		dio->bp->b_flags |= B_INVAL | B_RELBUF;
1001 }
1002 
1003 void
1004 hammer2_io_brelse(hammer2_io_t **diop)
1005 {
1006 	hammer2_io_putblk(diop);
1007 }
1008 
1009 void
1010 hammer2_io_bqrelse(hammer2_io_t **diop)
1011 {
1012 	hammer2_io_putblk(diop);
1013 }
1014 
1015 int
1016 hammer2_io_isdirty(hammer2_io_t *dio)
1017 {
1018 	return((dio->refs & HAMMER2_DIO_DIRTY) != 0);
1019 }
1020 
1021 static
1022 void
1023 dio_write_stats_update(hammer2_io_t *dio)
1024 {
1025 	long *counterp;
1026 
1027 	switch(dio->btype) {
1028 	case 0:
1029 		return;
1030 	case HAMMER2_BREF_TYPE_DATA:
1031 		counterp = &hammer2_iod_file_write;
1032 		break;
1033 	case HAMMER2_BREF_TYPE_INODE:
1034 		counterp = &hammer2_iod_meta_write;
1035 		break;
1036 	case HAMMER2_BREF_TYPE_INDIRECT:
1037 		counterp = &hammer2_iod_indr_write;
1038 		break;
1039 	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1040 	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
1041 		counterp = &hammer2_iod_fmap_write;
1042 		break;
1043 	default:
1044 		counterp = &hammer2_iod_volu_write;
1045 		break;
1046 	}
1047 	*counterp += dio->psize;
1048 }
1049