xref: /dragonfly/sys/vfs/hammer2/hammer2_io.c (revision 279dd846)
/*
 * Copyright (c) 2013-2014 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "hammer2.h"

/*
 * Implements an abstraction layer for synchronous and asynchronous
 * buffered device I/O.  It can be used for OS abstraction, but its main
 * purpose is to allow large device buffers to back hammer2_chain
 * structures that use smaller allocations, without causing deadlocks.
 */
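
/*
 * Each hammer2_io (dio) wraps a single device buffer of HAMMER2_PBUFSIZE
 * bytes, keyed by its physical base offset (pbase) in the per-device
 * RB tree (hmp->iotree).  A hammer2_iocb describes one caller request
 * against a dio and carries the callback used for both the synchronous
 * and the queued (asynchronous) completion paths.
 */
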
static int hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg);

static int
hammer2_io_cmp(hammer2_io_t *io1, hammer2_io_t *io2)
{
	if (io1->pbase < io2->pbase)
		return(-1);
	if (io1->pbase > io2->pbase)
		return(1);
	return(0);
}

RB_PROTOTYPE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp, off_t);
RB_GENERATE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp,
		off_t, pbase);

struct hammer2_cleanupcb_info {
	struct hammer2_io_tree tmptree;
	int	count;
};

#define HAMMER2_GETBLK_GOOD	0
#define HAMMER2_GETBLK_QUEUED	1
#define HAMMER2_GETBLK_OWNED	2
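
/*
 * dio->refs packs the reference count and the DIO state into one word so
 * both can be updated together with a single atomic_cmpset_int(): the low
 * bits (HAMMER2_DIO_MASK) hold the count proper, while the high bits hold
 * the HAMMER2_DIO_GOOD, HAMMER2_DIO_INPROG, HAMMER2_DIO_WAITING and
 * HAMMER2_DIO_DIRTY state flags (see hammer2.h for the exact values).
 */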

/*
 * Allocate/Locate the requested dio, reference it, issue or queue iocb.
 */
void
hammer2_io_getblk(hammer2_dev_t *hmp, off_t lbase, int lsize,
		  hammer2_iocb_t *iocb)
{
	hammer2_io_t *dio;
	hammer2_io_t *xio;
	off_t pbase;
	off_t pmask;
	/*
	 * XXX after a free, the buffer re-use case with a different size
	 * can clash with the dio cache.  Let's avoid it for now.
	 * Ultimately we need to invalidate the dio cache when freeing
	 * blocks to allow a mix of 16KB and 64KB block sizes.
	 */
	/*int psize = hammer2_devblksize(lsize);*/
	int psize = HAMMER2_PBUFSIZE;
	int refs;

	pmask = ~(hammer2_off_t)(psize - 1);

	KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize);
	lbase &= ~HAMMER2_OFF_MASK_RADIX;
	pbase = lbase & pmask;
	KKASSERT(pbase != 0 && ((lbase + lsize - 1) & pmask) == pbase);
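
	/*
	 * Example of the masking above, assuming a 64KB HAMMER2_PBUFSIZE
	 * and the usual radix encoding in the low bits of the offset: a
	 * 16KB logical buffer (radix 14) at device offset 0x28000 arrives
	 * as lbase 0x2800e.  Stripping the radix bits gives 0x28000 and
	 * masking with pmask gives pbase 0x20000, i.e. the request maps
	 * entirely into the 64KB device buffer starting at 0x20000, which
	 * is exactly what the KKASSERT above verifies.
	 */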

	/*
	 * Access/Allocate the DIO, bump dio->refs to prevent destruction.
	 */
	hammer2_spin_sh(&hmp->io_spin);
	dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, pbase);
	if (dio) {
		if ((atomic_fetchadd_int(&dio->refs, 1) &
		     HAMMER2_DIO_MASK) == 0) {
			atomic_add_int(&dio->hmp->iofree_count, -1);
		}
		hammer2_spin_unsh(&hmp->io_spin);
	} else {
		hammer2_spin_unsh(&hmp->io_spin);
		dio = kmalloc(sizeof(*dio), M_HAMMER2, M_INTWAIT | M_ZERO);
		dio->hmp = hmp;
		dio->pbase = pbase;
		dio->psize = psize;
		dio->refs = 1;
		hammer2_spin_init(&dio->spin, "h2dio");
		TAILQ_INIT(&dio->iocbq);
		hammer2_spin_ex(&hmp->io_spin);
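		/*
		 * Another thread may have created a dio for the same pbase
		 * while we held no lock.  In that case RB_INSERT returns
		 * the colliding entry; we take a ref on it and discard the
		 * dio we just allocated.
		 */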
		xio = RB_INSERT(hammer2_io_tree, &hmp->iotree, dio);
		if (xio == NULL) {
			atomic_add_int(&hammer2_dio_count, 1);
			hammer2_spin_unex(&hmp->io_spin);
		} else {
			if ((atomic_fetchadd_int(&xio->refs, 1) &
			     HAMMER2_DIO_MASK) == 0) {
				atomic_add_int(&xio->hmp->iofree_count, -1);
			}
			hammer2_spin_unex(&hmp->io_spin);
			kfree(dio, M_HAMMER2);
			dio = xio;
		}
	}

	/*
	 * Obtain/Validate the buffer.
	 */
	iocb->dio = dio;

	if (dio->act < 5)	/* SMP race ok */
		++dio->act;

	for (;;) {
		refs = dio->refs;
		cpu_ccfence();

		/*
		 * Issue the iocb immediately if the buffer is already good.
		 * Once set, GOOD cannot be cleared until refs drops to 0.
		 *
		 * lfence required because dio is not interlocked for
		 * the DIO_GOOD test.
		 */
		if (refs & HAMMER2_DIO_GOOD) {
			cpu_lfence();
			iocb->callback(iocb);
			break;
		}

		/*
		 * Try to own the DIO by setting INPROG so we can issue
		 * I/O on it.
		 */
		if (refs & HAMMER2_DIO_INPROG) {
			/*
			 * If DIO_INPROG is already set then set WAITING and
			 * queue the iocb.
			 */
			hammer2_spin_ex(&dio->spin);
			if (atomic_cmpset_int(&dio->refs, refs,
					      refs | HAMMER2_DIO_WAITING)) {
				iocb->flags |= HAMMER2_IOCB_ONQ |
					       HAMMER2_IOCB_INPROG;
				TAILQ_INSERT_TAIL(&dio->iocbq, iocb, entry);
				hammer2_spin_unex(&dio->spin);
				break;
			}
			hammer2_spin_unex(&dio->spin);
			/* retry */
		} else {
			/*
			 * If DIO_INPROG is not set then set it and issue the
			 * callback immediately to start I/O.
			 */
			if (atomic_cmpset_int(&dio->refs, refs,
					      refs | HAMMER2_DIO_INPROG)) {
				iocb->flags |= HAMMER2_IOCB_INPROG;
				iocb->callback(iocb);
				break;
			}
			/* retry */
		}
		/* retry */
	}
}

/*
 * Quickly obtain a good DIO buffer, return NULL if the system no longer
 * caches the data.
 */
hammer2_io_t *
hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase, int lsize)
{
	hammer2_iocb_t iocb;
	hammer2_io_t *dio;
	struct buf *bp;
	off_t pbase;
	off_t pmask;
	int psize = HAMMER2_PBUFSIZE;
	int orefs;
	int nrefs;

	pmask = ~(hammer2_off_t)(psize - 1);

	KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize);
	lbase &= ~HAMMER2_OFF_MASK_RADIX;
	pbase = lbase & pmask;
	KKASSERT(pbase != 0 && ((lbase + lsize - 1) & pmask) == pbase);

	/*
	 * Access/Allocate the DIO, bump dio->refs to prevent destruction.
	 */
	hammer2_spin_sh(&hmp->io_spin);
	dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, pbase);
	if (dio == NULL) {
		hammer2_spin_unsh(&hmp->io_spin);
		return NULL;
	}

	if ((atomic_fetchadd_int(&dio->refs, 1) & HAMMER2_DIO_MASK) == 0)
		atomic_add_int(&dio->hmp->iofree_count, -1);
	hammer2_spin_unsh(&hmp->io_spin);

	if (dio->act < 5)	/* SMP race ok */
		++dio->act;

	/*
	 * Obtain/validate the buffer.  Do NOT issue I/O.  Discard if
	 * the system does not have the data already cached.
	 */
	nrefs = -1;
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();

		/*
		 * Use the buffer immediately if it is already good.
		 * Once set, GOOD cannot be cleared until refs drops to 0.
		 *
		 * lfence required because dio is not interlocked for
		 * the DIO_GOOD test.
		 */
		if (orefs & HAMMER2_DIO_GOOD) {
			cpu_lfence();
			break;
		}

		/*
		 * Try to own the DIO by setting INPROG so we can issue
		 * I/O on it.  INPROG might already be set, in which case
		 * there is no way we can do this non-blocking so we punt.
		 */
		if ((orefs & HAMMER2_DIO_INPROG))
			break;
		nrefs = orefs | HAMMER2_DIO_INPROG;
		if (atomic_cmpset_int(&dio->refs, orefs, nrefs) == 0)
			continue;

		/*
		 * We own DIO_INPROG, try to set DIO_GOOD.
		 *
		 * For now do not use GETBLK_NOWAIT; if the buffer is not
		 * already instantiated fall through to a normal bread().
		 */
		bp = dio->bp;
		dio->bp = NULL;
		if (bp == NULL) {
#if 0
			bp = getblk(hmp->devvp, dio->pbase, dio->psize, 0, 0);
#endif
			bread(hmp->devvp, dio->pbase, dio->psize, &bp);
		}
		if (bp) {
			if ((bp->b_flags & B_ERROR) == 0 &&
			    (bp->b_flags & B_CACHE)) {
				dio->bp = bp;	/* assign BEFORE setting flag */
				atomic_set_int(&dio->refs, HAMMER2_DIO_GOOD);
			} else {
				bqrelse(bp);
				bp = NULL;
			}
		}

		/*
		 * Clear DIO_INPROG.
		 *
		 * This is actually a bit complicated, see
		 * hammer2_io_complete() for more information.
		 */
		iocb.dio = dio;
		iocb.flags = HAMMER2_IOCB_INPROG;
		hammer2_io_complete(&iocb);
		break;
	}

	/*
	 * Only return the dio if its buffer is good.
	 */
	if ((dio->refs & HAMMER2_DIO_GOOD) == 0) {
		hammer2_io_putblk(&dio);
	}
	return dio;
}

/*
 * The originator of the iocb is finished with it.
 */
void
hammer2_io_complete(hammer2_iocb_t *iocb)
{
	hammer2_io_t *dio = iocb->dio;
	hammer2_iocb_t *cbtmp;
	uint32_t orefs;
	uint32_t nrefs;
	uint32_t oflags;
	uint32_t nflags;

	/*
	 * If IOCB_INPROG was not set completion is synchronous due to the
	 * buffer already being good.  We can simply set IOCB_DONE and return.
	 * In this situation DIO_INPROG is not set and we have no visibility
	 * on dio->bp.
	 */
	if ((iocb->flags & HAMMER2_IOCB_INPROG) == 0) {
		atomic_set_int(&iocb->flags, HAMMER2_IOCB_DONE);
		return;
	}

	/*
	 * The iocb was queued, obtained DIO_INPROG, and its callback was
	 * made.  The callback is now complete.  We still own DIO_INPROG.
	 *
	 * We can set DIO_GOOD if no error occurred, which gives certain
	 * stability guarantees to dio->bp and allows other accessors to
	 * short-cut access.  DIO_GOOD cannot be cleared until the last
	 * ref is dropped.
	 */
	KKASSERT(dio->refs & HAMMER2_DIO_INPROG);
	if (dio->bp) {
		BUF_KERNPROC(dio->bp);
		if ((dio->bp->b_flags & B_ERROR) == 0) {
			KKASSERT(dio->bp->b_flags & B_CACHE);
			atomic_set_int(&dio->refs, HAMMER2_DIO_GOOD);
		}
	}

	/*
	 * Clean up the dio before marking the iocb as being done.  If another
	 * iocb is pending we chain to it while leaving DIO_INPROG set (it
	 * will call io completion and presumably clear DIO_INPROG).
	 *
	 * Otherwise if no other iocbs are pending we clear DIO_INPROG before
	 * finishing up the cbio.  This means that DIO_INPROG is cleared at
	 * the end of the chain before ANY of the cbios are marked done.
	 *
	 * NOTE: The TAILQ is not stable until the spin-lock is held.
	 */
	for (;;) {
		orefs = dio->refs;
		nrefs = orefs & ~(HAMMER2_DIO_WAITING | HAMMER2_DIO_INPROG);

		if (orefs & HAMMER2_DIO_WAITING) {
			hammer2_spin_ex(&dio->spin);
			cbtmp = TAILQ_FIRST(&dio->iocbq);
			if (cbtmp) {
				/*
				 * NOTE: flags not adjusted in this case.
				 *	 Flags will be adjusted by the last
				 *	 iocb.
				 */
				TAILQ_REMOVE(&dio->iocbq, cbtmp, entry);
				hammer2_spin_unex(&dio->spin);
				cbtmp->callback(cbtmp);	/* chained */
				break;
			} else if (atomic_cmpset_int(&dio->refs,
						     orefs, nrefs)) {
				hammer2_spin_unex(&dio->spin);
				break;
			}
			hammer2_spin_unex(&dio->spin);
			/* retry */
		} else if (atomic_cmpset_int(&dio->refs, orefs, nrefs)) {
			break;
		} /* else retry */
		/* retry */
	}

	/*
	 * Mark the iocb as done and wakeup any waiters.  This is done after
	 * all iocb chains have been called back and after DIO_INPROG has been
	 * cleared.  This avoids races against ref count drops by the waiting
	 * threads (a hard but not impossible SMP race) which might result in
	 * a 1->0 transition of the refs while DIO_INPROG is still set.
	 */
	for (;;) {
		oflags = iocb->flags;
		cpu_ccfence();
		nflags = oflags;
		nflags &= ~(HAMMER2_IOCB_WAKEUP | HAMMER2_IOCB_INPROG);
		nflags |= HAMMER2_IOCB_DONE;

		if (atomic_cmpset_int(&iocb->flags, oflags, nflags)) {
			if (oflags & HAMMER2_IOCB_WAKEUP)
				wakeup(iocb);
			/* SMP: iocb is now stale */
			break;
		}
		/* retry */
	}
	iocb = NULL;
}

/*
 * Wait for an iocb's I/O to finish.
 */
void
hammer2_iocb_wait(hammer2_iocb_t *iocb)
{
	uint32_t oflags;
	uint32_t nflags;

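	/*
	 * Interlocked sleep: tsleep_interlock() places us on the sleep
	 * queue before we publish HAMMER2_IOCB_WAKEUP, so a completion
	 * that sets DONE and calls wakeup() between our atomic_cmpset_int()
	 * and the tsleep() is not lost (PINTERLOCKED picks it up).  If the
	 * cmpset fails we simply re-evaluate the flags.
	 */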
	for (;;) {
		oflags = iocb->flags;
		cpu_ccfence();
		nflags = oflags | HAMMER2_IOCB_WAKEUP;
		if (oflags & HAMMER2_IOCB_DONE)
			break;
		tsleep_interlock(iocb, 0);
		if (atomic_cmpset_int(&iocb->flags, oflags, nflags)) {
			tsleep(iocb, PINTERLOCKED, "h2iocb", hz);
		}
	}
}

/*
 * Release our ref on *diop.
 *
 * On the last ref we must atomically clear DIO_GOOD and set DIO_INPROG,
 * then dispose of the underlying buffer.
 */
void
hammer2_io_putblk(hammer2_io_t **diop)
{
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	hammer2_iocb_t iocb;
	struct buf *bp;
	off_t peof;
	off_t pbase;
	int psize;
	int orefs;
	int nrefs;

	dio = *diop;
	*diop = NULL;
	hmp = dio->hmp;

	/*
	 * Drop refs.
	 *
	 * On the 1->0 transition clear flags and set INPROG.
	 *
	 * On the 1->0 transition if INPROG is already set, another thread
	 * is in lastdrop and we can just return after the transition.
	 *
	 * On any other transition we can generally just return.
	 */
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();
		nrefs = orefs - 1;

		if ((orefs & HAMMER2_DIO_MASK) == 1 &&
		    (orefs & HAMMER2_DIO_INPROG) == 0) {
			/*
			 * Lastdrop case, INPROG not yet set; we take
			 * ownership of the teardown by setting it.
			 */
			nrefs &= ~(HAMMER2_DIO_GOOD | HAMMER2_DIO_DIRTY);
			nrefs |= HAMMER2_DIO_INPROG;
			if (atomic_cmpset_int(&dio->refs, orefs, nrefs))
				break;
		} else if ((orefs & HAMMER2_DIO_MASK) == 1) {
			/*
			 * Lastdrop case, INPROG already set.
			 */
			if (atomic_cmpset_int(&dio->refs, orefs, nrefs)) {
				atomic_add_int(&hmp->iofree_count, 1);
				return;
			}
		} else {
			/*
			 * Normal drop case.
			 */
			if (atomic_cmpset_int(&dio->refs, orefs, nrefs))
				return;
		}
		cpu_pause();
		/* retry */
	}

	/*
	 * Lastdrop (1->0 transition).  INPROG has been set, GOOD and DIRTY
	 * have been cleared.
	 *
	 * We can now dispose of the buffer, and should do it before calling
	 * io_complete() in case there's a race against a new reference
	 * which causes io_complete() to chain and instantiate the bp again.
	 */
	pbase = dio->pbase;
	psize = dio->psize;
	bp = dio->bp;
	dio->bp = NULL;

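	/*
	 * Buffer disposal policy: a good, dirty buffer is written back,
	 * clustered out to the enclosing segment boundary when clustering
	 * is enabled, otherwise via a delayed write; a good, clean buffer
	 * is requeued (or released outright if it was marked invalid or
	 * errored).  If the buffer never became good it is delayed-written
	 * when dirty and thrown away otherwise.
	 */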
	if (orefs & HAMMER2_DIO_GOOD) {
		KKASSERT(bp != NULL);
		if (orefs & HAMMER2_DIO_DIRTY) {
			if (hammer2_cluster_enable) {
				peof = (pbase + HAMMER2_SEGMASK64) &
				       ~HAMMER2_SEGMASK64;
				cluster_write(bp, peof, psize, 4);
			} else {
				bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		} else if (bp->b_flags & (B_ERROR | B_INVAL | B_RELBUF)) {
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	} else if (bp) {
		if (orefs & HAMMER2_DIO_DIRTY) {
			bdwrite(bp);
		} else {
			brelse(bp);
		}
	}

	/*
	 * The instant we call io_complete() the dio is a free agent again
	 * and can be ripped out from under us.
	 *
	 * We can clean up our final DIO_INPROG by simulating an iocb
	 * completion.
	 */
	hmp = dio->hmp;				/* extract fields */
	atomic_add_int(&hmp->iofree_count, 1);
	cpu_ccfence();

	iocb.dio = dio;
	iocb.flags = HAMMER2_IOCB_INPROG;
	hammer2_io_complete(&iocb);
	dio = NULL;				/* dio stale */

	/*
	 * We cache free buffers so re-use cases can use a shared lock, but
	 * if too many build up we have to clean them out.
	 */
	if (hmp->iofree_count > 65536) {
		struct hammer2_cleanupcb_info info;

		RB_INIT(&info.tmptree);
		hammer2_spin_ex(&hmp->io_spin);
		if (hmp->iofree_count > 65536) {
			info.count = hmp->iofree_count / 4;
			RB_SCAN(hammer2_io_tree, &hmp->iotree, NULL,
				hammer2_io_cleanup_callback, &info);
		}
		hammer2_spin_unex(&hmp->io_spin);
		hammer2_io_cleanup(hmp, &info.tmptree);
	}
}

/*
 * Clean up any dios with (INPROG | refs) == 0, i.e. cached but completely
 * idle.  Used as the RB_SCAN callback to trim the free-dio cache;
 * hammer2_io_cleanup() below is also called on umount after all activity
 * has been flushed.
 */
static
int
hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg)
{
	struct hammer2_cleanupcb_info *info = arg;
	hammer2_io_t *xio;

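	/*
	 * dio->act is a small activity counter bumped on every getblk/
	 * getquick hit.  Decay it here so recently referenced dios survive
	 * a trimming pass and only idle dios are actually torn down.
	 */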
	if ((dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0) {
		if (dio->act > 0) {
			--dio->act;
			return 0;
		}
		KKASSERT(dio->bp == NULL);
		RB_REMOVE(hammer2_io_tree, &dio->hmp->iotree, dio);
		xio = RB_INSERT(hammer2_io_tree, &info->tmptree, dio);
		KKASSERT(xio == NULL);
		if (--info->count <= 0)	/* limit scan */
			return(-1);
	}
	return 0;
}

void
hammer2_io_cleanup(hammer2_dev_t *hmp, struct hammer2_io_tree *tree)
{
	hammer2_io_t *dio;

	while ((dio = RB_ROOT(tree)) != NULL) {
		RB_REMOVE(hammer2_io_tree, tree, dio);
		KKASSERT(dio->bp == NULL &&
		    (dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0);
		kfree(dio, M_HAMMER2);
		atomic_add_int(&hammer2_dio_count, -1);
		atomic_add_int(&hmp->iofree_count, -1);
	}
}

/*
 * Returns a pointer to the requested data.
 */
char *
hammer2_io_data(hammer2_io_t *dio, off_t lbase)
{
	struct buf *bp;
	int off;

	bp = dio->bp;
	KKASSERT(bp != NULL);
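
	/*
	 * The dio's buffer spans an entire device block, so the requested
	 * data simply lives at an offset within bp->b_data: strip the radix
	 * bits from lbase and subtract the buffer's device offset.  For
	 * example, a 16KB chain at 0x28000 backed by the 64KB buffer at
	 * 0x20000 yields off 0x8000.
	 */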
	off = (lbase & ~HAMMER2_OFF_MASK_RADIX) - bp->b_loffset;
	KKASSERT(off >= 0 && off < bp->b_bufsize);
	return(bp->b_data + off);
}

/*
 * Helpers for hammer2_io_new*() functions
 */
static
void
hammer2_iocb_new_callback(hammer2_iocb_t *iocb)
{
	hammer2_io_t *dio = iocb->dio;
	int gbctl = (iocb->flags & HAMMER2_IOCB_QUICK) ? GETBLK_NOWAIT : 0;

	/*
	 * If IOCB_INPROG is not set the dio already has a good buffer and we
	 * can't mess with it other than to zero the requested range.
	 *
	 * If IOCB_INPROG is set we also own DIO_INPROG at this time and can
	 * do what needs to be done with dio->bp.
	 */
	if (iocb->flags & HAMMER2_IOCB_INPROG) {
		if ((iocb->flags & HAMMER2_IOCB_READ) == 0) {
			if (iocb->lsize == dio->psize) {
				/*
				 * Fully covered buffer, try to optimize to
				 * avoid any I/O.  We might already have the
				 * buffer due to iocb chaining.
				 */
				if (dio->bp == NULL) {
					dio->bp = getblk(dio->hmp->devvp,
							 dio->pbase, dio->psize,
							 gbctl, 0);
				}
				if (dio->bp) {
					vfs_bio_clrbuf(dio->bp);
					dio->bp->b_flags |= B_CACHE;
				}
			} else if (iocb->flags & HAMMER2_IOCB_QUICK) {
				/*
				 * Partial buffer, quick mode.  Do nothing.
				 * Do not instantiate the buffer or try to
				 * mark it B_CACHE because other portions of
				 * the buffer might have to be read by other
				 * accessors.
				 */
			} else if (dio->bp == NULL ||
				   (dio->bp->b_flags & B_CACHE) == 0) {
				/*
				 * Partial buffer, normal mode, requires
				 * read-before-write.  Chain the read.
				 *
				 * We might already have the buffer due to
				 * iocb chaining.  XXX unclear if we really
				 * need to write/release it and reacquire
				 * in that case.
				 *
				 * QUEUE ASYNC I/O, IOCB IS NOT YET COMPLETE.
				 */
				if (dio->bp) {
					if (dio->refs & HAMMER2_DIO_DIRTY)
						bdwrite(dio->bp);
					else
						bqrelse(dio->bp);
					dio->bp = NULL;
				}
				atomic_set_int(&iocb->flags, HAMMER2_IOCB_READ);
				breadcb(dio->hmp->devvp,
					dio->pbase, dio->psize,
					hammer2_io_callback, iocb);
				return;
			} /* else buffer is good */
		} /* else callback from breadcb is complete */
	}
	if (dio->bp) {
		if (iocb->flags & HAMMER2_IOCB_ZERO)
			bzero(hammer2_io_data(dio, iocb->lbase), iocb->lsize);
		atomic_set_int(&dio->refs, HAMMER2_DIO_DIRTY);
	}
	hammer2_io_complete(iocb);
}

static
int
_hammer2_io_new(hammer2_dev_t *hmp, off_t lbase, int lsize,
	        hammer2_io_t **diop, int flags)
{
	hammer2_iocb_t iocb;
	hammer2_io_t *dio;

	iocb.callback = hammer2_iocb_new_callback;
	iocb.cluster = NULL;
	iocb.chain = NULL;
	iocb.ptr = NULL;
	iocb.lbase = lbase;
	iocb.lsize = lsize;
	iocb.flags = flags;
	iocb.error = 0;
	hammer2_io_getblk(hmp, lbase, lsize, &iocb);
	if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
		hammer2_iocb_wait(&iocb);
	dio = *diop = iocb.dio;

	return (iocb.error);
}
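
/*
 * hammer2_io_new(), hammer2_io_newnz() and hammer2_io_newq() below all
 * drive the same pattern as _hammer2_io_new() above: fill in an iocb,
 * hand it to hammer2_io_getblk(), and wait for asynchronous completion
 * if HAMMER2_IOCB_DONE was not set synchronously.  They differ only in
 * the flags passed down: ZERO pre-zeros the requested range, QUICK avoids
 * instantiating partial buffers, and 0 leaves existing data intact.
 */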

int
hammer2_io_new(hammer2_dev_t *hmp, off_t lbase, int lsize,
	       hammer2_io_t **diop)
{
	return(_hammer2_io_new(hmp, lbase, lsize, diop, HAMMER2_IOCB_ZERO));
}

int
hammer2_io_newnz(hammer2_dev_t *hmp, off_t lbase, int lsize,
	       hammer2_io_t **diop)
{
	return(_hammer2_io_new(hmp, lbase, lsize, diop, 0));
}

int
hammer2_io_newq(hammer2_dev_t *hmp, off_t lbase, int lsize,
	       hammer2_io_t **diop)
{
	return(_hammer2_io_new(hmp, lbase, lsize, diop, HAMMER2_IOCB_QUICK));
}

static
void
hammer2_iocb_bread_callback(hammer2_iocb_t *iocb)
{
	hammer2_io_t *dio = iocb->dio;
	off_t peof;
	int error;

	/*
	 * If IOCB_INPROG is not set the dio already has a good buffer and
	 * there is nothing for us to do.
	 *
	 * If IOCB_INPROG is set we also own DIO_INPROG at this time and can
	 * do what needs to be done with dio->bp.
	 */
	if (iocb->flags & HAMMER2_IOCB_INPROG) {
		if (dio->bp && (dio->bp->b_flags & B_CACHE)) {
			/*
			 * Already good, likely due to being chained from
			 * another iocb.
			 */
			error = 0;
		} else if (hammer2_cluster_enable) {
			/*
			 * Synchronous cluster I/O for now.
			 */
			if (dio->bp) {
				bqrelse(dio->bp);
				dio->bp = NULL;
			}
			peof = (dio->pbase + HAMMER2_SEGMASK64) &
			       ~HAMMER2_SEGMASK64;
			error = cluster_read(dio->hmp->devvp, peof, dio->pbase,
					     dio->psize,
					     dio->psize, HAMMER2_PBUFSIZE*4,
					     &dio->bp);
		} else {
			/*
			 * Synchronous I/O for now.
			 */
			if (dio->bp) {
				bqrelse(dio->bp);
				dio->bp = NULL;
			}
			error = bread(dio->hmp->devvp, dio->pbase,
				      dio->psize, &dio->bp);
		}
		if (error) {
			brelse(dio->bp);
			dio->bp = NULL;
		}
	}
	hammer2_io_complete(iocb);
}

int
hammer2_io_bread(hammer2_dev_t *hmp, off_t lbase, int lsize,
		hammer2_io_t **diop)
{
	hammer2_iocb_t iocb;
	hammer2_io_t *dio;

	iocb.callback = hammer2_iocb_bread_callback;
	iocb.cluster = NULL;
	iocb.chain = NULL;
	iocb.ptr = NULL;
	iocb.lbase = lbase;
	iocb.lsize = lsize;
	iocb.flags = 0;
	iocb.error = 0;
	hammer2_io_getblk(hmp, lbase, lsize, &iocb);
	if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
		hammer2_iocb_wait(&iocb);
	dio = *diop = iocb.dio;

	return (iocb.error);
}

/*
 * System buf/bio async callback extracts the iocb and chains
 * to the iocb callback.
 */
void
hammer2_io_callback(struct bio *bio)
{
	struct buf *dbp = bio->bio_buf;
	hammer2_iocb_t *iocb = bio->bio_caller_info1.ptr;
	hammer2_io_t *dio;

	dio = iocb->dio;
	if ((bio->bio_flags & BIO_DONE) == 0)
		bpdone(dbp, 0);
	bio->bio_flags &= ~(BIO_DONE | BIO_SYNC);
	dio->bp = bio->bio_buf;
	iocb->callback(iocb);
}

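/*
 * The *write() entry points below only mark the dio dirty and drop the
 * caller's reference; the actual write strategy (clustered, delayed or
 * otherwise) is chosen in hammer2_io_putblk() when the last reference
 * goes away.
 */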
void
hammer2_io_bawrite(hammer2_io_t **diop)
{
	atomic_set_int(&(*diop)->refs, HAMMER2_DIO_DIRTY);
	hammer2_io_putblk(diop);
}

void
hammer2_io_bdwrite(hammer2_io_t **diop)
{
	atomic_set_int(&(*diop)->refs, HAMMER2_DIO_DIRTY);
	hammer2_io_putblk(diop);
}

int
hammer2_io_bwrite(hammer2_io_t **diop)
{
	atomic_set_int(&(*diop)->refs, HAMMER2_DIO_DIRTY);
	hammer2_io_putblk(diop);
	return (0);	/* XXX */
}

void
hammer2_io_setdirty(hammer2_io_t *dio)
{
	atomic_set_int(&dio->refs, HAMMER2_DIO_DIRTY);
}

void
hammer2_io_setinval(hammer2_io_t *dio, u_int bytes)
{
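	/*
	 * Only invalidate when the caller is discarding the entire device
	 * buffer; a partial invalidation cannot be honored here because the
	 * underlying buffer may also back other, smaller chains.
	 */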
	if ((u_int)dio->psize == bytes)
		dio->bp->b_flags |= B_INVAL | B_RELBUF;
}

void
hammer2_io_brelse(hammer2_io_t **diop)
{
	hammer2_io_putblk(diop);
}

void
hammer2_io_bqrelse(hammer2_io_t **diop)
{
	hammer2_io_putblk(diop);
}

int
hammer2_io_isdirty(hammer2_io_t *dio)
{
	return((dio->refs & HAMMER2_DIO_DIRTY) != 0);
}
913