xref: /dragonfly/sys/vfs/hammer2/hammer2_io.c (revision a4fe36f1)
1 /*
2  * Copyright (c) 2013-2014 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include "hammer2.h"
36 
37 /*
38  * Implements an abstraction layer for synchronous and asynchronous
39  * buffered device I/O.  Can be used for OS abstraction, but the main
40  * purpose is to allow larger buffers to back hammer2_chains that use
41  * smaller allocations, without causing deadlocks.
42  *
43  */
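
/*
 * Illustrative usage sketch (not part of the driver; the locals and the
 * read/modify flow are hypothetical, only the hammer2_io_*() calls below
 * exist in this file):
 *
 *	hammer2_io_t *dio;
 *	char *data;
 *	int error;
 *
 *	error = hammer2_io_bread(hmp, lbase, lsize, &dio);
 *	if (error == 0) {
 *		data = hammer2_io_data(dio, lbase);
 *		...inspect up to lsize bytes at data...
 *		hammer2_io_bqrelse(&dio);
 *	}
 *
 * A writer would modify the mapped range and release with
 * hammer2_io_bdwrite(&dio) instead, which marks the dio dirty on the
 * way out.
 */
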
44 static int hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg);
45 
46 static int
47 hammer2_io_cmp(hammer2_io_t *io1, hammer2_io_t *io2)
48 {
49 	if (io1->pbase < io2->pbase)
50 		return(-1);
51 	if (io1->pbase > io2->pbase)
52 		return(1);
53 	return(0);
54 }
55 
56 RB_PROTOTYPE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp, off_t);
57 RB_GENERATE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp,
58 		off_t, pbase);
59 
60 struct hammer2_cleanupcb_info {
61 	struct hammer2_io_tree tmptree;
62 	int	count;
63 };
64 
65 static __inline
66 uint64_t
67 hammer2_io_mask(hammer2_io_t *dio, hammer2_off_t off, u_int bytes)
68 {
69 	uint64_t mask;
70 	int i;
71 
72 	if (bytes < 1024)	/* smaller chunks not supported */
73 		return 0;
74 
75 	/*
76 	 * Calculate crc check mask for larger chunks
77 	 */
78 	i = (((off & ~HAMMER2_OFF_MASK_RADIX) - dio->pbase) &
79 	     HAMMER2_PBUFMASK) >> 10;
80 	if (i == 0 && bytes == HAMMER2_PBUFSIZE)
81 		return((uint64_t)-1);
82 	mask = ((uint64_t)1U << (bytes >> 10)) - 1;
83 	mask <<= i;
84 
85 	return mask;
86 }
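
/*
 * Worked example (illustrative, assuming HAMMER2_PBUFSIZE is the usual
 * 64KB physical buffer): the mask covers the buffer in 1KB granules, one
 * bit per granule, bit 0 corresponding to the granule at dio->pbase.
 *
 *	off 8KB into the buffer, bytes = 4096:
 *		i    = 8192 >> 10 = 8
 *		mask = ((1 << (4096 >> 10)) - 1) << 8 = 0xf << 8 = 0x0f00
 *
 *	off at the start of the buffer, bytes = HAMMER2_PBUFSIZE:
 *		mask = (uint64_t)-1	(all granules at once)
 */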
87 
88 #define HAMMER2_GETBLK_GOOD	0
89 #define HAMMER2_GETBLK_QUEUED	1
90 #define HAMMER2_GETBLK_OWNED	2
91 
92 /*
93  * Allocate/Locate the requested dio, reference it, issue or queue iocb.
94  */
95 void
96 hammer2_io_getblk(hammer2_dev_t *hmp, off_t lbase, int lsize,
97 		  hammer2_iocb_t *iocb)
98 {
99 	hammer2_io_t *dio;
100 	hammer2_io_t *xio;
101 	off_t pbase;
102 	off_t pmask;
103 	/*
104 	 * XXX after free, buffer reuse case w/ different size can clash
105 	 * with dio cache.  Let's avoid it for now.  Ultimately we need to
106 	 * invalidate the dio cache when freeing blocks to allow a mix
107 	 * of 16KB and 64KB block sizes.
108 	 */
109 	/*int psize = hammer2_devblksize(lsize);*/
110 	int psize = HAMMER2_PBUFSIZE;
111 	int refs;
112 
113 	pmask = ~(hammer2_off_t)(psize - 1);
114 
115 	KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize);
116 	lbase &= ~HAMMER2_OFF_MASK_RADIX;
117 	pbase = lbase & pmask;
118 	KKASSERT(pbase != 0 && ((lbase + lsize - 1) & pmask) == pbase);
119 
120 	/*
121 	 * Access/Allocate the DIO, bump dio->refs to prevent destruction.
122 	 */
123 	hammer2_spin_sh(&hmp->io_spin);
124 	dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, pbase);
125 	if (dio) {
126 		if ((atomic_fetchadd_int(&dio->refs, 1) &
127 		     HAMMER2_DIO_MASK) == 0) {
128 			atomic_add_int(&dio->hmp->iofree_count, -1);
129 		}
130 		hammer2_spin_unsh(&hmp->io_spin);
131 	} else {
132 		hammer2_spin_unsh(&hmp->io_spin);
133 		dio = kmalloc(sizeof(*dio), M_HAMMER2, M_INTWAIT | M_ZERO);
134 		dio->hmp = hmp;
135 		dio->pbase = pbase;
136 		dio->psize = psize;
137 		dio->refs = 1;
138 		hammer2_spin_init(&dio->spin, "h2dio");
139 		TAILQ_INIT(&dio->iocbq);
140 		hammer2_spin_ex(&hmp->io_spin);
141 		xio = RB_INSERT(hammer2_io_tree, &hmp->iotree, dio);
142 		if (xio == NULL) {
143 			atomic_add_int(&hammer2_dio_count, 1);
144 			hammer2_spin_unex(&hmp->io_spin);
145 		} else {
146 			if ((atomic_fetchadd_int(&xio->refs, 1) &
147 			     HAMMER2_DIO_MASK) == 0) {
148 				atomic_add_int(&xio->hmp->iofree_count, -1);
149 			}
150 			hammer2_spin_unex(&hmp->io_spin);
151 			kfree(dio, M_HAMMER2);
152 			dio = xio;
153 		}
154 	}
155 
156 	/*
157 	 * Obtain/Validate the buffer.
158 	 */
159 	iocb->dio = dio;
160 
161 	if (dio->act < 5)	/* SMP race ok */
162 		++dio->act;
163 
164 	for (;;) {
165 		refs = dio->refs;
166 		cpu_ccfence();
167 
168 		/*
169 		 * Issue the iocb immediately if the buffer is already good.
170 		 * Once set, GOOD cannot be cleared until refs drops to 0.
171 		 *
172 		 * lfence required because dios are not interlocked for
173 		 * the DIO_GOOD test.
174 		 */
175 		if (refs & HAMMER2_DIO_GOOD) {
176 			cpu_lfence();
177 			iocb->callback(iocb);
178 			break;
179 		}
180 
181 		/*
182 		 * Try to own the DIO by setting INPROG so we can issue
183 		 * I/O on it.
184 		 */
185 		if (refs & HAMMER2_DIO_INPROG) {
186 			/*
187 			 * If DIO_INPROG is already set then set WAITING and
188 			 * queue the iocb.
189 			 */
190 			hammer2_spin_ex(&dio->spin);
191 			if (atomic_cmpset_int(&dio->refs, refs,
192 					      refs | HAMMER2_DIO_WAITING)) {
193 				iocb->flags |= HAMMER2_IOCB_ONQ |
194 					       HAMMER2_IOCB_INPROG;
195 				TAILQ_INSERT_TAIL(&dio->iocbq, iocb, entry);
196 				hammer2_spin_unex(&dio->spin);
197 				break;
198 			}
199 			hammer2_spin_unex(&dio->spin);
200 			/* retry */
201 		} else {
202 			/*
203 			 * If DIO_INPROG is not set then set it and issue the
204 			 * callback immediately to start I/O.
205 			 */
206 			if (atomic_cmpset_int(&dio->refs, refs,
207 					      refs | HAMMER2_DIO_INPROG)) {
208 				iocb->flags |= HAMMER2_IOCB_INPROG;
209 				iocb->callback(iocb);
210 				break;
211 			}
212 			/* retry */
213 		}
214 		/* retry */
215 	}
216 }
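
/*
 * Illustrative caller pattern (a sketch; _hammer2_io_new() and
 * hammer2_io_bread() below are the in-tree users): the caller fills out
 * a stack iocb with a callback, hands it to hammer2_io_getblk(), and
 * waits for IOCB_DONE if the callback did not complete synchronously.
 * my_callback is hypothetical.
 *
 *	hammer2_iocb_t iocb;
 *
 *	iocb.callback = my_callback;
 *	iocb.lbase = lbase;
 *	iocb.lsize = lsize;
 *	iocb.flags = 0;
 *	iocb.error = 0;
 *	hammer2_io_getblk(hmp, lbase, lsize, &iocb);
 *	if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
 *		hammer2_iocb_wait(&iocb);
 */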
217 
218 /*
219  * Quickly obtain a good DIO buffer; return NULL if the system no longer
220  * caches the data.
221  */
222 hammer2_io_t *
223 hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase, int lsize)
224 {
225 	hammer2_iocb_t iocb;
226 	hammer2_io_t *dio;
227 	struct buf *bp;
228 	off_t pbase;
229 	off_t pmask;
230 	int psize = HAMMER2_PBUFSIZE;
231 	int orefs;
232 	int nrefs;
233 
234 	pmask = ~(hammer2_off_t)(psize - 1);
235 
236 	KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize);
237 	lbase &= ~HAMMER2_OFF_MASK_RADIX;
238 	pbase = lbase & pmask;
239 	KKASSERT(pbase != 0 && ((lbase + lsize - 1) & pmask) == pbase);
240 
241 	/*
242 	 * Access/Allocate the DIO, bump dio->refs to prevent destruction.
243 	 */
244 	hammer2_spin_sh(&hmp->io_spin);
245 	dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, pbase);
246 	if (dio == NULL) {
247 		hammer2_spin_unsh(&hmp->io_spin);
248 		return NULL;
249 	}
250 
251 	if ((atomic_fetchadd_int(&dio->refs, 1) & HAMMER2_DIO_MASK) == 0)
252 		atomic_add_int(&dio->hmp->iofree_count, -1);
253 	hammer2_spin_unsh(&hmp->io_spin);
254 
255 	if (dio->act < 5)	/* SMP race ok */
256 		++dio->act;
257 
258 	/*
259 	 * Obtain/validate the buffer.  Do NOT issue I/O.  Discard if
260 	 * the system does not have the data already cached.
261 	 */
262 	nrefs = -1;
263 	for (;;) {
264 		orefs = dio->refs;
265 		cpu_ccfence();
266 
267 		/*
268 		 * Issue the iocb immediately if the buffer is already good.
269 		 * Once set, GOOD cannot be cleared until refs drops to 0.
270 		 *
271 		 * lfence required because the dio is not interlocked for
272 		 * the DIO_GOOD test.
273 		 */
274 		if (orefs & HAMMER2_DIO_GOOD) {
275 			cpu_lfence();
276 			break;
277 		}
278 
279 		/*
280 		 * Try to own the DIO by setting INPROG so we can issue
281 		 * I/O on it.  INPROG might already be set, in which case
282 		 * there is no way we can do this without blocking, so we punt.
283 		 */
284 		if ((orefs & HAMMER2_DIO_INPROG))
285 			break;
286 		nrefs = orefs | HAMMER2_DIO_INPROG;
287 		if (atomic_cmpset_int(&dio->refs, orefs, nrefs) == 0)
288 			continue;
289 
290 		/*
291 		 * We own DIO_INPROG, try to set DIO_GOOD.
292 		 *
293 		 * For now do not use GETBLK_NOWAIT because
294 		 * For now we do not use GETBLK_NOWAIT.
295 		bp = dio->bp;
296 		dio->bp = NULL;
297 		if (bp == NULL) {
298 #if 0
299 			bp = getblk(hmp->devvp, dio->pbase, dio->psize, 0, 0);
300 #endif
301 			bread(hmp->devvp, dio->pbase, dio->psize, &bp);
302 		}
303 		if (bp) {
304 			if ((bp->b_flags & B_ERROR) == 0 &&
305 			    (bp->b_flags & B_CACHE)) {
306 				dio->bp = bp;	/* assign BEFORE setting flag */
307 				atomic_set_int(&dio->refs, HAMMER2_DIO_GOOD);
308 			} else {
309 				bqrelse(bp);
310 				bp = NULL;
311 			}
312 		}
313 
314 		/*
315 		 * Clear DIO_INPROG.
316 		 *
317 		 * This is actually a bit complicated, see
318 		 * hammer2_io_complete() for more information.
319 		 */
320 		iocb.dio = dio;
321 		iocb.flags = HAMMER2_IOCB_INPROG;
322 		hammer2_io_complete(&iocb);
323 		break;
324 	}
325 
326 	/*
327 	 * Only return the dio if its buffer is good.
328 	 */
329 	if ((dio->refs & HAMMER2_DIO_GOOD) == 0) {
330 		hammer2_io_putblk(&dio);
331 	}
332 	return dio;
333 }
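
/*
 * Usage sketch (illustrative, not a requirement of the API): the return
 * contract is "good dio or NULL", so a caller might probe the cache
 * first and fall back to the full iocb path on a miss:
 *
 *	dio = hammer2_io_getquick(hmp, lbase, lsize);
 *	if (dio == NULL)
 *		error = hammer2_io_bread(hmp, lbase, lsize, &dio);
 */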
334 
335 /*
336  * The originator of the iocb is finished with it.
337  */
338 void
339 hammer2_io_complete(hammer2_iocb_t *iocb)
340 {
341 	hammer2_io_t *dio = iocb->dio;
342 	hammer2_iocb_t *cbtmp;
343 	uint32_t orefs;
344 	uint32_t nrefs;
345 	uint32_t oflags;
346 	uint32_t nflags;
347 
348 	/*
349 	 * If IOCB_INPROG was not set, completion is synchronous due to the
350 	 * buffer already being good.  We can simply set IOCB_DONE and return.
351 	 * In this situation DIO_INPROG is not set and we have no visibility
352 	 * on dio->bp.
353 	 */
354 	if ((iocb->flags & HAMMER2_IOCB_INPROG) == 0) {
355 		atomic_set_int(&iocb->flags, HAMMER2_IOCB_DONE);
356 		return;
357 	}
358 
359 	/*
360 	 * The iocb was queued, obtained DIO_INPROG, and its callback was
361 	 * made.  The callback is now complete.  We still own DIO_INPROG.
362 	 *
363 	 * We can set DIO_GOOD if no error occurred, which gives certain
364 	 * stability guarantees to dio->bp and allows other accessors to
365 	 * short-cut access.  DIO_GOOD cannot be cleared until the last
366 	 * ref is dropped.
367 	 */
368 	KKASSERT(dio->refs & HAMMER2_DIO_INPROG);
369 	if (dio->bp) {
370 		BUF_KERNPROC(dio->bp);
371 		if ((dio->bp->b_flags & B_ERROR) == 0) {
372 			KKASSERT(dio->bp->b_flags & B_CACHE);
373 			atomic_set_int(&dio->refs, HAMMER2_DIO_GOOD);
374 		}
375 	}
376 
377 	/*
378 	 * Clean up the dio before marking the iocb as being done.  If another
379 	 * iocb is pending we chain to it while leaving DIO_INPROG set (it
380 	 * will call io completion and presumably clear DIO_INPROG).
381 	 *
382 	 * Otherwise, if no other iocbs are pending, we clear DIO_INPROG before
383 	 * finishing up the current iocb.  This means that DIO_INPROG is cleared
384 	 * at the end of the chain before ANY of the iocbs are marked done.
385 	 *
386 	 * NOTE: The TAILQ is not stable until the spin-lock is held.
387 	 */
388 	for (;;) {
389 		orefs = dio->refs;
390 		nrefs = orefs & ~(HAMMER2_DIO_WAITING | HAMMER2_DIO_INPROG);
391 
392 		if (orefs & HAMMER2_DIO_WAITING) {
393 			hammer2_spin_ex(&dio->spin);
394 			cbtmp = TAILQ_FIRST(&dio->iocbq);
395 			if (cbtmp) {
396 				/*
397 				 * NOTE: flags not adjusted in this case.
398 				 *	 Flags will be adjusted by the last
399 				 *	 iocb.
400 				 */
401 				TAILQ_REMOVE(&dio->iocbq, cbtmp, entry);
402 				hammer2_spin_unex(&dio->spin);
403 				cbtmp->callback(cbtmp);	/* chained */
404 				break;
405 			} else if (atomic_cmpset_int(&dio->refs,
406 						     orefs, nrefs)) {
407 				hammer2_spin_unex(&dio->spin);
408 				break;
409 			}
410 			hammer2_spin_unex(&dio->spin);
411 			/* retry */
412 		} else if (atomic_cmpset_int(&dio->refs, orefs, nrefs)) {
413 			break;
414 		} /* else retry */
415 		/* retry */
416 	}
417 
418 	/*
419 	 * Mark the iocb as done and wakeup any waiters.  This is done after
420 	 * all iocb chains have been called back and after DIO_INPROG has been
421 	 * cleared.  This avoids races against ref count drops by the waiting
422 	 * threads (a hard but not impossible SMP race) which might result in
423 	 * a 1->0 transition of the refs while DIO_INPROG is still set.
424 	 */
425 	for (;;) {
426 		oflags = iocb->flags;
427 		cpu_ccfence();
428 		nflags = oflags;
429 		nflags &= ~(HAMMER2_IOCB_WAKEUP | HAMMER2_IOCB_INPROG);
430 		nflags |= HAMMER2_IOCB_DONE;
431 
432 		if (atomic_cmpset_int(&iocb->flags, oflags, nflags)) {
433 			if (oflags & HAMMER2_IOCB_WAKEUP)
434 				wakeup(iocb);
435 			/* SMP: iocb is now stale */
436 			break;
437 		}
438 		/* retry */
439 	}
440 	iocb = NULL;
441 
442 }
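
/*
 * Worked scenario (illustrative): thread A wins the DIO_INPROG cmpset in
 * hammer2_io_getblk() and issues the read; thread B arrives while INPROG
 * is still set, sets DIO_WAITING and queues its iocb.  A's completion
 * lands in hammer2_io_complete() above, dequeues B's iocb and chains to
 * B's callback with DIO_INPROG still held.  The last completion in the
 * chain finds the queue empty, clears DIO_WAITING/DIO_INPROG, and only
 * then are the chained iocbs marked IOCB_DONE as the calls unwind.
 */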
443 
444 /*
445  * Wait for an iocb's I/O to finish.
446  */
447 void
448 hammer2_iocb_wait(hammer2_iocb_t *iocb)
449 {
450 	uint32_t oflags;
451 	uint32_t nflags;
452 
453 	for (;;) {
454 		oflags = iocb->flags;
455 		cpu_ccfence();
456 		nflags = oflags | HAMMER2_IOCB_WAKEUP;
457 		if (oflags & HAMMER2_IOCB_DONE)
458 			break;
459 		tsleep_interlock(iocb, 0);
460 		if (atomic_cmpset_int(&iocb->flags, oflags, nflags)) {
461 			tsleep(iocb, PINTERLOCKED, "h2iocb", hz);
462 		}
463 	}
464 
465 }
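
/*
 * Note (illustrative): this is the standard interlocked-sleep pattern.
 * IOCB_WAKEUP is only set via cmpset after tsleep_interlock(), so a
 * completion that slips in between the IOCB_DONE test and the tsleep()
 * still delivers the wakeup instead of losing it.
 */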
466 
467 /*
468  * Release our ref on *diop.
469  *
470  * On the last ref we must atomically clear DIO_GOOD and set DIO_INPROG,
471  * then dispose of the underlying buffer.
472  */
473 void
474 hammer2_io_putblk(hammer2_io_t **diop)
475 {
476 	hammer2_dev_t *hmp;
477 	hammer2_io_t *dio;
478 	hammer2_iocb_t iocb;
479 	struct buf *bp;
480 	off_t peof;
481 	off_t pbase;
482 	int psize;
483 	int orefs;
484 	int nrefs;
485 
486 	dio = *diop;
487 	*diop = NULL;
488 	hmp = dio->hmp;
489 
490 	/*
491 	 * Drop refs.
492 	 *
493 	 * On the 1->0 transition clear flags and set INPROG.
494 	 *
495 	 * On the 1->0 transition if INPROG is already set, another thread
496 	 * is in lastdrop and we can just return after the transition.
497 	 *
498 	 * On any other transition we can generally just return.
499 	 */
500 	for (;;) {
501 		orefs = dio->refs;
502 		cpu_ccfence();
503 		nrefs = orefs - 1;
504 
505 		if ((orefs & HAMMER2_DIO_MASK) == 1 &&
506 		    (orefs & HAMMER2_DIO_INPROG) == 0) {
507 			/*
508 			 * Lastdrop case, INPROG not yet set (we set it below).
509 			 */
510 			nrefs &= ~(HAMMER2_DIO_GOOD | HAMMER2_DIO_DIRTY);
511 			nrefs |= HAMMER2_DIO_INPROG;
512 			if (atomic_cmpset_int(&dio->refs, orefs, nrefs))
513 				break;
514 		} else if ((orefs & HAMMER2_DIO_MASK) == 1) {
515 			/*
516 			 * Lastdrop case, INPROG already set.
517 			 */
518 			if (atomic_cmpset_int(&dio->refs, orefs, nrefs)) {
519 				atomic_add_int(&hmp->iofree_count, 1);
520 				return;
521 			}
522 		} else {
523 			/*
524 			 * Normal drop case.
525 			 */
526 			if (atomic_cmpset_int(&dio->refs, orefs, nrefs))
527 				return;
528 		}
529 		cpu_pause();
530 		/* retry */
531 	}
532 
533 	/*
534 	 * Lastdrop (1->0 transition).  INPROG has been set, GOOD and DIRTY
535 	 * have been cleared.
536 	 *
537 	 * We can now dispose of the buffer, and should do it before calling
538 	 * io_complete() in case there's a race against a new reference
539 	 * which causes io_complete() to chain and instantiate the bp again.
540 	 */
541 	pbase = dio->pbase;
542 	psize = dio->psize;
543 	bp = dio->bp;
544 	dio->bp = NULL;
545 
546 	if (orefs & HAMMER2_DIO_GOOD) {
547 		KKASSERT(bp != NULL);
548 		if (orefs & HAMMER2_DIO_DIRTY) {
549 			int hce;
550 
551 			if ((hce = hammer2_cluster_enable) > 0) {
552 				peof = (pbase + HAMMER2_SEGMASK64) &
553 				       ~HAMMER2_SEGMASK64;
554 				cluster_write(bp, peof, psize, hce);
555 			} else {
556 				bp->b_flags |= B_CLUSTEROK;
557 				bdwrite(bp);
558 			}
559 		} else if (bp->b_flags & (B_ERROR | B_INVAL | B_RELBUF)) {
560 			brelse(bp);
561 		} else {
562 			bqrelse(bp);
563 		}
564 	} else if (bp) {
565 		if (orefs & HAMMER2_DIO_DIRTY) {
566 			bdwrite(bp);
567 		} else {
568 			brelse(bp);
569 		}
570 	}
571 
572 	/*
573 	 * The instant we call io_complete the dio is a free agent again and
574 	 * can be ripped out from under us.
575 	 *
576 	 * We can clean up our final DIO_INPROG by simulating an iocb
577 	 * completion.
578 	 */
579 	hmp = dio->hmp;				/* extract fields */
580 	atomic_add_int(&hmp->iofree_count, 1);
581 	cpu_ccfence();
582 
583 	iocb.dio = dio;
584 	iocb.flags = HAMMER2_IOCB_INPROG;
585 	hammer2_io_complete(&iocb);
586 	dio = NULL;				/* dio stale */
587 
588 	/*
589 	 * We cache free buffers so re-use cases can use a shared lock, but
590 	 * if too many build up we have to clean them out.
591 	 */
592 	if (hmp->iofree_count > 65536) {
593 		struct hammer2_cleanupcb_info info;
594 
595 		RB_INIT(&info.tmptree);
596 		hammer2_spin_ex(&hmp->io_spin);
597 		if (hmp->iofree_count > 65536) {
598 			info.count = hmp->iofree_count / 4;
599 			RB_SCAN(hammer2_io_tree, &hmp->iotree, NULL,
600 				hammer2_io_cleanup_callback, &info);
601 		}
602 		hammer2_spin_unex(&hmp->io_spin);
603 		hammer2_io_cleanup(hmp, &info.tmptree);
604 	}
605 }
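
/*
 * Note (illustrative): dio->refs packs a reference count in
 * HAMMER2_DIO_MASK together with the DIO_GOOD, DIO_INPROG, DIO_WAITING
 * and DIO_DIRTY state bits, which is why all of the transitions above
 * are done with atomic_cmpset_int() loops.  For example, a dio with two
 * holders and a good, dirty buffer satisfies:
 *
 *	(refs & HAMMER2_DIO_MASK) == 2
 *	(refs & (HAMMER2_DIO_GOOD | HAMMER2_DIO_DIRTY)) ==
 *	    (HAMMER2_DIO_GOOD | HAMMER2_DIO_DIRTY)
 */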
606 
607 /*
608  * Clean up any dios with (INPROG | refs) == 0.
609  *
610  * Called to clean up cached DIOs on umount after all activity has been
611  * flushed.
612  */
613 static
614 int
615 hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg)
616 {
617 	struct hammer2_cleanupcb_info *info = arg;
618 	hammer2_io_t *xio;
619 
620 	if ((dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0) {
621 		if (dio->act > 0) {
622 			--dio->act;
623 			return 0;
624 		}
625 		KKASSERT(dio->bp == NULL);
626 		RB_REMOVE(hammer2_io_tree, &dio->hmp->iotree, dio);
627 		xio = RB_INSERT(hammer2_io_tree, &info->tmptree, dio);
628 		KKASSERT(xio == NULL);
629 		if (--info->count <= 0)	/* limit scan */
630 			return(-1);
631 	}
632 	return 0;
633 }
634 
635 void
636 hammer2_io_cleanup(hammer2_dev_t *hmp, struct hammer2_io_tree *tree)
637 {
638 	hammer2_io_t *dio;
639 
640 	while ((dio = RB_ROOT(tree)) != NULL) {
641 		RB_REMOVE(hammer2_io_tree, tree, dio);
642 		KKASSERT(dio->bp == NULL &&
643 		    (dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0);
644 		kfree(dio, M_HAMMER2);
645 		atomic_add_int(&hammer2_dio_count, -1);
646 		atomic_add_int(&hmp->iofree_count, -1);
647 	}
648 }
649 
650 /*
651  * Returns a pointer to the requested data.
652  */
653 char *
654 hammer2_io_data(hammer2_io_t *dio, off_t lbase)
655 {
656 	struct buf *bp;
657 	int off;
658 
659 	bp = dio->bp;
660 	KKASSERT(bp != NULL);
661 	off = (lbase & ~HAMMER2_OFF_MASK_RADIX) - bp->b_loffset;
662 	KKASSERT(off >= 0 && off < bp->b_bufsize);
663 	return(bp->b_data + off);
664 }
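
/*
 * Worked example (illustrative): with a 64KB physical buffer whose
 * bp->b_loffset equals dio->pbase, a chain whose lbase lands 16KB into
 * that buffer resolves to bp->b_data + 16384.  The low bits of lbase
 * carry the size radix and are masked off first.
 */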
665 
666 /*
667  * Keep track of good CRCs in dio->crc_good_mask.  XXX this should be done
668  * in the chain structure, but the chain structure would need to persist
669  * at refs=0 as well and it doesn't.
670  */
671 int
672 hammer2_io_crc_good(hammer2_chain_t *chain, uint64_t *maskp)
673 {
674 	hammer2_io_t *dio;
675 	uint64_t mask;
676 
677 	if ((dio = chain->dio) != NULL && chain->bytes >= 1024) {
678 		mask = hammer2_io_mask(dio, chain->bref.data_off, chain->bytes);
679 		*maskp = mask;
680 		if ((dio->crc_good_mask & mask) == mask)
681 			return 1;
682 		return 0;
683 	}
684 	*maskp = 0;
685 
686 	return 0;
687 }
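
/*
 * Usage sketch (illustrative; the caller shown is hypothetical): a
 * reader can skip re-verifying a check code when the granules covering
 * the chain are already marked good, and records a verified range
 * afterwards:
 *
 *	uint64_t mask;
 *
 *	if (hammer2_io_crc_good(chain, &mask) == 0) {
 *		...verify the chain's check code...
 *		hammer2_io_crc_setmask(chain->dio, mask);
 *	}
 */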
688 
689 void
690 hammer2_io_crc_setmask(hammer2_io_t *dio, uint64_t mask)
691 {
692 	if (dio) {
693 		if (sizeof(long) == 8) {
694 			atomic_set_long(&dio->crc_good_mask, mask);
695 		} else {
696 #if _BYTE_ORDER == _LITTLE_ENDIAN
697 			atomic_set_int(&((int *)&dio->crc_good_mask)[0],
698 					(uint32_t)mask);
699 			atomic_set_int(&((int *)&dio->crc_good_mask)[1],
700 					(uint32_t)(mask >> 32));
701 #else
702 			atomic_set_int(&((int *)&dio->crc_good_mask)[0],
703 					(uint32_t)(mask >> 32));
704 			atomic_set_int(&((int *)&dio->crc_good_mask)[1],
705 					(uint32_t)mask);
706 #endif
707 		}
708 	}
709 }
710 
711 void
712 hammer2_io_crc_clrmask(hammer2_io_t *dio, uint64_t mask)
713 {
714 	if (dio) {
715 		if (sizeof(long) == 8) {
716 			atomic_clear_long(&dio->crc_good_mask, mask);
717 		} else {
718 #if _BYTE_ORDER == _LITTLE_ENDIAN
719 			atomic_clear_int(&((int *)&dio->crc_good_mask)[0],
720 					(uint32_t)mask);
721 			atomic_clear_int(&((int *)&dio->crc_good_mask)[1],
722 					(uint32_t)(mask >> 32));
723 #else
724 			atomic_clear_int(&((int *)&dio->crc_good_mask)[0],
725 					(uint32_t)(mask >> 32));
726 			atomic_clear_int(&((int *)&dio->crc_good_mask)[1],
727 					(uint32_t)mask);
728 #endif
729 		}
730 	}
731 }
732 
733 /*
734  * Helpers for hammer2_io_new*() functions
735  */
736 static
737 void
738 hammer2_iocb_new_callback(hammer2_iocb_t *iocb)
739 {
740 	hammer2_io_t *dio = iocb->dio;
741 	int gbctl = (iocb->flags & HAMMER2_IOCB_QUICK) ? GETBLK_NOWAIT : 0;
742 
743 	/*
744 	 * If IOCB_INPROG is not set, the dio already has a good buffer and we
745 	 * can't mess with it other than to zero the requested range.
746 	 *
747 	 * If IOCB_INPROG is set, we also own DIO_INPROG at this time and can
748 	 * do what needs to be done with dio->bp.
749 	 */
750 	if (iocb->flags & HAMMER2_IOCB_INPROG) {
751 		if ((iocb->flags & HAMMER2_IOCB_READ) == 0) {
752 			if (iocb->lsize == dio->psize) {
753 				/*
754 				 * Fully covered buffer, try to optimize to
755 				 * avoid any I/O.  We might already have the
756 				 * buffer due to iocb chaining.
757 				 */
758 				if (dio->bp == NULL) {
759 					dio->bp = getblk(dio->hmp->devvp,
760 							 dio->pbase, dio->psize,
761 							 gbctl, 0);
762 				}
763 				if (dio->bp) {
764 					vfs_bio_clrbuf(dio->bp);
765 					dio->bp->b_flags |= B_CACHE;
766 				}
767 			} else if (iocb->flags & HAMMER2_IOCB_QUICK) {
768 				/*
769 				 * Partial buffer, quick mode.  Do nothing.
770 				 * Do not instantiate the buffer or try to
771 				 * mark it B_CACHE because other portions of
772 				 * the buffer might have to be read by other
773 				 * accessors.
774 				 */
775 			} else if (dio->bp == NULL ||
776 				   (dio->bp->b_flags & B_CACHE) == 0) {
777 				/*
778 				 * Partial buffer, normal mode, requires
779 				 * read-before-write.  Chain the read.
780 				 *
781 				 * We might already have the buffer due to
782 				 * iocb chaining.  XXX unclear if we really
783 				 * need to write/release it and reacquire
784 				 * in that case.
785 				 *
786 				 * QUEUE ASYNC I/O, IOCB IS NOT YET COMPLETE.
787 				 */
788 				if (dio->bp) {
789 					if (dio->refs & HAMMER2_DIO_DIRTY)
790 						bdwrite(dio->bp);
791 					else
792 						bqrelse(dio->bp);
793 					dio->bp = NULL;
794 				}
795 				atomic_set_int(&iocb->flags, HAMMER2_IOCB_READ);
796 				breadcb(dio->hmp->devvp,
797 					dio->pbase, dio->psize,
798 					hammer2_io_callback, iocb);
799 				return;
800 			} /* else buffer is good */
801 		} /* else callback from breadcb is complete */
802 	}
803 	if (dio->bp) {
804 		if (iocb->flags & HAMMER2_IOCB_ZERO)
805 			bzero(hammer2_io_data(dio, iocb->lbase), iocb->lsize);
806 		atomic_set_int(&dio->refs, HAMMER2_DIO_DIRTY);
807 	}
808 	hammer2_io_complete(iocb);
809 }
810 
811 static
812 int
813 _hammer2_io_new(hammer2_dev_t *hmp, off_t lbase, int lsize,
814 	        hammer2_io_t **diop, int flags)
815 {
816 	hammer2_iocb_t iocb;
817 	hammer2_io_t *dio;
818 
819 	iocb.callback = hammer2_iocb_new_callback;
820 	iocb.cluster = NULL;
821 	iocb.chain = NULL;
822 	iocb.ptr = NULL;
823 	iocb.lbase = lbase;
824 	iocb.lsize = lsize;
825 	iocb.flags = flags;
826 	iocb.error = 0;
827 	hammer2_io_getblk(hmp, lbase, lsize, &iocb);
828 	if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
829 		hammer2_iocb_wait(&iocb);
830 	dio = *diop = iocb.dio;
831 
832 	return (iocb.error);
833 }
834 
835 int
836 hammer2_io_new(hammer2_dev_t *hmp, off_t lbase, int lsize,
837 	       hammer2_io_t **diop)
838 {
839 	return(_hammer2_io_new(hmp, lbase, lsize, diop, HAMMER2_IOCB_ZERO));
840 }
841 
842 int
843 hammer2_io_newnz(hammer2_dev_t *hmp, off_t lbase, int lsize,
844 	       hammer2_io_t **diop)
845 {
846 	return(_hammer2_io_new(hmp, lbase, lsize, diop, 0));
847 }
848 
849 int
850 hammer2_io_newq(hammer2_dev_t *hmp, off_t lbase, int lsize,
851 	       hammer2_io_t **diop)
852 {
853 	return(_hammer2_io_new(hmp, lbase, lsize, diop, HAMMER2_IOCB_QUICK));
854 }
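
/*
 * Usage sketch (illustrative; src is a hypothetical source buffer):
 * hammer2_io_new() hands back a dio whose requested range has been
 * zeroed and marked dirty, so a writer of a freshly allocated block can
 * simply fill it in and release:
 *
 *	error = hammer2_io_new(hmp, lbase, lsize, &dio);
 *	if (error == 0) {
 *		bcopy(src, hammer2_io_data(dio, lbase), lsize);
 *		hammer2_io_bdwrite(&dio);
 *	}
 *
 * hammer2_io_newnz() skips the zeroing and hammer2_io_newq() is the
 * quick (HAMMER2_IOCB_QUICK) variant that avoids read-before-write
 * handling for partially covered buffers.
 */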
855 
856 static
857 void
858 hammer2_iocb_bread_callback(hammer2_iocb_t *iocb)
859 {
860 	hammer2_io_t *dio = iocb->dio;
861 	off_t peof;
862 	int error;
863 
864 	/*
865 	 * If IOCB_INPROG is not set, the dio already has a good buffer and we
866 	 * can't mess with it other than to zero the requested range.
867 	 *
868 	 * If IOCB_INPROG is set, we also own DIO_INPROG at this time and can
869 	 * do what needs to be done with dio->bp.
870 	 */
871 	if (iocb->flags & HAMMER2_IOCB_INPROG) {
872 		int hce;
873 
874 		if (dio->bp && (dio->bp->b_flags & B_CACHE)) {
875 			/*
876 			 * Already good, likely due to being chained from
877 			 * another iocb.
878 			 */
879 			error = 0;
880 		} else if ((hce = hammer2_cluster_enable) > 0) {
881 			/*
882 			 * Synchronous cluster I/O for now.
883 			 */
884 			if (dio->bp) {
885 				bqrelse(dio->bp);
886 				dio->bp = NULL;
887 			}
888 			peof = (dio->pbase + HAMMER2_SEGMASK64) &
889 			       ~HAMMER2_SEGMASK64;
890 			error = cluster_read(dio->hmp->devvp, peof, dio->pbase,
891 					     dio->psize,
892 					     dio->psize, HAMMER2_PBUFSIZE*hce,
893 					     &dio->bp);
894 		} else {
895 			/*
896 			 * Synchronous I/O for now.
897 			 */
898 			if (dio->bp) {
899 				bqrelse(dio->bp);
900 				dio->bp = NULL;
901 			}
902 			error = bread(dio->hmp->devvp, dio->pbase,
903 				      dio->psize, &dio->bp);
904 		}
905 		if (error) {
906 			brelse(dio->bp);
907 			dio->bp = NULL;
908 		}
909 	}
910 	hammer2_io_complete(iocb);
911 }
912 
913 int
914 hammer2_io_bread(hammer2_dev_t *hmp, off_t lbase, int lsize,
915 		hammer2_io_t **diop)
916 {
917 	hammer2_iocb_t iocb;
918 	hammer2_io_t *dio;
919 
920 	iocb.callback = hammer2_iocb_bread_callback;
921 	iocb.cluster = NULL;
922 	iocb.chain = NULL;
923 	iocb.ptr = NULL;
924 	iocb.lbase = lbase;
925 	iocb.lsize = lsize;
926 	iocb.flags = 0;
927 	iocb.error = 0;
928 	hammer2_io_getblk(hmp, lbase, lsize, &iocb);
929 	if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
930 		hammer2_iocb_wait(&iocb);
931 	dio = *diop = iocb.dio;
932 
933 	return (iocb.error);
934 }
935 
936 /*
937  * System buf/bio async callback extracts the iocb and chains
938  * to the iocb callback.
939  */
940 void
941 hammer2_io_callback(struct bio *bio)
942 {
943 	struct buf *dbp = bio->bio_buf;
944 	hammer2_iocb_t *iocb = bio->bio_caller_info1.ptr;
945 	hammer2_io_t *dio;
946 
947 	dio = iocb->dio;
948 	if ((bio->bio_flags & BIO_DONE) == 0)
949 		bpdone(dbp, 0);
950 	bio->bio_flags &= ~(BIO_DONE | BIO_SYNC);
951 	dio->bp = bio->bio_buf;
952 	iocb->callback(iocb);
953 }
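
/*
 * Path note (illustrative): this is the biodone-time hook for the async
 * read queued by hammer2_iocb_new_callback() via breadcb(), which passes
 * the iocb as the callback argument (recovered here from
 * bio_caller_info1.ptr).  From here control chains back into the iocb
 * callback and ultimately into hammer2_io_complete().
 */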
954 
955 void
956 hammer2_io_bawrite(hammer2_io_t **diop)
957 {
958 	atomic_set_int(&(*diop)->refs, HAMMER2_DIO_DIRTY);
959 	hammer2_io_putblk(diop);
960 }
961 
962 void
963 hammer2_io_bdwrite(hammer2_io_t **diop)
964 {
965 	atomic_set_int(&(*diop)->refs, HAMMER2_DIO_DIRTY);
966 	hammer2_io_putblk(diop);
967 }
968 
969 int
970 hammer2_io_bwrite(hammer2_io_t **diop)
971 {
972 	atomic_set_int(&(*diop)->refs, HAMMER2_DIO_DIRTY);
973 	hammer2_io_putblk(diop);
974 	return (0);	/* XXX */
975 }
976 
977 void
978 hammer2_io_setdirty(hammer2_io_t *dio)
979 {
980 	atomic_set_int(&dio->refs, HAMMER2_DIO_DIRTY);
981 }
982 
983 void
984 hammer2_io_setinval(hammer2_io_t *dio, hammer2_off_t off, u_int bytes)
985 {
986 	uint64_t mask = hammer2_io_mask(dio, off, bytes);
987 
988 	hammer2_io_crc_clrmask(dio, mask);
989 	if ((u_int)dio->psize == bytes)
990 		dio->bp->b_flags |= B_INVAL | B_RELBUF;
991 }
992 
993 void
994 hammer2_io_brelse(hammer2_io_t **diop)
995 {
996 	hammer2_io_putblk(diop);
997 }
998 
999 void
1000 hammer2_io_bqrelse(hammer2_io_t **diop)
1001 {
1002 	hammer2_io_putblk(diop);
1003 }
1004 
1005 int
1006 hammer2_io_isdirty(hammer2_io_t *dio)
1007 {
1008 	return((dio->refs & HAMMER2_DIO_DIRTY) != 0);
1009 }
1010