xref: /dragonfly/sys/vfs/hammer2/hammer2_io.c (revision 38b5d46c)
/*
 * Copyright (c) 2013-2014 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "hammer2.h"

/*
 * Implements an abstraction layer for synchronous and asynchronous
 * buffered device I/O.  It can be used for OS abstraction, but its main
 * purpose is to allow larger buffers to back hammer2_chain structures
 * that use smaller allocations, without causing deadlocks.
 */
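
/*
 * Example (sketch): a minimal synchronous read through this layer.
 * hammer2_io_bread() resolves the dio and its backing device buffer,
 * hammer2_io_data() returns a pointer into it, and hammer2_io_bqrelse()
 * drops the ref.  "my_read_block" is a hypothetical caller used only
 * for illustration; error handling is reduced to the essentials.
 */
#if 0
static int
my_read_block(hammer2_dev_t *hmp, off_t lbase, int lsize, void *copy)
{
	hammer2_io_t *dio;
	int error;

	error = hammer2_io_bread(hmp, HAMMER2_BREF_TYPE_DATA,
				 lbase, lsize, &dio);
	if (error) {
		if (dio)
			hammer2_io_brelse(&dio);
		return error;
	}
	bcopy(hammer2_io_data(dio, lbase), copy, lsize);
	hammer2_io_bqrelse(&dio);
	return 0;
}
#endif
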
static int hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg);
static void dio_write_stats_update(hammer2_io_t *dio);

static int
hammer2_io_cmp(hammer2_io_t *io1, hammer2_io_t *io2)
{
	if (io1->pbase < io2->pbase)
		return(-1);
	if (io1->pbase > io2->pbase)
		return(1);
	return(0);
}

RB_PROTOTYPE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp, off_t);
RB_GENERATE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp,
		off_t, pbase);

struct hammer2_cleanupcb_info {
	struct hammer2_io_tree tmptree;
	int	count;
};

static __inline
uint64_t
hammer2_io_mask(hammer2_io_t *dio, hammer2_off_t off, u_int bytes)
{
	uint64_t mask;
	int i;

	if (bytes < 1024)	/* smaller chunks not supported */
		return 0;

	/*
	 * Calculate crc check mask for larger chunks
	 */
	i = (((off & ~HAMMER2_OFF_MASK_RADIX) - dio->pbase) &
	     HAMMER2_PBUFMASK) >> 10;
	if (i == 0 && bytes == HAMMER2_PBUFSIZE)
		return((uint64_t)-1);
	mask = ((uint64_t)1U << (bytes >> 10)) - 1;
	mask <<= i;

	return mask;
}
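
/*
 * Worked example (sketch): each bit in the returned mask covers one 1KB
 * sub-chunk of the dio, with the usual 64KB HAMMER2_PBUFSIZE giving 64
 * bits total.  A 16KB chunk starting 32KB into the dio yields
 * i = 32768 >> 10 = 32 and mask = ((1 << 16) - 1) << 32, i.e. bits
 * 32..47 set.  A full 64KB chunk at offset 0 returns all-ones.
 */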

#define HAMMER2_GETBLK_GOOD	0
#define HAMMER2_GETBLK_QUEUED	1
#define HAMMER2_GETBLK_OWNED	2

/*
 * Allocate/Locate the requested dio, reference it, issue or queue iocb.
 */
void
hammer2_io_getblk(hammer2_dev_t *hmp, off_t lbase, int lsize,
		  hammer2_iocb_t *iocb)
{
	hammer2_io_t *dio;
	hammer2_io_t *xio;
	off_t pbase;
	off_t pmask;
	/*
	 * XXX after free, buffer reuse case w/ different size can clash
	 * with the dio cache.  Let's avoid it for now.  Ultimately we need
	 * to invalidate the dio cache when freeing blocks to allow a mix
	 * of 16KB and 64KB block sizes.
	 */
	/*int psize = hammer2_devblksize(lsize);*/
	int psize = HAMMER2_PBUFSIZE;
	uint64_t refs;

	pmask = ~(hammer2_off_t)(psize - 1);

	KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize);
	lbase &= ~HAMMER2_OFF_MASK_RADIX;
	pbase = lbase & pmask;
	if (pbase == 0 || ((lbase + lsize - 1) & pmask) != pbase) {
		kprintf("Illegal: %016jx %016jx+%08x / %016jx\n",
			pbase, lbase, lsize, pmask);
	}
	KKASSERT(pbase != 0 && ((lbase + lsize - 1) & pmask) == pbase);

	/*
	 * Access/Allocate the DIO, bump dio->refs to prevent destruction.
	 */
	hammer2_spin_sh(&hmp->io_spin);
	dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, pbase);
	if (dio) {
		if ((atomic_fetchadd_64(&dio->refs, 1) &
		     HAMMER2_DIO_MASK) == 0) {
			atomic_add_int(&dio->hmp->iofree_count, -1);
		}
		hammer2_spin_unsh(&hmp->io_spin);
	} else {
		hammer2_spin_unsh(&hmp->io_spin);
		dio = kmalloc(sizeof(*dio), M_HAMMER2, M_INTWAIT | M_ZERO);
		dio->hmp = hmp;
		dio->pbase = pbase;
		dio->psize = psize;
		dio->btype = iocb->btype;
		dio->refs = 1;
		hammer2_spin_init(&dio->spin, "h2dio");
		TAILQ_INIT(&dio->iocbq);
		hammer2_spin_ex(&hmp->io_spin);
		xio = RB_INSERT(hammer2_io_tree, &hmp->iotree, dio);
		if (xio == NULL) {
			atomic_add_int(&hammer2_dio_count, 1);
			hammer2_spin_unex(&hmp->io_spin);
		} else {
			if ((atomic_fetchadd_64(&xio->refs, 1) &
			     HAMMER2_DIO_MASK) == 0) {
				atomic_add_int(&xio->hmp->iofree_count, -1);
			}
			hammer2_spin_unex(&hmp->io_spin);
			kfree(dio, M_HAMMER2);
			dio = xio;
		}
	}

	/*
	 * Obtain/Validate the buffer.
	 */
	iocb->dio = dio;

	if (dio->act < 5)	/* SMP race ok */
		++dio->act;

	for (;;) {
		refs = dio->refs;
		cpu_ccfence();

		/*
		 * Issue the iocb immediately if the buffer is already good.
		 * Once set GOOD cannot be cleared until refs drops to 0.
		 *
		 * lfence required because dio's are not interlocked for
		 * the DIO_GOOD test.
		 */
		if (refs & HAMMER2_DIO_GOOD) {
			cpu_lfence();
			iocb->callback(iocb);
			break;
		}

		/*
		 * Try to own the DIO by setting INPROG so we can issue
		 * I/O on it.
		 */
		if (refs & HAMMER2_DIO_INPROG) {
			/*
			 * If DIO_INPROG is already set then set WAITING and
			 * queue the iocb.
			 */
			hammer2_spin_ex(&dio->spin);
			if (atomic_cmpset_64(&dio->refs, refs,
					      refs | HAMMER2_DIO_WAITING)) {
				iocb->flags |= HAMMER2_IOCB_ONQ |
					       HAMMER2_IOCB_INPROG;
				TAILQ_INSERT_TAIL(&dio->iocbq, iocb, entry);
				hammer2_spin_unex(&dio->spin);
				break;
			}
			hammer2_spin_unex(&dio->spin);
			/* retry */
		} else {
			/*
			 * If DIO_INPROG is not set then set it and issue the
			 * callback immediately to start I/O.
			 */
			if (atomic_cmpset_64(&dio->refs, refs,
					      refs | HAMMER2_DIO_INPROG)) {
				iocb->flags |= HAMMER2_IOCB_INPROG;
				iocb->callback(iocb);
				break;
			}
			/* retry */
		}
		/* retry */
	}
}
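
/*
 * Example (sketch): asynchronous use of hammer2_io_getblk() with a
 * private iocb callback, modeled on _hammer2_io_new() below.  The
 * callback runs once the dio is GOOD, or once this thread wins
 * DIO_INPROG, and must finish via hammer2_io_complete().
 * "my_getblk_callback" and "my_getblk" are hypothetical.
 */
#if 0
static void
my_getblk_callback(hammer2_iocb_t *iocb)
{
	/* iocb->dio is referenced here; instantiate/validate iocb->dio->bp */
	hammer2_io_complete(iocb);	/* clears DIO_INPROG, chains waiters */
}

static void
my_getblk(hammer2_dev_t *hmp, off_t lbase, int lsize)
{
	hammer2_iocb_t iocb;

	bzero(&iocb, sizeof(iocb));
	iocb.callback = my_getblk_callback;
	iocb.lbase = lbase;
	iocb.lsize = lsize;
	hammer2_io_getblk(hmp, lbase, lsize, &iocb);
	if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
		hammer2_iocb_wait(&iocb);
	hammer2_io_putblk(&iocb.dio);	/* drop the ref getblk took */
}
#endif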

/*
 * Quickly obtain a good DIO buffer, return NULL if the system no longer
 * caches the data.
 */
hammer2_io_t *
hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase, int lsize)
{
	hammer2_iocb_t iocb;
	hammer2_io_t *dio;
	struct buf *bp;
	off_t pbase;
	off_t pmask;
	int psize = HAMMER2_PBUFSIZE;
	uint64_t orefs;
	uint64_t nrefs;

	pmask = ~(hammer2_off_t)(psize - 1);

	KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize);
	lbase &= ~HAMMER2_OFF_MASK_RADIX;
	pbase = lbase & pmask;
	if (pbase == 0 || ((lbase + lsize - 1) & pmask) != pbase) {
		kprintf("Illegal: %016jx %016jx+%08x / %016jx\n",
			pbase, lbase, lsize, pmask);
	}
	KKASSERT(pbase != 0 && ((lbase + lsize - 1) & pmask) == pbase);

	/*
	 * Access/Allocate the DIO, bump dio->refs to prevent destruction.
	 */
	hammer2_spin_sh(&hmp->io_spin);
	dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, pbase);
	if (dio == NULL) {
		hammer2_spin_unsh(&hmp->io_spin);
		return NULL;
	}

	if ((atomic_fetchadd_64(&dio->refs, 1) & HAMMER2_DIO_MASK) == 0)
		atomic_add_int(&dio->hmp->iofree_count, -1);
	hammer2_spin_unsh(&hmp->io_spin);

	if (dio->act < 5)	/* SMP race ok */
		++dio->act;

	/*
	 * Obtain/validate the buffer.  Do NOT issue I/O.  Discard if
	 * the system does not have the data already cached.
	 */
	nrefs = (uint64_t)-1;
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();

		/*
		 * The buffer can be used immediately if it is already good.
		 * Once set GOOD cannot be cleared until refs drops to 0.
		 *
		 * lfence required because dio is not interlocked for
		 * the DIO_GOOD test.
		 */
		if (orefs & HAMMER2_DIO_GOOD) {
			cpu_lfence();
			break;
		}

		/*
		 * Try to own the DIO by setting INPROG so we can issue
		 * I/O on it.  INPROG might already be set, in which case
		 * there is no way we can do this non-blocking so we punt.
		 */
		if ((orefs & HAMMER2_DIO_INPROG))
			break;
		nrefs = orefs | HAMMER2_DIO_INPROG;
		if (atomic_cmpset_64(&dio->refs, orefs, nrefs) == 0)
			continue;

		/*
		 * We own DIO_INPROG, try to set DIO_GOOD.
		 *
		 * For now do not use GETBLK_NOWAIT.
		 */
		bp = dio->bp;
		dio->bp = NULL;
		if (bp == NULL) {
#if 0
			bp = getblk(hmp->devvp, dio->pbase, dio->psize, 0, 0);
#endif
			bread(hmp->devvp, dio->pbase, dio->psize, &bp);
		}

		/*
		 * System buffer must also have remained cached.
		 */
		if (bp) {
			if ((bp->b_flags & B_ERROR) == 0 &&
			    (bp->b_flags & B_CACHE)) {
				dio->bp = bp;	/* assign BEFORE setting flag */
				atomic_set_64(&dio->refs, HAMMER2_DIO_GOOD);
			} else {
				bqrelse(bp);
				bp = NULL;
			}
		}

		/*
		 * Clear DIO_INPROG.
		 *
		 * This is actually a bit complicated, see
		 * hammer2_io_complete() for more information.
		 */
		iocb.dio = dio;
		iocb.flags = HAMMER2_IOCB_INPROG;
		hammer2_io_complete(&iocb);
		break;
	}

	/*
	 * Only return the dio if its buffer is good.  If the buffer is not
	 * good, be sure to clear INVALOK, meaning that invalidation is no
	 * longer acceptable.
	 */
	if ((dio->refs & HAMMER2_DIO_GOOD) == 0) {
		hammer2_io_putblk(&dio);
	}
	return dio;
}
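
/*
 * Example (sketch): opportunistic lookup via hammer2_io_getquick().  A
 * NULL return means the data is no longer cached and the caller must
 * fall back to hammer2_io_bread().  "my_peek_block" is hypothetical.
 */
#if 0
static char *
my_peek_block(hammer2_dev_t *hmp, off_t lbase, int lsize,
	      hammer2_io_t **diop)
{
	*diop = hammer2_io_getquick(hmp, lbase, lsize);
	if (*diop == NULL)
		return NULL;	/* not cached, caller must bread */
	return hammer2_io_data(*diop, lbase);	/* caller putblks *diop */
}
#endif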

/*
 * Make sure that INVALOK is cleared on the dio associated with the specified
 * data offset.  Called from bulkfree when a block becomes reusable.
 */
void
hammer2_io_resetinval(hammer2_dev_t *hmp, off_t data_off)
{
	hammer2_io_t *dio;

	data_off &= ~HAMMER2_PBUFMASK64;
	hammer2_spin_sh(&hmp->io_spin);
	dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, data_off);
	if (dio)
		atomic_clear_64(&dio->refs, HAMMER2_DIO_INVALOK);
	hammer2_spin_unsh(&hmp->io_spin);
}

/*
 * The originator of the iocb is finished with it.
 */
void
hammer2_io_complete(hammer2_iocb_t *iocb)
{
	hammer2_io_t *dio = iocb->dio;
	hammer2_iocb_t *cbtmp;
	uint64_t orefs;
	uint64_t nrefs;
	uint32_t oflags;
	uint32_t nflags;

	/*
	 * If IOCB_INPROG was not set completion is synchronous due to the
	 * buffer already being good.  We can simply set IOCB_DONE and return.
	 *
	 * In this situation DIO_INPROG is not set and we have no visibility
	 * on dio->bp.  We should not try to mess with dio->bp because another
	 * thread may be finishing up its processing.  dio->bp should already
	 * be set to BUF_KERNPROC()!
	 */
	if ((iocb->flags & HAMMER2_IOCB_INPROG) == 0) {
		atomic_set_int(&iocb->flags, HAMMER2_IOCB_DONE);
		return;
	}

	/*
	 * The iocb was queued, obtained DIO_INPROG, and its callback was
	 * made.  The callback is now complete.  We still own DIO_INPROG.
	 *
	 * We can set DIO_GOOD if no error occurred, which gives certain
	 * stability guarantees to dio->bp and allows other accessors to
	 * short-cut access.  DIO_GOOD cannot be cleared until the last
	 * ref is dropped.
	 */
	KKASSERT(dio->refs & HAMMER2_DIO_INPROG);
	if (dio->bp) {
		BUF_KERNPROC(dio->bp);
		if ((dio->bp->b_flags & B_ERROR) == 0) {
			KKASSERT(dio->bp->b_flags & B_CACHE);
			atomic_set_64(&dio->refs, HAMMER2_DIO_GOOD);
		}
	}

	/*
	 * Clean up the dio before marking the iocb as being done.  If another
	 * iocb is pending we chain to it while leaving DIO_INPROG set (it
	 * will call io completion and presumably clear DIO_INPROG).
	 *
	 * Otherwise if no other iocbs are pending we clear DIO_INPROG before
	 * finishing up the cbio.  This means that DIO_INPROG is cleared at
	 * the end of the chain before ANY of the cbios are marked done.
	 *
	 * NOTE: The TAILQ is not stable until the spin-lock is held.
	 */
	for (;;) {
		orefs = dio->refs;
		nrefs = orefs & ~(HAMMER2_DIO_WAITING | HAMMER2_DIO_INPROG);

		if (orefs & HAMMER2_DIO_WAITING) {
			hammer2_spin_ex(&dio->spin);
			cbtmp = TAILQ_FIRST(&dio->iocbq);
			if (cbtmp) {
				/*
				 * NOTE: flags not adjusted in this case.
				 *	 Flags will be adjusted by the last
				 *	 iocb.
				 */
				TAILQ_REMOVE(&dio->iocbq, cbtmp, entry);
				hammer2_spin_unex(&dio->spin);
				cbtmp->callback(cbtmp);	/* chained */
				break;
			} else if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
				hammer2_spin_unex(&dio->spin);
				break;
			}
			hammer2_spin_unex(&dio->spin);
			/* retry */
		} else if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
			break;
		} /* else retry */
		/* retry */
	}

	/*
	 * Mark the iocb as done and wakeup any waiters.  This is done after
	 * all iocb chains have been called back and after DIO_INPROG has been
	 * cleared.  This avoids races against ref count drops by the waiting
	 * threads (a hard but not impossible SMP race) which might result in
	 * a 1->0 transition of the refs while DIO_INPROG is still set.
	 */
	for (;;) {
		oflags = iocb->flags;
		cpu_ccfence();
		nflags = oflags;
		nflags &= ~(HAMMER2_IOCB_WAKEUP | HAMMER2_IOCB_INPROG);
		nflags |= HAMMER2_IOCB_DONE;

		if (atomic_cmpset_int(&iocb->flags, oflags, nflags)) {
			if (oflags & HAMMER2_IOCB_WAKEUP)
				wakeup(iocb);
			/* SMP: iocb is now stale */
			break;
		}
		/* retry */
	}
	iocb = NULL;
}

/*
 * Wait for an iocb's I/O to finish.
 */
void
hammer2_iocb_wait(hammer2_iocb_t *iocb)
{
	uint32_t oflags;
	uint32_t nflags;

	for (;;) {
		oflags = iocb->flags;
		cpu_ccfence();
		nflags = oflags | HAMMER2_IOCB_WAKEUP;
		if (oflags & HAMMER2_IOCB_DONE)
			break;
		tsleep_interlock(iocb, 0);
		if (atomic_cmpset_int(&iocb->flags, oflags, nflags)) {
			tsleep(iocb, PINTERLOCKED, "h2iocb", hz);
		}
	}
}

/*
 * Release our ref on *diop.
 *
 * On the last ref we must atomically clear DIO_GOOD and set DIO_INPROG,
 * then dispose of the underlying buffer.
 */
void
hammer2_io_putblk(hammer2_io_t **diop)
{
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	hammer2_iocb_t iocb;
	struct buf *bp;
	off_t peof;
	off_t pbase;
	int psize;
	uint64_t orefs;
	uint64_t nrefs;

	dio = *diop;
	*diop = NULL;
	hmp = dio->hmp;

	while (dio->unused01) {
		tsleep(&dio->unused01, 0, "h2DEBUG", hz);
	}

	/*
	 * Drop refs.
	 *
	 * On the 1->0 transition clear flags and set INPROG.
	 *
	 * On the 1->0 transition if INPROG is already set, another thread
	 * is in lastdrop and we can just return after the transition.
	 *
	 * On any other transition we can generally just return.
	 */
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();
		nrefs = orefs - 1;

		if ((orefs & HAMMER2_DIO_MASK) == 1 &&
		    (orefs & HAMMER2_DIO_INPROG) == 0) {
			/*
			 * Lastdrop case, INPROG can be set.
			 */
			nrefs &= ~(HAMMER2_DIO_GOOD | HAMMER2_DIO_DIRTY);
			nrefs &= ~(HAMMER2_DIO_INVAL);
			nrefs |= HAMMER2_DIO_INPROG;
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs))
				break;
		} else if ((orefs & HAMMER2_DIO_MASK) == 1) {
			/*
			 * Lastdrop case, INPROG already set.
			 */
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
				atomic_add_int(&hmp->iofree_count, 1);
				return;
			}
		} else {
			/*
			 * Normal drop case.
			 */
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs))
				return;
		}
		cpu_pause();
		/* retry */
	}

	/*
	 * Lastdrop (1->0 transition).  INPROG has been set, GOOD and DIRTY
	 * have been cleared.
	 *
	 * We can now dispose of the buffer, and should do it before calling
	 * io_complete() in case there's a race against a new reference
	 * which causes io_complete() to chain and instantiate the bp again.
	 */
	pbase = dio->pbase;
	psize = dio->psize;
	bp = dio->bp;
	dio->bp = NULL;

	if (orefs & HAMMER2_DIO_GOOD) {
		KKASSERT(bp != NULL);
#if 1
		if (hammer2_inval_enable &&
		    (orefs & HAMMER2_DIO_INVALBITS) == HAMMER2_DIO_INVALBITS) {
			++hammer2_iod_invals;
			bp->b_flags |= B_INVAL | B_RELBUF;
			brelse(bp);
		} else
#endif
		if (orefs & HAMMER2_DIO_DIRTY) {
			int hce;

			dio_write_stats_update(dio);
			if ((hce = hammer2_cluster_write) > 0) {
				/*
				 * Allows write-behind to keep the buffer
				 * cache sane.
				 */
				peof = (pbase + HAMMER2_SEGMASK64) &
				       ~HAMMER2_SEGMASK64;
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(bp, peof, psize, hce);
			} else {
				/*
				 * Allows dirty buffers to accumulate and
				 * possibly be canceled (e.g. by a 'rm'),
				 * will burst-write later.
				 */
				bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		} else if (bp->b_flags & (B_ERROR | B_INVAL | B_RELBUF)) {
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	} else if (bp) {
#if 1
		if (hammer2_inval_enable &&
		    (orefs & HAMMER2_DIO_INVALBITS) == HAMMER2_DIO_INVALBITS) {
			++hammer2_iod_invals;
			bp->b_flags |= B_INVAL | B_RELBUF;
			brelse(bp);
		} else
#endif
		if (orefs & HAMMER2_DIO_DIRTY) {
			dio_write_stats_update(dio);
			bdwrite(bp);
		} else {
			brelse(bp);
		}
	}

	/*
	 * The instant we call io_complete(), the dio is a free agent again
	 * and can be ripped out from under us.
	 *
	 * We can clean up our final DIO_INPROG by simulating an iocb
	 * completion.
	 */
	hmp = dio->hmp;				/* extract fields */
	atomic_add_int(&hmp->iofree_count, 1);
	cpu_ccfence();

	iocb.dio = dio;
	iocb.flags = HAMMER2_IOCB_INPROG;
	hammer2_io_complete(&iocb);
	dio = NULL;				/* dio stale */

	/*
	 * We cache free buffers so re-use cases can use a shared lock, but
	 * if too many build up we have to clean them out.
	 */
	if (hmp->iofree_count > 65536) {
		struct hammer2_cleanupcb_info info;

		RB_INIT(&info.tmptree);
		hammer2_spin_ex(&hmp->io_spin);
		if (hmp->iofree_count > 65536) {
			info.count = hmp->iofree_count / 4;
			RB_SCAN(hammer2_io_tree, &hmp->iotree, NULL,
				hammer2_io_cleanup_callback, &info);
		}
		hammer2_spin_unex(&hmp->io_spin);
		hammer2_io_cleanup(hmp, &info.tmptree);
	}
}
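
/*
 * Example (sketch): read-modify-write through the dio layer.  Setting
 * DIRTY (here via hammer2_io_bdwrite(), which also drops the ref) makes
 * the lastdrop path above bdwrite()/cluster_write() the buffer.
 * "my_modify_block" is hypothetical.
 */
#if 0
static int
my_modify_block(hammer2_dev_t *hmp, off_t lbase, int lsize)
{
	hammer2_io_t *dio;
	int error;

	error = hammer2_io_bread(hmp, HAMMER2_BREF_TYPE_DATA,
				 lbase, lsize, &dio);
	if (error) {
		if (dio)
			hammer2_io_brelse(&dio);
		return error;
	}
	bzero(hammer2_io_data(dio, lbase), lsize);	/* stand-in update */
	hammer2_io_bdwrite(&dio);	/* sets DIRTY and releases */
	return 0;
}
#endif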

/*
 * Clean up any dios with (INPROG | refs) == 0.
 *
 * Called to clean up cached DIOs on umount after all activity has been
 * flushed.
 */
static
int
hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg)
{
	struct hammer2_cleanupcb_info *info = arg;
	hammer2_io_t *xio;

	if ((dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0) {
		if (dio->act > 0) {
			--dio->act;
			return 0;
		}
		KKASSERT(dio->bp == NULL);
		RB_REMOVE(hammer2_io_tree, &dio->hmp->iotree, dio);
		xio = RB_INSERT(hammer2_io_tree, &info->tmptree, dio);
		KKASSERT(xio == NULL);
		if (--info->count <= 0)	/* limit scan */
			return(-1);
	}
	return 0;
}

void
hammer2_io_cleanup(hammer2_dev_t *hmp, struct hammer2_io_tree *tree)
{
	hammer2_io_t *dio;

	while ((dio = RB_ROOT(tree)) != NULL) {
		RB_REMOVE(hammer2_io_tree, tree, dio);
		KKASSERT(dio->bp == NULL &&
		    (dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0);
		kfree(dio, M_HAMMER2);
		atomic_add_int(&hammer2_dio_count, -1);
		atomic_add_int(&hmp->iofree_count, -1);
	}
}

/*
 * Returns a pointer to the requested data.
 */
char *
hammer2_io_data(hammer2_io_t *dio, off_t lbase)
{
	struct buf *bp;
	int off;

	bp = dio->bp;
	KKASSERT(bp != NULL);
	off = (lbase & ~HAMMER2_OFF_MASK_RADIX) - bp->b_loffset;
	KKASSERT(off >= 0 && off < bp->b_bufsize);
	return(bp->b_data + off);
}
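
/*
 * Worked example (sketch): with a 64KB dio whose bp->b_loffset (== pbase)
 * is 0x10000, a request for lbase 0x14000 (after masking off the radix
 * bits) yields off = 0x14000 - 0x10000 = 0x4000, so the returned pointer
 * is bp->b_data + 16384.
 */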

#if 0
/*
 * Keep track of good CRCs in dio->good_crc_mask. XXX needs to be done
 * in the chain structure, but chain structure needs to be persistent as
 * well on refs=0 and it isn't.
 */
int
hammer2_io_crc_good(hammer2_chain_t *chain, uint64_t *maskp)
{
	hammer2_io_t *dio;
	uint64_t mask;

	if ((dio = chain->dio) != NULL && chain->bytes >= 1024) {
		mask = hammer2_io_mask(dio, chain->bref.data_off, chain->bytes);
		*maskp = mask;
		if ((dio->crc_good_mask & mask) == mask)
			return 1;
		return 0;
	}
	*maskp = 0;

	return 0;
}

void
hammer2_io_crc_setmask(hammer2_io_t *dio, uint64_t mask)
{
	if (dio) {
		if (sizeof(long) == 8) {
			atomic_set_long(&dio->crc_good_mask, mask);
		} else {
#if _BYTE_ORDER == _LITTLE_ENDIAN
			atomic_set_int(&((int *)&dio->crc_good_mask)[0],
					(uint32_t)mask);
			atomic_set_int(&((int *)&dio->crc_good_mask)[1],
					(uint32_t)(mask >> 32));
#else
			atomic_set_int(&((int *)&dio->crc_good_mask)[0],
					(uint32_t)(mask >> 32));
			atomic_set_int(&((int *)&dio->crc_good_mask)[1],
					(uint32_t)mask);
#endif
		}
	}
}

void
hammer2_io_crc_clrmask(hammer2_io_t *dio, uint64_t mask)
{
	if (dio) {
		if (sizeof(long) == 8) {
			atomic_clear_long(&dio->crc_good_mask, mask);
		} else {
#if _BYTE_ORDER == _LITTLE_ENDIAN
			atomic_clear_int(&((int *)&dio->crc_good_mask)[0],
					(uint32_t)mask);
			atomic_clear_int(&((int *)&dio->crc_good_mask)[1],
					(uint32_t)(mask >> 32));
#else
			atomic_clear_int(&((int *)&dio->crc_good_mask)[0],
					(uint32_t)(mask >> 32));
			atomic_clear_int(&((int *)&dio->crc_good_mask)[1],
					(uint32_t)mask);
#endif
		}
	}
}
#endif

/*
 * Helpers for hammer2_io_new*() functions
 */
static
void
hammer2_iocb_new_callback(hammer2_iocb_t *iocb)
{
	hammer2_io_t *dio = iocb->dio;
	int gbctl = (iocb->flags & HAMMER2_IOCB_QUICK) ? GETBLK_NOWAIT : 0;

	/*
	 * If IOCB_INPROG is not set the dio already has a good buffer and we
	 * can't mess with it other than to zero the requested range.
	 *
	 * If IOCB_INPROG is set we also own DIO_INPROG at this time and can
	 * do what needs to be done with dio->bp.
	 */
	if (iocb->flags & HAMMER2_IOCB_INPROG) {
		if ((iocb->flags & HAMMER2_IOCB_READ) == 0) {
			if (iocb->lsize == dio->psize) {
				/*
				 * Fully covered buffer, try to optimize to
				 * avoid any I/O.  We might already have the
				 * buffer due to iocb chaining.
				 */
				if (dio->bp == NULL) {
					dio->bp = getblk(dio->hmp->devvp,
							 dio->pbase, dio->psize,
							 gbctl, 0);
				}
				if (dio->bp) {
					vfs_bio_clrbuf(dio->bp);
					dio->bp->b_flags |= B_CACHE;
				}

				/*
				 * Invalidation is ok on newly allocated
				 * buffers which cover the entire buffer.
				 * Flag will be cleared on use by the de-dup
				 * code.
				 *
				 * hammer2_chain_modify() also checks this flag.
				 *
				 * QUICK mode is used by the freemap code to
				 * pre-validate a junk buffer to prevent an
				 * unnecessary read I/O.  We do NOT want
				 * to set INVALOK in that situation as the
				 * underlying allocations may be smaller.
				 */
				if ((iocb->flags & HAMMER2_IOCB_QUICK) == 0) {
					atomic_set_64(&dio->refs,
						      HAMMER2_DIO_INVALOK);
				}
			} else if (iocb->flags & HAMMER2_IOCB_QUICK) {
				/*
				 * Partial buffer, quick mode.  Do nothing.
				 * Do not instantiate the buffer or try to
				 * mark it B_CACHE because other portions of
				 * the buffer might have to be read by other
				 * accessors.
				 */
			} else if (dio->bp == NULL ||
				   (dio->bp->b_flags & B_CACHE) == 0) {
				/*
				 * Partial buffer, normal mode, requires
				 * read-before-write.  Chain the read.
				 *
				 * We might already have the buffer due to
				 * iocb chaining.  XXX unclear if we really
				 * need to write/release it and reacquire
				 * in that case.
				 *
				 * QUEUE ASYNC I/O, IOCB IS NOT YET COMPLETE.
				 */
				if (dio->bp) {
					if (dio->refs & HAMMER2_DIO_DIRTY) {
						dio_write_stats_update(dio);
						bdwrite(dio->bp);
					} else {
						bqrelse(dio->bp);
					}
					dio->bp = NULL;
				}
				atomic_set_int(&iocb->flags, HAMMER2_IOCB_READ);
				breadcb(dio->hmp->devvp,
					dio->pbase, dio->psize,
					hammer2_io_callback, iocb);
				return;
			} /* else buffer is good */
		} /* else callback from breadcb is complete */
	}
	if (dio->bp) {
		if (iocb->flags & HAMMER2_IOCB_ZERO)
			bzero(hammer2_io_data(dio, iocb->lbase), iocb->lsize);
		atomic_set_64(&dio->refs, HAMMER2_DIO_DIRTY);
	}
	hammer2_io_complete(iocb);
}

static
int
_hammer2_io_new(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
	        hammer2_io_t **diop, int flags)
{
	hammer2_iocb_t iocb;

	iocb.callback = hammer2_iocb_new_callback;
	iocb.cluster = NULL;
	iocb.chain = NULL;
	iocb.ptr = NULL;
	iocb.lbase = lbase;
	iocb.lsize = lsize;
	iocb.flags = flags;
	iocb.btype = btype;
	iocb.error = 0;
	hammer2_io_getblk(hmp, lbase, lsize, &iocb);
	if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
		hammer2_iocb_wait(&iocb);
	*diop = iocb.dio;

	return (iocb.error);
}

int
hammer2_io_new(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
	       hammer2_io_t **diop)
{
	return(_hammer2_io_new(hmp, btype, lbase, lsize,
			       diop, HAMMER2_IOCB_ZERO));
}

int
hammer2_io_newnz(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
		 hammer2_io_t **diop)
{
	return(_hammer2_io_new(hmp, btype, lbase, lsize, diop, 0));
}

/*
 * This is called from the freemap to pre-validate a full-sized buffer
 * whose contents we don't care about, in order to prevent an unnecessary
 * read-before-write.
 */
void
hammer2_io_newq(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize)
{
	hammer2_io_t *dio = NULL;

	_hammer2_io_new(hmp, btype, lbase, lsize, &dio, HAMMER2_IOCB_QUICK);
	hammer2_io_bqrelse(&dio);
}

static
void
hammer2_iocb_bread_callback(hammer2_iocb_t *iocb)
{
	hammer2_io_t *dio = iocb->dio;
	off_t peof;
	int error;

	/*
	 * If IOCB_INPROG is not set the dio already has a good buffer and we
	 * can't mess with it other than to zero the requested range.
	 *
	 * If IOCB_INPROG is set we also own DIO_INPROG at this time and can
	 * do what needs to be done with dio->bp.
	 */
	if (iocb->flags & HAMMER2_IOCB_INPROG) {
		int hce;

		if (dio->bp && (dio->bp->b_flags & B_CACHE)) {
			/*
			 * Already good, likely due to being chained from
			 * another iocb.
			 */
			error = 0;
		} else if ((hce = hammer2_cluster_read) > 0) {
			/*
			 * Synchronous cluster I/O for now.
			 */
			if (dio->bp) {
				bqrelse(dio->bp);
				dio->bp = NULL;
			}
			peof = (dio->pbase + HAMMER2_SEGMASK64) &
			       ~HAMMER2_SEGMASK64;
			error = cluster_read(dio->hmp->devvp, peof, dio->pbase,
					     dio->psize,
					     dio->psize, HAMMER2_PBUFSIZE*hce,
					     &dio->bp);
		} else {
			/*
			 * Synchronous I/O for now.
			 */
			if (dio->bp) {
				bqrelse(dio->bp);
				dio->bp = NULL;
			}
			error = bread(dio->hmp->devvp, dio->pbase,
				      dio->psize, &dio->bp);
		}
		if (error) {
			brelse(dio->bp);
			dio->bp = NULL;
		}
	}
	hammer2_io_complete(iocb);
}

int
hammer2_io_bread(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
		hammer2_io_t **diop)
{
	hammer2_iocb_t iocb;

	iocb.callback = hammer2_iocb_bread_callback;
	iocb.cluster = NULL;
	iocb.chain = NULL;
	iocb.ptr = NULL;
	iocb.lbase = lbase;
	iocb.lsize = lsize;
	iocb.btype = btype;
	iocb.flags = 0;
	iocb.error = 0;
	hammer2_io_getblk(hmp, lbase, lsize, &iocb);
	if ((iocb.flags & HAMMER2_IOCB_DONE) == 0)
		hammer2_iocb_wait(&iocb);
	*diop = iocb.dio;

	return (iocb.error);
}

/*
 * System buf/bio async callback extracts the iocb and chains
 * to the iocb callback.
 */
void
hammer2_io_callback(struct bio *bio)
{
	struct buf *dbp = bio->bio_buf;
	hammer2_iocb_t *iocb = bio->bio_caller_info1.ptr;
	hammer2_io_t *dio;

	dio = iocb->dio;
	if ((bio->bio_flags & BIO_DONE) == 0)
		bpdone(dbp, 0);
	bio->bio_flags &= ~(BIO_DONE | BIO_SYNC);
	dio->bp = bio->bio_buf;
	iocb->callback(iocb);
}

void
hammer2_io_bawrite(hammer2_io_t **diop)
{
	atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
	hammer2_io_putblk(diop);
}

void
hammer2_io_bdwrite(hammer2_io_t **diop)
{
	atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
	hammer2_io_putblk(diop);
}

int
hammer2_io_bwrite(hammer2_io_t **diop)
{
	atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
	hammer2_io_putblk(diop);
	return (0);	/* XXX */
}

void
hammer2_io_setdirty(hammer2_io_t *dio)
{
	atomic_set_64(&dio->refs, HAMMER2_DIO_DIRTY);
}

/*
 * Request an invalidation.  The hammer2_io code will oblige only if
 * DIO_INVALOK is also set.  INVALOK is cleared if the dio is used
 * in a dedup lookup and prevents invalidation of the dirty buffer.
 */
void
hammer2_io_setinval(hammer2_io_t *dio, hammer2_off_t off, u_int bytes)
{
	if ((u_int)dio->psize == bytes)
		atomic_set_64(&dio->refs, HAMMER2_DIO_INVAL);
}

void
hammer2_io_brelse(hammer2_io_t **diop)
{
	hammer2_io_putblk(diop);
}

void
hammer2_io_bqrelse(hammer2_io_t **diop)
{
	hammer2_io_putblk(diop);
}

int
hammer2_io_isdirty(hammer2_io_t *dio)
{
	return((dio->refs & HAMMER2_DIO_DIRTY) != 0);
}

static
void
dio_write_stats_update(hammer2_io_t *dio)
{
	long *counterp;

	switch(dio->btype) {
	case 0:
		return;
	case HAMMER2_BREF_TYPE_DATA:
		counterp = &hammer2_iod_file_write;
		break;
	case HAMMER2_BREF_TYPE_INODE:
		counterp = &hammer2_iod_meta_write;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
		counterp = &hammer2_iod_indr_write;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		counterp = &hammer2_iod_fmap_write;
		break;
	default:
		counterp = &hammer2_iod_volu_write;
		break;
	}
	*counterp += dio->psize;
}