/*
 * Copyright (c) 2013-2018 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "hammer2.h"

#define HAMMER2_DOP_READ	1	/* read, instantiating media data */
#define HAMMER2_DOP_NEW		2	/* new block, zero-fill */
#define HAMMER2_DOP_NEWNZ	3	/* new block, caller overwrites */
#define HAMMER2_DOP_READQ	4	/* quick read, existing DIOs only */

/*
 * Implements an abstraction layer for synchronous and asynchronous
 * buffered device I/O.  It can serve as an OS abstraction, but its main
 * purpose is to allow larger backing buffers to be shared by
 * hammer2_chains that use smaller allocations, without causing
 * deadlocks.
 *
 * The DIOs also record temporary state with limited persistence.  This
 * feature is used to keep track of dedupable blocks.
 */
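
/*
 * Illustrative sketch (not compiled in): the typical DIO life cycle as
 * seen by a consumer of this layer, using the hammer2_io_bread(),
 * hammer2_io_data(), hammer2_io_bqrelse() and hammer2_io_bdwrite()
 * wrappers from hammer2.h that map onto the _hammer2_io_*() functions
 * below.
 */
#if 0
static int
example_dio_modify(hammer2_dev_t *hmp, off_t lbase, int lsize)
{
	hammer2_io_t *dio;
	char *data;
	int error;

	/* Acquire the DIO, reading the backing buffer if necessary */
	error = hammer2_io_bread(hmp, HAMMER2_BREF_TYPE_DATA,
				 lbase, lsize, &dio);
	if (error) {
		hammer2_io_bqrelse(&dio);
		return error;
	}

	/* Pointer into the (possibly larger) backing device buffer */
	data = hammer2_io_data(dio, lbase);
	data[0] = 1;

	/* Mark dirty and release; the write is normally delayed */
	hammer2_io_bdwrite(&dio);
	return 0;
}
#endif
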
static int hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg);
static void dio_write_stats_update(hammer2_io_t *dio, struct buf *bp);

static int
hammer2_io_cmp(hammer2_io_t *io1, hammer2_io_t *io2)
{
	if (io1->pbase < io2->pbase)
		return(-1);
	if (io1->pbase > io2->pbase)
		return(1);
	return(0);
}

RB_PROTOTYPE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp, off_t);
RB_GENERATE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp,
		off_t, pbase);

struct hammer2_cleanupcb_info {
	struct hammer2_io_tree tmptree;
	int	count;
};

#if 0
static __inline
uint64_t
hammer2_io_mask(hammer2_io_t *dio, hammer2_off_t off, u_int bytes)
{
	uint64_t mask;
	int i;

	if (bytes < 1024)	/* smaller chunks not supported */
		return 0;

	/*
	 * Calculate crc check mask for larger chunks
	 */
	i = (((off & ~HAMMER2_OFF_MASK_RADIX) - dio->pbase) &
	     HAMMER2_PBUFMASK) >> 10;
	if (i == 0 && bytes == HAMMER2_PBUFSIZE)
		return((uint64_t)-1);
	mask = ((uint64_t)1U << (bytes >> 10)) - 1;
	mask <<= i;

	return mask;
}
#endif

#ifdef HAMMER2_IO_DEBUG

static __inline void
DIO_RECORD(hammer2_io_t *dio HAMMER2_IO_DEBUG_ARGS)
{
	int i;

	i = atomic_fetchadd_int(&dio->debug_index, 1) & HAMMER2_IO_DEBUG_MASK;

	dio->debug_file[i] = file;
	dio->debug_line[i] = line;
	dio->debug_refs[i] = dio->refs;
	dio->debug_td[i] = curthread;
}

#else

#define DIO_RECORD(dio)

#endif
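
/*
 * For orientation: dio->refs multiplexes a reference count in its low
 * bits (HAMMER2_DIO_MASK) with state flags in its high bits (defined in
 * hammer2.h).  As used below: DIO_GOOD means dio->bp is instantiated
 * and valid, DIO_INPROG gives one thread exclusive ownership of buffer
 * instantiation or disposal, DIO_WAITING marks sleepers on INPROG,
 * DIO_DIRTY marks the buffer for write-back on last drop, and DIO_FLUSH
 * requests the actual write (instead of a delayed write) on last drop.
 */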

/*
 * Returns the DIO corresponding to the data|radix offset, creating it if
 * necessary.
 *
 * If createit is 0, NULL can be returned, indicating that the DIO does
 * not exist.  (btype) is ignored when createit is 0.
 */
static __inline
hammer2_io_t *
hammer2_io_alloc(hammer2_dev_t *hmp, hammer2_key_t data_off, uint8_t btype,
		 int createit, int *isgoodp)
{
	hammer2_io_t *dio;
	hammer2_io_t *xio;
	hammer2_key_t lbase;
	hammer2_key_t pbase;
	hammer2_key_t pmask;
	uint64_t refs;
	int lsize;
	int psize;

	psize = HAMMER2_PBUFSIZE;
	pmask = ~(hammer2_off_t)(psize - 1);
	lsize = 1 << (int)(data_off & HAMMER2_OFF_MASK_RADIX);
	lbase = data_off & ~HAMMER2_OFF_MASK_RADIX;
	pbase = lbase & pmask;

	if (pbase == 0 || ((lbase + lsize - 1) & pmask) != pbase) {
		kprintf("Illegal: %016jx %016jx+%08x / %016jx\n",
			pbase, lbase, lsize, pmask);
	}
	KKASSERT(pbase != 0 && ((lbase + lsize - 1) & pmask) == pbase);
	*isgoodp = 0;

	/*
	 * Access/Allocate the DIO, bump dio->refs to prevent destruction.
	 */
	hammer2_spin_sh(&hmp->io_spin);
	dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, pbase);
	if (dio) {
		refs = atomic_fetchadd_64(&dio->refs, 1);
		if ((refs & HAMMER2_DIO_MASK) == 0) {
			atomic_add_int(&dio->hmp->iofree_count, -1);
		}
		if (refs & HAMMER2_DIO_GOOD)
			*isgoodp = 1;
		hammer2_spin_unsh(&hmp->io_spin);
	} else if (createit) {
		refs = 0;
		hammer2_spin_unsh(&hmp->io_spin);
		dio = kmalloc(sizeof(*dio), M_HAMMER2, M_INTWAIT | M_ZERO);
		dio->hmp = hmp;
		dio->pbase = pbase;
		dio->psize = psize;
		dio->btype = btype;
		dio->refs = refs + 1;
		dio->act = 5;
		hammer2_spin_ex(&hmp->io_spin);
		xio = RB_INSERT(hammer2_io_tree, &hmp->iotree, dio);
		if (xio == NULL) {
			atomic_add_int(&hammer2_dio_count, 1);
			hammer2_spin_unex(&hmp->io_spin);
		} else {
			refs = atomic_fetchadd_64(&xio->refs, 1);
			if ((refs & HAMMER2_DIO_MASK) == 0)
				atomic_add_int(&xio->hmp->iofree_count, -1);
			if (refs & HAMMER2_DIO_GOOD)
				*isgoodp = 1;
			hammer2_spin_unex(&hmp->io_spin);
			kfree(dio, M_HAMMER2);
			dio = xio;
		}
	} else {
		hammer2_spin_unsh(&hmp->io_spin);
		return NULL;
	}
	dio->ticks = ticks;
	if (dio->act < 10)
		++dio->act;

	return dio;
}
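
/*
 * Worked example of the data_off decode above, assuming the usual
 * low-6-bit HAMMER2_OFF_MASK_RADIX encoding and a 64KB HAMMER2_PBUFSIZE:
 *
 *	data_off = 0x000000000123C00E	(low 6 bits: radix 14 -> 16KB)
 *	lsize	 = 1 << 14		= 0x4000
 *	lbase	 = 0x000000000123C000	(data_off with the radix masked off)
 *	pbase	 = 0x0000000001230000	(64KB-aligned buffer covering lbase)
 *
 * All accesses whose lbase falls within the same 64KB-aligned pbase share
 * a single DIO in the RB tree, which is how smaller logical allocations
 * share one larger device buffer.
 */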

/*
 * Acquire the requested dio.  If DIO_GOOD is not set we must instantiate
 * a buffer.  If it is set, the buffer already exists and is good to go.
 */
hammer2_io_t *
_hammer2_io_getblk(hammer2_dev_t *hmp, int btype, off_t lbase,
		   int lsize, int op HAMMER2_IO_DEBUG_ARGS)
{
	hammer2_io_t *dio;
	off_t peof;
	uint64_t orefs;
	uint64_t nrefs;
	int isgood;
	int error;
	int hce;
	int bflags;

	bflags = ((btype == HAMMER2_BREF_TYPE_DATA) ? B_NOTMETA : 0);
	bflags |= B_KVABIO;

	KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize);

	if (op == HAMMER2_DOP_READQ) {
		dio = hammer2_io_alloc(hmp, lbase, btype, 0, &isgood);
		if (dio == NULL)
			return NULL;
		op = HAMMER2_DOP_READ;
	} else {
		dio = hammer2_io_alloc(hmp, lbase, btype, 1, &isgood);
	}

	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();

		/*
		 * Buffer is already good, handle the op and return.
		 */
		if (orefs & HAMMER2_DIO_GOOD) {
			if (isgood == 0)
				cpu_mfence();
			bkvasync(dio->bp);

			switch(op) {
			case HAMMER2_DOP_NEW:
				bzero(hammer2_io_data(dio, lbase), lsize);
				/* fall through */
			case HAMMER2_DOP_NEWNZ:
				atomic_set_64(&dio->refs, HAMMER2_DIO_DIRTY);
				break;
			case HAMMER2_DOP_READ:
			default:
				/* nothing to do */
				break;
			}
			DIO_RECORD(dio HAMMER2_IO_DEBUG_CALL);
			return (dio);
		}

		/*
		 * Try to own the DIO
		 */
		if (orefs & HAMMER2_DIO_INPROG) {
			nrefs = orefs | HAMMER2_DIO_WAITING;
			tsleep_interlock(dio, 0);
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
				tsleep(dio, PINTERLOCKED, "h2dio", hz);
			}
			/* retry */
		} else {
			nrefs = orefs | HAMMER2_DIO_INPROG;
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
				break;
			}
		}
	}

	/*
	 * We break to here if GOOD is not set and we acquired INPROG for
	 * the I/O.
	 */
	KKASSERT(dio->bp == NULL);
	if (btype == HAMMER2_BREF_TYPE_DATA)
		hce = hammer2_cluster_data_read;
	else
		hce = hammer2_cluster_meta_read;

	error = 0;
	if (dio->pbase == (lbase & ~HAMMER2_OFF_MASK_RADIX) &&
	    dio->psize == lsize) {
		switch(op) {
		case HAMMER2_DOP_NEW:
		case HAMMER2_DOP_NEWNZ:
			dio->bp = getblk(dio->hmp->devvp,
					 dio->pbase, dio->psize,
					 GETBLK_KVABIO, 0);
			if (op == HAMMER2_DOP_NEW) {
				bkvasync(dio->bp);
				bzero(dio->bp->b_data, dio->psize);
			}
			atomic_set_64(&dio->refs, HAMMER2_DIO_DIRTY);
			break;
		case HAMMER2_DOP_READ:
		default:
			if (hce > 0) {
				/*
				 * Synchronous cluster I/O for now.
				 */
				peof = (dio->pbase + HAMMER2_SEGMASK64) &
				       ~HAMMER2_SEGMASK64;
				dio->bp = NULL;
				error = cluster_readx(dio->hmp->devvp,
						      peof, dio->pbase,
						      dio->psize, bflags,
						      dio->psize,
						      HAMMER2_PBUFSIZE*hce,
						      &dio->bp);
			} else {
				dio->bp = NULL;
				error = breadnx(dio->hmp->devvp, dio->pbase,
						dio->psize, bflags,
						NULL, NULL, 0, &dio->bp);
			}
		}
	} else {
		if (hce > 0) {
			/*
			 * Synchronous cluster I/O for now.
			 */
			peof = (dio->pbase + HAMMER2_SEGMASK64) &
			       ~HAMMER2_SEGMASK64;
			error = cluster_readx(dio->hmp->devvp,
					      peof, dio->pbase, dio->psize,
					      bflags,
					      dio->psize, HAMMER2_PBUFSIZE*hce,
					      &dio->bp);
		} else {
			error = breadnx(dio->hmp->devvp, dio->pbase,
					dio->psize, bflags,
					NULL, NULL, 0, &dio->bp);
		}
		if (dio->bp) {
			/*
			 * Handle NEW flags
			 */
			switch(op) {
			case HAMMER2_DOP_NEW:
				bkvasync(dio->bp);
				bzero(hammer2_io_data(dio, lbase), lsize);
				/* fall through */
			case HAMMER2_DOP_NEWNZ:
				atomic_set_64(&dio->refs, HAMMER2_DIO_DIRTY);
				break;
			case HAMMER2_DOP_READ:
			default:
				break;
			}

			/*
			 * Tell the kernel that the buffer cache is not
			 * meta-data based on the btype.  This allows
			 * swapcache to distinguish between data and
			 * meta-data.
			 */
			switch(btype) {
			case HAMMER2_BREF_TYPE_DATA:
				dio->bp->b_flags |= B_NOTMETA;
				break;
			default:
				break;
			}
		}
	}

	if (dio->bp) {
		bkvasync(dio->bp);
		BUF_KERNPROC(dio->bp);
		dio->bp->b_flags &= ~B_AGE;
		/* dio->bp->b_debug_info2 = dio; */
	}
	dio->error = error;

	/*
	 * Clear INPROG and WAITING, set GOOD, and wake up anyone waiting.
	 */
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();
		nrefs = orefs & ~(HAMMER2_DIO_INPROG | HAMMER2_DIO_WAITING);
		if (error == 0)
			nrefs |= HAMMER2_DIO_GOOD;
		if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
			if (orefs & HAMMER2_DIO_WAITING)
				wakeup(dio);
			break;
		}
		cpu_pause();
	}

	/* XXX error handling */
	DIO_RECORD(dio HAMMER2_IO_DEBUG_CALL);

	return dio;
}
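
/*
 * Summary of the dio->refs protocol used by _hammer2_io_getblk() above
 * and _hammer2_io_putblk() below:
 *
 *	GOOD set	- bp is instantiated, shortcut and return.
 *	INPROG set	- another thread owns instantiation (or lastdrop
 *			  disposal); set WAITING, tsleep, and retry.
 *	neither set	- CAS INPROG in to become the owner, instantiate
 *			  the buffer, then clear INPROG (setting GOOD on
 *			  success) and wake any WAITING threads.
 */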

/*
 * Release our ref on *diop.
 *
 * On the 1->0 transition we clear DIO_GOOD, set DIO_INPROG, and dispose
 * of dio->bp.  Then we clean up DIO_INPROG and DIO_WAITING.
 */
void
_hammer2_io_putblk(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	struct buf *bp;
	off_t pbase;
	int psize;
	int dio_limit;
	uint64_t orefs;
	uint64_t nrefs;

	dio = *diop;
	*diop = NULL;
	hmp = dio->hmp;
	DIO_RECORD(dio HAMMER2_IO_DEBUG_CALL);

	KKASSERT((dio->refs & HAMMER2_DIO_MASK) != 0);

	/*
	 * Drop refs.
	 *
	 * On the 1->0 transition clear GOOD and set INPROG, and break.
	 * On any other transition we can return early.
	 */
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();

		if ((orefs & HAMMER2_DIO_MASK) == 1 &&
		    (orefs & HAMMER2_DIO_INPROG) == 0) {
			/*
			 * Lastdrop case, INPROG can be set.  GOOD must be
			 * cleared to prevent the getblk shortcut.
			 */
			nrefs = orefs - 1;
			nrefs &= ~(HAMMER2_DIO_GOOD | HAMMER2_DIO_DIRTY);
			nrefs |= HAMMER2_DIO_INPROG;
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs))
				break;
		} else if ((orefs & HAMMER2_DIO_MASK) == 1) {
			/*
			 * Lastdrop case, INPROG already set.  We must
			 * wait for INPROG to clear.
			 */
			nrefs = orefs | HAMMER2_DIO_WAITING;
			tsleep_interlock(dio, 0);
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
				tsleep(dio, PINTERLOCKED, "h2dio", hz);
			}
			/* retry */
		} else {
			/*
			 * Normal drop case.
			 */
			nrefs = orefs - 1;
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs))
				return;
			/* retry */
		}
		cpu_pause();
		/* retry */
	}

	/*
	 * Lastdrop (1->0 transition).  INPROG has been set, GOOD and DIRTY
	 * have been cleared.  iofree_count has not yet been incremented;
	 * a racing accessor will decrement iofree_count when it re-refs
	 * the DIO, so we have to increment it regardless.
	 *
	 * We can now dispose of the buffer, and should do it before calling
	 * io_complete() in case there's a race against a new reference
	 * which causes io_complete() to chain and instantiate the bp again.
	 */
	pbase = dio->pbase;
	psize = dio->psize;
	bp = dio->bp;
	dio->bp = NULL;

	if ((orefs & HAMMER2_DIO_GOOD) && bp) {
		/*
		 * Non-errored disposal of bp
		 */
		if (orefs & HAMMER2_DIO_DIRTY) {
			dio_write_stats_update(dio, bp);

			/*
			 * Allows dirty buffers to accumulate and
			 * possibly be canceled (e.g. by a 'rm'),
			 * by default we will burst-write later.
			 *
			 * We generally do NOT want to issue an actual
			 * b[a]write() or cluster_write() here.  Due to
			 * the way chains are locked, buffers may be cycled
			 * in and out quite often and disposal here can cause
			 * multiple writes or write-read stalls.
			 *
			 * If FLUSH is set we do want to issue the actual
			 * write.  This typically occurs in the write-behind
			 * case when writing to large files.
			 */
			off_t peof;
			int hce;
			if (dio->refs & HAMMER2_DIO_FLUSH) {
				if ((hce = hammer2_cluster_write) != 0) {
					peof = (pbase + HAMMER2_SEGMASK64) &
					       ~HAMMER2_SEGMASK64;
					bp->b_flags |= B_CLUSTEROK;
					cluster_write(bp, peof, psize, hce);
				} else {
					bp->b_flags &= ~B_CLUSTEROK;
					bawrite(bp);
				}
			} else {
				bp->b_flags &= ~B_CLUSTEROK;
				bdwrite(bp);
			}
		} else if (bp->b_flags & (B_ERROR | B_INVAL | B_RELBUF)) {
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	} else if (bp) {
		/*
		 * Errored disposal of bp
		 */
		brelse(bp);
	}

	/*
	 * Update iofree_count before disposing of the dio
	 */
	hmp = dio->hmp;
	atomic_add_int(&hmp->iofree_count, 1);

	/*
	 * Clear INPROG, GOOD, and WAITING (GOOD should already be clear).
	 *
	 * Also clear FLUSH as it was handled above.
	 */
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();
		nrefs = orefs & ~(HAMMER2_DIO_INPROG | HAMMER2_DIO_GOOD |
				  HAMMER2_DIO_WAITING | HAMMER2_DIO_FLUSH);
		if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
			if (orefs & HAMMER2_DIO_WAITING)
				wakeup(dio);
			break;
		}
		cpu_pause();
	}

	/*
	 * We cache free buffers so re-use cases can use a shared lock, but
	 * if too many build up we have to clean them out.
	 */
	dio_limit = hammer2_dio_limit;
	if (dio_limit < 256)
		dio_limit = 256;
	if (dio_limit > 1024*1024)
		dio_limit = 1024*1024;
	if (hmp->iofree_count > dio_limit) {
		struct hammer2_cleanupcb_info info;

		RB_INIT(&info.tmptree);
		hammer2_spin_ex(&hmp->io_spin);
		if (hmp->iofree_count > dio_limit) {
			info.count = hmp->iofree_count / 5;
			RB_SCAN(hammer2_io_tree, &hmp->iotree, NULL,
				hammer2_io_cleanup_callback, &info);
		}
		hammer2_spin_unex(&hmp->io_spin);
		hammer2_io_cleanup(hmp, &info.tmptree);
	}
}

/*
 * Clean up any DIOs with (INPROG | refs) == 0.
 *
 * Called to clean out cached DIOs on unmount after all activity has
 * been flushed, and from the putblk path above when too many free DIOs
 * accumulate.
 */
static
int
hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg)
{
	struct hammer2_cleanupcb_info *info = arg;
	hammer2_io_t *xio;

	if ((dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0) {
		if (dio->act > 0) {
			int act;

			act = dio->act - (ticks - dio->ticks) / hz - 1;
			if (act > 0) {
				dio->act = act;
				return 0;
			}
			dio->act = 0;
		}
		KKASSERT(dio->bp == NULL);
		if (info->count > 0) {
			RB_REMOVE(hammer2_io_tree, &dio->hmp->iotree, dio);
			xio = RB_INSERT(hammer2_io_tree, &info->tmptree, dio);
			KKASSERT(xio == NULL);
			--info->count;
		}
	}
	return 0;
}
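
/*
 * Worked example of the aging heuristic above: each access bumps
 * dio->act (capped at 10) and restamps dio->ticks.  A DIO with act == 5
 * that has been idle for 3 seconds decays to 5 - 3 - 1 = 1 and survives
 * the scan; after four or more idle seconds it decays to 0 and is moved
 * to the caller's tmptree for disposal (budget permitting).
 */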

void
hammer2_io_cleanup(hammer2_dev_t *hmp, struct hammer2_io_tree *tree)
{
	hammer2_io_t *dio;

	while ((dio = RB_ROOT(tree)) != NULL) {
		RB_REMOVE(hammer2_io_tree, tree, dio);
		KKASSERT(dio->bp == NULL &&
		    (dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0);
		if (dio->refs & HAMMER2_DIO_DIRTY) {
			kprintf("hammer2_io_cleanup: Dirty buffer "
				"%016jx/%d (bp=%p)\n",
				dio->pbase, dio->psize, dio->bp);
		}
		kfree(dio, M_HAMMER2);
		atomic_add_int(&hammer2_dio_count, -1);
		atomic_add_int(&hmp->iofree_count, -1);
	}
}

/*
 * Returns a pointer to the requested data.
 */
char *
hammer2_io_data(hammer2_io_t *dio, off_t lbase)
{
	struct buf *bp;
	int off;

	bp = dio->bp;
	KKASSERT(bp != NULL);
	bkvasync(bp);
	off = (lbase & ~HAMMER2_OFF_MASK_RADIX) - bp->b_loffset;
	KKASSERT(off >= 0 && off < bp->b_bufsize);
	return(bp->b_data + off);
}
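
/*
 * Continuing the worked example above hammer2_io_alloc(): for lbase
 * 0x123C00E backed by a buffer with b_loffset 0x1230000, the pointer
 * returned is bp->b_data + 0xC000.
 */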

int
hammer2_io_new(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
	       hammer2_io_t **diop)
{
	*diop = hammer2_io_getblk(hmp, btype, lbase, lsize, HAMMER2_DOP_NEW);
	return ((*diop)->error);
}

int
hammer2_io_newnz(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
		 hammer2_io_t **diop)
{
	*diop = hammer2_io_getblk(hmp, btype, lbase, lsize, HAMMER2_DOP_NEWNZ);
	return ((*diop)->error);
}

int
_hammer2_io_bread(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
		hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
#ifdef HAMMER2_IO_DEBUG
	hammer2_io_t *dio;
#endif

	*diop = _hammer2_io_getblk(hmp, btype, lbase, lsize,
				   HAMMER2_DOP_READ HAMMER2_IO_DEBUG_CALL);
#ifdef HAMMER2_IO_DEBUG
	if ((dio = *diop) != NULL) {
		int i = (dio->debug_index - 1) & HAMMER2_IO_DEBUG_MASK;
		dio->debug_data[i] = debug_data;
	}
#endif
	return ((*diop)->error);
}

hammer2_io_t *
_hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase,
		     int lsize HAMMER2_IO_DEBUG_ARGS)
{
	hammer2_io_t *dio;

	dio = _hammer2_io_getblk(hmp, 0, lbase, lsize,
				 HAMMER2_DOP_READQ HAMMER2_IO_DEBUG_CALL);
	return dio;
}
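
/*
 * The dirty-release variants below differ only in the flags they set
 * before the final putblk: bawrite and bwrite set DIRTY|FLUSH, so the
 * lastdrop path in _hammer2_io_putblk() issues the actual write, while
 * bdwrite sets only DIRTY, leaving the buffer for a delayed write.
 */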

void
_hammer2_io_bawrite(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY |
				      HAMMER2_DIO_FLUSH);
	_hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
}

void
_hammer2_io_bdwrite(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
	_hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
}

int
_hammer2_io_bwrite(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY |
				      HAMMER2_DIO_FLUSH);
	_hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
	return (0);	/* XXX */
}

void
hammer2_io_setdirty(hammer2_io_t *dio)
{
	atomic_set_64(&dio->refs, HAMMER2_DIO_DIRTY);
}

/*
 * This routine is called when a MODIFIED chain is being DESTROYED,
 * in an attempt to allow the related buffer cache buffer to be
 * invalidated and discarded instead of flushing it to disk.
 *
 * At the moment this case is only really useful for file meta-data.
 * File data is already handled via the logical buffer cache associated
 * with the vnode, and will be discarded if it was never flushed to disk.
 * File meta-data may include inodes, directory entries, and indirect blocks.
 *
 * XXX
 * However, our DIO buffers are PBUFSIZE'd (64KB), and the area being
 * invalidated might be smaller.  Most of the meta-data structures above
 * are in the 'smaller' category.  For now, don't try to invalidate the
 * data areas.
 */
void
hammer2_io_inval(hammer2_io_t *dio, hammer2_off_t data_off, u_int bytes)
{
	/* NOP */
}

void
_hammer2_io_brelse(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	_hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
}

void
_hammer2_io_bqrelse(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	_hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
}

/*
 * Set dedup validation bits in a DIO.  We do not need the buffer cache
 * buffer for this.  This must be done concurrently with setting bits in
 * the freemap so as to interlock with bulkfree's clearing of those bits.
 */
void
hammer2_io_dedup_set(hammer2_dev_t *hmp, hammer2_blockref_t *bref)
{
	hammer2_io_t *dio;
	uint64_t mask;
	int lsize;
	int isgood;

	dio = hammer2_io_alloc(hmp, bref->data_off, bref->type, 1, &isgood);
	lsize = 1 << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	mask = hammer2_dedup_mask(dio, bref->data_off, lsize);
	atomic_clear_64(&dio->dedup_valid, mask);
	atomic_set_64(&dio->dedup_alloc, mask);
	hammer2_io_putblk(&dio);
}
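
/*
 * Worked example, assuming hammer2_dedup_mask() uses the same
 * one-bit-per-1KB layout as the disabled hammer2_io_mask() above:
 * a 16KB block starting 48KB into its 64KB DIO covers 16 bits shifted
 * left by 48, i.e. mask = 0xFFFF000000000000.
 */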

/*
 * Clear dedup validation bits in a DIO.  This is typically done when
 * a modified chain is destroyed or by the bulkfree code.  No buffer
 * is needed for this operation.  If the DIO no longer exists it is
 * equivalent to the bits not being set.
 */
void
hammer2_io_dedup_delete(hammer2_dev_t *hmp, uint8_t btype,
			hammer2_off_t data_off, u_int bytes)
{
	hammer2_io_t *dio;
	uint64_t mask;
	int isgood;

	if ((data_off & ~HAMMER2_OFF_MASK_RADIX) == 0)
		return;
	if (btype != HAMMER2_BREF_TYPE_DATA)
		return;
	dio = hammer2_io_alloc(hmp, data_off, btype, 0, &isgood);
	if (dio) {
		if (data_off < dio->pbase ||
		    (data_off & ~HAMMER2_OFF_MASK_RADIX) + bytes >
		    dio->pbase + dio->psize) {
			panic("hammer2_dedup_delete: DATAOFF BAD "
			      "%016jx/%d %016jx\n",
			      data_off, bytes, dio->pbase);
		}
		mask = hammer2_dedup_mask(dio, data_off, bytes);
		atomic_clear_64(&dio->dedup_alloc, mask);
		atomic_clear_64(&dio->dedup_valid, mask);
		hammer2_io_putblk(&dio);
	}
}

/*
 * Assert that dedup allocation bits in a DIO are not set.  This operation
 * does not require a buffer.  The DIO does not need to exist.
 */
void
hammer2_io_dedup_assert(hammer2_dev_t *hmp, hammer2_off_t data_off, u_int bytes)
{
	hammer2_io_t *dio;
	int isgood;

	dio = hammer2_io_alloc(hmp, data_off, HAMMER2_BREF_TYPE_DATA,
			       0, &isgood);
	if (dio) {
		KASSERT((dio->dedup_alloc &
			  hammer2_dedup_mask(dio, data_off, bytes)) == 0,
			("hammer2_dedup_assert: %016jx/%d %016jx/%016jx",
			data_off,
			bytes,
			hammer2_dedup_mask(dio, data_off, bytes),
			dio->dedup_alloc));
		hammer2_io_putblk(&dio);
	}
}

static
void
dio_write_stats_update(hammer2_io_t *dio, struct buf *bp)
{
	long *counterp;

	if (bp->b_flags & B_DELWRI)
		return;

	switch(dio->btype) {
	case 0:
		return;
	case HAMMER2_BREF_TYPE_DATA:
		counterp = &hammer2_iod_file_write;
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_INODE:
		counterp = &hammer2_iod_meta_write;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
		counterp = &hammer2_iod_indr_write;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		counterp = &hammer2_iod_fmap_write;
		break;
	default:
		counterp = &hammer2_iod_volu_write;
		break;
	}
	*counterp += dio->psize;
}

/*
 * Synchronize the buffer's kernel virtual mapping for the current cpu
 * (B_KVABIO buffers are not guaranteed to be mapped consistently on
 * all cpus until synchronized with bkvasync()).
 */
void
hammer2_io_bkvasync(hammer2_io_t *dio)
{
	KKASSERT(dio->bp != NULL);
	bkvasync(dio->bp);
}

/*
 * Ref a dio that is already owned
 */
void
_hammer2_io_ref(hammer2_io_t *dio HAMMER2_IO_DEBUG_ARGS)
{
	DIO_RECORD(dio HAMMER2_IO_DEBUG_CALL);
	atomic_add_64(&dio->refs, 1);
}
899