xref: /dragonfly/sys/vfs/hammer2/hammer2_io.c (revision 3851e4b8)
/*
 * Copyright (c) 2013-2018 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "hammer2.h"

#define HAMMER2_DOP_READ	1
#define HAMMER2_DOP_NEW		2
#define HAMMER2_DOP_NEWNZ	3
#define HAMMER2_DOP_READQ	4

/*
 * Implements an abstraction layer for synchronous and asynchronous
 * buffered device I/O.  Can be used as an OS-abstraction but the main
 * purpose is to allow larger buffers to be used against hammer2_chain's
 * using smaller allocations, without causing deadlocks.
 *
 * The DIOs also record temporary state with limited persistence.  This
 * feature is used to keep track of dedupable blocks.
 */
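
/*
 * Typical caller pattern (an illustrative sketch, not a specific call
 * site; it assumes the usual non-underscore wrapper macros from
 * hammer2.h in front of the _hammer2_io_*() functions defined below):
 *
 *	hammer2_io_t *dio;
 *	int error;
 *
 *	error = hammer2_io_bread(hmp, btype, lbase, lsize, &dio);
 *	if (error == 0) {
 *		char *data = hammer2_io_data(dio, lbase);
 *		(operate on up to lsize bytes at data)
 *	}
 *	hammer2_io_putblk(&dio);	(also NULLs the caller's pointer)
 */
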
static int hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg);
static void dio_write_stats_update(hammer2_io_t *dio, struct buf *bp);

static int
hammer2_io_cmp(hammer2_io_t *io1, hammer2_io_t *io2)
{
	if (io1->pbase < io2->pbase)
		return(-1);
	if (io1->pbase > io2->pbase)
		return(1);
	return(0);
}

RB_PROTOTYPE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp, off_t);
RB_GENERATE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp,
		off_t, pbase);

struct hammer2_cleanupcb_info {
	struct hammer2_io_tree tmptree;
	int	count;
};

#if 0
static __inline
uint64_t
hammer2_io_mask(hammer2_io_t *dio, hammer2_off_t off, u_int bytes)
{
	uint64_t mask;
	int i;

	if (bytes < 1024)	/* smaller chunks not supported */
		return 0;

	/*
	 * Calculate crc check mask for larger chunks
	 */
	i = (((off & ~HAMMER2_OFF_MASK_RADIX) - dio->pbase) &
	     HAMMER2_PBUFMASK) >> 10;
	if (i == 0 && bytes == HAMMER2_PBUFSIZE)
		return((uint64_t)-1);
	mask = ((uint64_t)1U << (bytes >> 10)) - 1;
	mask <<= i;

	return mask;
}
#endif

#ifdef HAMMER2_IO_DEBUG

static __inline void
DIO_RECORD(hammer2_io_t *dio HAMMER2_IO_DEBUG_ARGS)
{
	int i;

	i = atomic_fetchadd_int(&dio->debug_index, 1) & HAMMER2_IO_DEBUG_MASK;

	dio->debug_file[i] = file;
	dio->debug_line[i] = line;
	dio->debug_refs[i] = dio->refs;
	dio->debug_td[i] = curthread;
}
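
/*
 * Note that the ring above retains only the most recent
 * HAMMER2_IO_DEBUG_MASK + 1 records; e.g. with a mask of 63 the 65th
 * DIO_RECORD() wraps and overwrites slot 0 (the mask itself is
 * presumably defined alongside the debug fields in hammer2.h).
 */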

#else

#define DIO_RECORD(dio)

#endif

/*
 * Returns the DIO corresponding to the data|radix, creating it if necessary.
 *
 * If createit is 0, NULL can be returned indicating that the DIO does not
 * exist.  (btype) is ignored when createit is 0.
 */
static __inline
hammer2_io_t *
hammer2_io_alloc(hammer2_dev_t *hmp, hammer2_key_t data_off, uint8_t btype,
		 int createit, int *isgoodp)
{
	hammer2_io_t *dio;
	hammer2_io_t *xio;
	hammer2_key_t lbase;
	hammer2_key_t pbase;
	hammer2_key_t pmask;
	uint64_t refs;
	int lsize;
	int psize;

	psize = HAMMER2_PBUFSIZE;
	pmask = ~(hammer2_off_t)(psize - 1);
	lsize = 1 << (int)(data_off & HAMMER2_OFF_MASK_RADIX);
	lbase = data_off & ~HAMMER2_OFF_MASK_RADIX;
	pbase = lbase & pmask;

	if (pbase == 0 || ((lbase + lsize - 1) & pmask) != pbase) {
		kprintf("Illegal: %016jx %016jx+%08x / %016jx\n",
			pbase, lbase, lsize, pmask);
	}
	KKASSERT(pbase != 0 && ((lbase + lsize - 1) & pmask) == pbase);
	*isgoodp = 0;

	/*
	 * Access/Allocate the DIO, bump dio->refs to prevent destruction.
	 */
	hammer2_spin_sh(&hmp->io_spin);
	dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, pbase);
	if (dio) {
		refs = atomic_fetchadd_64(&dio->refs, 1);
		if ((refs & HAMMER2_DIO_MASK) == 0) {
			atomic_add_int(&dio->hmp->iofree_count, -1);
		}
		if (refs & HAMMER2_DIO_GOOD)
			*isgoodp = 1;
		hammer2_spin_unsh(&hmp->io_spin);
	} else if (createit) {
		refs = 0;
		hammer2_spin_unsh(&hmp->io_spin);
		dio = kmalloc(sizeof(*dio), M_HAMMER2, M_INTWAIT | M_ZERO);
		dio->hmp = hmp;
		dio->pbase = pbase;
		dio->psize = psize;
		dio->btype = btype;
		dio->refs = refs + 1;
		dio->act = 5;
		hammer2_spin_ex(&hmp->io_spin);
		xio = RB_INSERT(hammer2_io_tree, &hmp->iotree, dio);
		if (xio == NULL) {
			atomic_add_int(&hammer2_dio_count, 1);
			hammer2_spin_unex(&hmp->io_spin);
		} else {
			refs = atomic_fetchadd_64(&xio->refs, 1);
			if ((refs & HAMMER2_DIO_MASK) == 0)
				atomic_add_int(&xio->hmp->iofree_count, -1);
			if (refs & HAMMER2_DIO_GOOD)
				*isgoodp = 1;
			hammer2_spin_unex(&hmp->io_spin);
			kfree(dio, M_HAMMER2);
			dio = xio;
		}
	} else {
		hammer2_spin_unsh(&hmp->io_spin);
		return NULL;
	}
	dio->ticks = ticks;
	if (dio->act < 10)
		++dio->act;

	return dio;
}
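
/*
 * Worked example of the decomposition above (illustrative, assuming the
 * usual HAMMER2_OFF_MASK_RADIX = 0x3f and HAMMER2_PBUFSIZE = 65536):
 * data_off = 0x000000012345400c encodes radix 0x0c in its low bits, so
 *
 *	lsize = 1 << 0x0c         = 4096
 *	lbase = data_off & ~0x3f  = 0x0000000123454000
 *	pbase = lbase & ~0xffff   = 0x0000000123450000
 *
 * i.e. the 4KB logical block lives 16KB into its 64KB physical DIO
 * buffer, and ((lbase + lsize - 1) & pmask) == pbase holds as asserted.
 */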

/*
 * Acquire the requested dio.  If DIO_GOOD is not set we must instantiate
 * a buffer.  If set the buffer already exists and is good to go.
 */
hammer2_io_t *
_hammer2_io_getblk(hammer2_dev_t *hmp, int btype, off_t lbase,
		   int lsize, int op HAMMER2_IO_DEBUG_ARGS)
{
	hammer2_io_t *dio;
	off_t peof;
	uint64_t orefs;
	uint64_t nrefs;
	int isgood;
	int error;
	int hce;
	int bflags;

	bflags = ((btype == HAMMER2_BREF_TYPE_DATA) ? B_NOTMETA : 0);
	bflags |= B_KVABIO;

	KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize);

	if (op == HAMMER2_DOP_READQ) {
		dio = hammer2_io_alloc(hmp, lbase, btype, 0, &isgood);
		if (dio == NULL)
			return NULL;
		op = HAMMER2_DOP_READ;
	} else {
		dio = hammer2_io_alloc(hmp, lbase, btype, 1, &isgood);
	}

	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();

		/*
		 * Buffer is already good, handle the op and return.
		 */
		if (orefs & HAMMER2_DIO_GOOD) {
			if (isgood == 0)
				cpu_mfence();
			bkvasync(dio->bp);

			switch(op) {
			case HAMMER2_DOP_NEW:
				bzero(hammer2_io_data(dio, lbase), lsize);
				/* fall through */
			case HAMMER2_DOP_NEWNZ:
				atomic_set_long(&dio->refs, HAMMER2_DIO_DIRTY);
				break;
			case HAMMER2_DOP_READ:
			default:
				/* nothing to do */
				break;
			}
			DIO_RECORD(dio HAMMER2_IO_DEBUG_CALL);
			return (dio);
		}

		/*
		 * Try to own the DIO
		 */
		if (orefs & HAMMER2_DIO_INPROG) {
			nrefs = orefs | HAMMER2_DIO_WAITING;
			tsleep_interlock(dio, 0);
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
				tsleep(dio, PINTERLOCKED, "h2dio", hz);
			}
			/* retry */
		} else {
			nrefs = orefs | HAMMER2_DIO_INPROG;
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
				break;
			}
		}
	}

	/*
	 * We break to here if GOOD is not set and we acquired INPROG for
	 * the I/O.
	 */
	KKASSERT(dio->bp == NULL);
	if (btype == HAMMER2_BREF_TYPE_DATA)
		hce = hammer2_cluster_data_read;
	else
		hce = hammer2_cluster_meta_read;

	error = 0;
	if (dio->pbase == (lbase & ~HAMMER2_OFF_MASK_RADIX) &&
	    dio->psize == lsize) {
		switch(op) {
		case HAMMER2_DOP_NEW:
		case HAMMER2_DOP_NEWNZ:
			dio->bp = getblk(dio->hmp->devvp,
					 dio->pbase, dio->psize,
					 GETBLK_KVABIO, 0);
			if (op == HAMMER2_DOP_NEW) {
				bkvasync(dio->bp);
				bzero(dio->bp->b_data, dio->psize);
			}
			atomic_set_long(&dio->refs, HAMMER2_DIO_DIRTY);
			break;
		case HAMMER2_DOP_READ:
		default:
			if (hce > 0) {
				/*
				 * Synchronous cluster I/O for now.
				 */
				peof = (dio->pbase + HAMMER2_SEGMASK64) &
				       ~HAMMER2_SEGMASK64;
				dio->bp = NULL;
				error = cluster_readx(dio->hmp->devvp,
						     peof, dio->pbase,
						     dio->psize, bflags,
						     dio->psize,
						     HAMMER2_PBUFSIZE*hce,
						     &dio->bp);
			} else {
				dio->bp = NULL;
				error = breadnx(dio->hmp->devvp, dio->pbase,
						dio->psize, bflags,
					        NULL, NULL, 0, &dio->bp);
			}
		}
	} else {
		if (hce > 0) {
			/*
			 * Synchronous cluster I/O for now.
			 */
			peof = (dio->pbase + HAMMER2_SEGMASK64) &
			       ~HAMMER2_SEGMASK64;
			error = cluster_readx(dio->hmp->devvp,
					      peof, dio->pbase, dio->psize,
					      bflags,
					      dio->psize, HAMMER2_PBUFSIZE*hce,
					      &dio->bp);
		} else {
			error = breadnx(dio->hmp->devvp, dio->pbase,
				        dio->psize, bflags,
					NULL, NULL, 0, &dio->bp);
		}
		if (dio->bp) {
			/*
			 * Handle NEW flags
			 */
			switch(op) {
			case HAMMER2_DOP_NEW:
				bkvasync(dio->bp);
				bzero(hammer2_io_data(dio, lbase), lsize);
				/* fall through */
			case HAMMER2_DOP_NEWNZ:
				atomic_set_long(&dio->refs, HAMMER2_DIO_DIRTY);
				break;
			case HAMMER2_DOP_READ:
			default:
				break;
			}

			/*
			 * Tell the kernel that the buffer cache is not
			 * meta-data based on the btype.  This allows
			 * swapcache to distinguish between data and
			 * meta-data.
			 */
			switch(btype) {
			case HAMMER2_BREF_TYPE_DATA:
				dio->bp->b_flags |= B_NOTMETA;
				break;
			default:
				break;
			}
		}
	}

	if (dio->bp) {
		bkvasync(dio->bp);
		BUF_KERNPROC(dio->bp);
		dio->bp->b_flags &= ~B_AGE;
		/* dio->bp->b_debug_info2 = dio; */
	}
	dio->error = error;

	/*
	 * Clear INPROG and WAITING, set GOOD, and wake up anyone waiting.
	 */
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();
		nrefs = orefs & ~(HAMMER2_DIO_INPROG | HAMMER2_DIO_WAITING);
		if (error == 0)
			nrefs |= HAMMER2_DIO_GOOD;
		if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
			if (orefs & HAMMER2_DIO_WAITING)
				wakeup(dio);
			break;
		}
		cpu_pause();
	}

	/* XXX error handling */
	DIO_RECORD(dio HAMMER2_IO_DEBUG_CALL);

	return dio;
}
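
#if 0
/*
 * Minimal standalone sketch of the INPROG/WAITING handshake used above
 * (illustrative only; it mirrors the real flag names but is not part of
 * the driver).  One thread wins INPROG via cmpset and instantiates the
 * buffer; losers interlock-sleep and retry until GOOD is set.
 */
static void
example_own_dio(hammer2_io_t *dio)
{
	uint64_t orefs;
	uint64_t nrefs;

	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();
		if (orefs & HAMMER2_DIO_GOOD)
			return;		/* buffer already instantiated */
		if (orefs & HAMMER2_DIO_INPROG) {
			/* another thread owns the I/O, sleep and retry */
			nrefs = orefs | HAMMER2_DIO_WAITING;
			tsleep_interlock(dio, 0);
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs))
				tsleep(dio, PINTERLOCKED, "h2dio", hz);
		} else {
			/* try to become the owner */
			nrefs = orefs | HAMMER2_DIO_INPROG;
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs))
				break;
		}
	}
	/* ...instantiate dio->bp, then clear INPROG and wake waiters... */
}
#endif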

/*
 * Release our ref on *diop.
 *
 * On the 1->0 transition we clear DIO_GOOD, set DIO_INPROG, and dispose
 * of dio->bp.  Then we clean up DIO_INPROG and DIO_WAITING.
 */
void
_hammer2_io_putblk(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	struct buf *bp;
	off_t pbase;
	int psize;
	int dio_limit;
	uint64_t orefs;
	uint64_t nrefs;

	dio = *diop;
	*diop = NULL;
	hmp = dio->hmp;
	DIO_RECORD(dio HAMMER2_IO_DEBUG_CALL);

	KKASSERT((dio->refs & HAMMER2_DIO_MASK) != 0);

	/*
	 * Drop refs.
	 *
	 * On the 1->0 transition clear GOOD and set INPROG, and break.
	 * On any other transition we can return early.
	 */
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();

		if ((orefs & HAMMER2_DIO_MASK) == 1 &&
		    (orefs & HAMMER2_DIO_INPROG) == 0) {
			/*
			 * Lastdrop case, INPROG not yet set.  We set it
			 * now; GOOD must be cleared to prevent the getblk
			 * shortcut.
			 */
			nrefs = orefs - 1;
			nrefs &= ~(HAMMER2_DIO_GOOD | HAMMER2_DIO_DIRTY);
			nrefs |= HAMMER2_DIO_INPROG;
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs))
				break;
		} else if ((orefs & HAMMER2_DIO_MASK) == 1) {
			/*
			 * Lastdrop case, INPROG already set.  We must
			 * wait for INPROG to clear.
			 */
			nrefs = orefs | HAMMER2_DIO_WAITING;
			tsleep_interlock(dio, 0);
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
				tsleep(dio, PINTERLOCKED, "h2dio", hz);
			}
			/* retry */
		} else {
			/*
			 * Normal drop case.
			 */
			nrefs = orefs - 1;
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs))
				return;
			/* retry */
		}
		cpu_pause();
		/* retry */
	}

	/*
	 * Lastdrop (1->0 transition).  INPROG has been set, GOOD and DIRTY
	 * have been cleared.  iofree_count has not yet been incremented;
	 * note that a racing accessor may decrement iofree_count, so we
	 * have to increment it regardless.
	 *
	 * We can now dispose of the buffer, and should do it before calling
	 * io_complete() in case there's a race against a new reference
	 * which causes io_complete() to chain and instantiate the bp again.
	 */
	pbase = dio->pbase;
	psize = dio->psize;
	bp = dio->bp;
	dio->bp = NULL;

	if ((orefs & HAMMER2_DIO_GOOD) && bp) {
		/*
		 * Non-errored disposal of bp
		 */
		if (orefs & HAMMER2_DIO_DIRTY) {
			dio_write_stats_update(dio, bp);

			/*
			 * Allows dirty buffers to accumulate and
			 * possibly be canceled (e.g. by a 'rm'),
			 * will burst-write later.
			 *
			 * We normally do not allow the kernel to
			 * cluster dirty buffers because H2 already
			 * uses a large block size.
			 *
			 * NOTE: Do not use cluster_write() here.  The
			 *	 problem is that due to the way chains
			 *	 are locked, buffers are cycled in and out
			 *	 quite often so the disposal here is not
			 *	 necessarily the final disposal.  Avoid
			 *	 excessive rewriting of the same blocks
			 *	 by using bdwrite().
			 */
#if 0
			off_t peof;
			int hce;

			if ((hce = hammer2_cluster_write) > 0) {
				/*
				 * Allows write-behind to keep the buffer
				 * cache sane.
				 */
				peof = (pbase + HAMMER2_SEGMASK64) &
				       ~HAMMER2_SEGMASK64;
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(bp, peof, psize, hce);
			} else
#endif
			if (hammer2_cluster_write)
				bp->b_flags |= B_CLUSTEROK;
			else
				bp->b_flags &= ~B_CLUSTEROK;
			bdwrite(bp);
		} else if (bp->b_flags & (B_ERROR | B_INVAL | B_RELBUF)) {
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	} else if (bp) {
		/*
		 * Errored disposal of bp
		 */
		brelse(bp);
	}

	/*
	 * Update iofree_count before disposing of the dio
	 */
	hmp = dio->hmp;
	atomic_add_int(&hmp->iofree_count, 1);

	/*
	 * Clear INPROG, GOOD, and WAITING (GOOD should already be clear).
	 */
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();
		nrefs = orefs & ~(HAMMER2_DIO_INPROG | HAMMER2_DIO_GOOD |
				  HAMMER2_DIO_WAITING);
		if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
			if (orefs & HAMMER2_DIO_WAITING)
				wakeup(dio);
			break;
		}
		cpu_pause();
	}

	/*
	 * We cache free buffers so re-use cases can use a shared lock, but
	 * if too many build up we have to clean them out.
	 */
	dio_limit = hammer2_dio_limit;
	if (dio_limit < 256)
		dio_limit = 256;
	if (dio_limit > 1024*1024)
		dio_limit = 1024*1024;
	if (hmp->iofree_count > dio_limit) {
		struct hammer2_cleanupcb_info info;

		RB_INIT(&info.tmptree);
		hammer2_spin_ex(&hmp->io_spin);
		if (hmp->iofree_count > dio_limit) {
			info.count = hmp->iofree_count / 5;
			RB_SCAN(hammer2_io_tree, &hmp->iotree, NULL,
				hammer2_io_cleanup_callback, &info);
		}
		hammer2_spin_unex(&hmp->io_spin);
		hammer2_io_cleanup(hmp, &info.tmptree);
	}
}
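
#if 0
/*
 * Illustrative caller pattern (a sketch, not a real call site): putblk
 * consumes the caller's reference and NULLs the pointer, so the dio
 * must not be touched after the call returns.
 */
static void
example_put(hammer2_io_t **diop)
{
	hammer2_io_putblk(diop);
	KKASSERT(*diop == NULL);
}
#endif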

/*
 * Clean up any DIOs with (INPROG | refs) == 0, moving them to the
 * caller's temporary tree for disposal by hammer2_io_cleanup().
 *
 * Called via RB_SCAN from hammer2_io_putblk() when too many cached
 * DIOs have built up, and to clean up cached DIOs on umount after all
 * activity has been flushed.
 */
static
int
hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg)
{
	struct hammer2_cleanupcb_info *info = arg;
	hammer2_io_t *xio;

	if ((dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0) {
		if (dio->act > 0) {
			int act;

			act = dio->act - (ticks - dio->ticks) / hz - 1;
			if (act > 0) {
				dio->act = act;
				return 0;
			}
			dio->act = 0;
		}
		KKASSERT(dio->bp == NULL);
		if (info->count > 0) {
			RB_REMOVE(hammer2_io_tree, &dio->hmp->iotree, dio);
			xio = RB_INSERT(hammer2_io_tree, &info->tmptree, dio);
			KKASSERT(xio == NULL);
			--info->count;
		}
	}
	return 0;
}

void
hammer2_io_cleanup(hammer2_dev_t *hmp, struct hammer2_io_tree *tree)
{
	hammer2_io_t *dio;

	while ((dio = RB_ROOT(tree)) != NULL) {
		RB_REMOVE(hammer2_io_tree, tree, dio);
		KKASSERT(dio->bp == NULL &&
		    (dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0);
		if (dio->refs & HAMMER2_DIO_DIRTY) {
			kprintf("hammer2_io_cleanup: Dirty buffer "
				"%016jx/%d (bp=%p)\n",
				dio->pbase, dio->psize, dio->bp);
		}
		kfree(dio, M_HAMMER2);
		atomic_add_int(&hammer2_dio_count, -1);
		atomic_add_int(&hmp->iofree_count, -1);
	}
}

/*
 * Returns a pointer to the requested data.
 */
char *
hammer2_io_data(hammer2_io_t *dio, off_t lbase)
{
	struct buf *bp;
	int off;

	bp = dio->bp;
	KKASSERT(bp != NULL);
	bkvasync(bp);
	off = (lbase & ~HAMMER2_OFF_MASK_RADIX) - bp->b_loffset;
	KKASSERT(off >= 0 && off < bp->b_bufsize);
	return(bp->b_data + off);
}
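
/*
 * Example of the offset math above (illustrative): with
 * bp->b_loffset == 0x123450000 and lbase == 0x12345400c (the low radix
 * bits are masked off), off computes to 0x4000, i.e. the logical block
 * starts 16KB into the 64KB buffer.
 */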

int
hammer2_io_new(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
	       hammer2_io_t **diop)
{
	*diop = hammer2_io_getblk(hmp, btype, lbase, lsize, HAMMER2_DOP_NEW);
	return ((*diop)->error);
}

int
hammer2_io_newnz(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
		 hammer2_io_t **diop)
{
	*diop = hammer2_io_getblk(hmp, btype, lbase, lsize, HAMMER2_DOP_NEWNZ);
	return ((*diop)->error);
}

int
_hammer2_io_bread(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
		hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
#ifdef HAMMER2_IO_DEBUG
	hammer2_io_t *dio;
#endif

	*diop = _hammer2_io_getblk(hmp, btype, lbase, lsize,
				   HAMMER2_DOP_READ HAMMER2_IO_DEBUG_CALL);
#ifdef HAMMER2_IO_DEBUG
	if ((dio = *diop) != NULL) {
		int i = (dio->debug_index - 1) & HAMMER2_IO_DEBUG_MASK;
		dio->debug_data[i] = debug_data;
	}
#endif
	return ((*diop)->error);
}

hammer2_io_t *
_hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase,
		     int lsize HAMMER2_IO_DEBUG_ARGS)
{
	hammer2_io_t *dio;

	dio = _hammer2_io_getblk(hmp, 0, lbase, lsize,
				 HAMMER2_DOP_READQ HAMMER2_IO_DEBUG_CALL);
	return dio;
}

void
_hammer2_io_bawrite(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
	_hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
}

void
_hammer2_io_bdwrite(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
	_hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
}

int
_hammer2_io_bwrite(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
	_hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
	return (0);	/* XXX */
}

void
hammer2_io_setdirty(hammer2_io_t *dio)
{
	atomic_set_64(&dio->refs, HAMMER2_DIO_DIRTY);
}

/*
 * This routine is called when a MODIFIED chain is being DESTROYED,
 * in an attempt to allow the related buffer cache buffer to be
 * invalidated and discarded instead of flushing it to disk.
 *
 * At the moment this case is only really useful for file meta-data.
 * File data is already handled via the logical buffer cache associated
 * with the vnode, and will be discarded if it was never flushed to disk.
 * File meta-data may include inodes, directory entries, and indirect blocks.
 *
 * XXX
 * However, our DIO buffers are PBUFSIZE'd (64KB), and the area being
 * invalidated might be smaller.  Most of the meta-data structures above
 * are in the 'smaller' category.  For now, don't try to invalidate the
 * data areas.
 */
void
hammer2_io_inval(hammer2_io_t *dio, hammer2_off_t data_off, u_int bytes)
{
	/* NOP */
}

void
_hammer2_io_brelse(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	_hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
}

void
_hammer2_io_bqrelse(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	_hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
}

/*
 * Set dedup validation bits in a DIO.  We do not need the buffer cache
 * buffer for this.  This must be done concurrent with setting bits in
 * the freemap so as to interlock with bulkfree's clearing of those bits.
 */
void
hammer2_io_dedup_set(hammer2_dev_t *hmp, hammer2_blockref_t *bref)
{
	hammer2_io_t *dio;
	uint64_t mask;
	int lsize;
	int isgood;

	dio = hammer2_io_alloc(hmp, bref->data_off, bref->type, 1, &isgood);
	lsize = 1 << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	mask = hammer2_dedup_mask(dio, bref->data_off, lsize);
	atomic_clear_64(&dio->dedup_valid, mask);
	atomic_set_64(&dio->dedup_alloc, mask);
	hammer2_io_putblk(&dio);
}
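
/*
 * Sketch of the dedup mask layout (an assumption: hammer2_dedup_mask()
 * is taken to follow the same 1KB-granular layout as the disabled
 * hammer2_io_mask() earlier in this file): a 4KB block at buffer offset
 * 0x4000 covers four 1KB chunks starting at chunk 16, so
 * mask == (uint64_t)0xf << 16.
 */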

/*
 * Clear dedup validation bits in a DIO.  This is typically done when
 * a modified chain is destroyed or by the bulkfree code.  No buffer
 * is needed for this operation.  If the DIO no longer exists it is
 * equivalent to the bits not being set.
 */
void
hammer2_io_dedup_delete(hammer2_dev_t *hmp, uint8_t btype,
			hammer2_off_t data_off, u_int bytes)
{
	hammer2_io_t *dio;
	uint64_t mask;
	int isgood;

	if ((data_off & ~HAMMER2_OFF_MASK_RADIX) == 0)
		return;
	if (btype != HAMMER2_BREF_TYPE_DATA)
		return;
	dio = hammer2_io_alloc(hmp, data_off, btype, 0, &isgood);
	if (dio) {
		if (data_off < dio->pbase ||
		    (data_off & ~HAMMER2_OFF_MASK_RADIX) + bytes >
		    dio->pbase + dio->psize) {
			panic("hammer2_dedup_delete: DATAOFF BAD "
			      "%016jx/%d %016jx\n",
			      data_off, bytes, dio->pbase);
		}
		mask = hammer2_dedup_mask(dio, data_off, bytes);
		atomic_clear_64(&dio->dedup_alloc, mask);
		atomic_clear_64(&dio->dedup_valid, mask);
		hammer2_io_putblk(&dio);
	}
}

/*
 * Assert that dedup allocation bits in a DIO are not set.  This operation
 * does not require a buffer.  The DIO does not need to exist.
 */
void
hammer2_io_dedup_assert(hammer2_dev_t *hmp, hammer2_off_t data_off, u_int bytes)
{
	hammer2_io_t *dio;
	int isgood;

	dio = hammer2_io_alloc(hmp, data_off, HAMMER2_BREF_TYPE_DATA,
			       0, &isgood);
	if (dio) {
		KASSERT((dio->dedup_alloc &
			  hammer2_dedup_mask(dio, data_off, bytes)) == 0,
			("hammer2_dedup_assert: %016jx/%d %016jx/%016jx",
			data_off,
			bytes,
			hammer2_dedup_mask(dio, data_off, bytes),
			dio->dedup_alloc));
		hammer2_io_putblk(&dio);
	}
}

static
void
dio_write_stats_update(hammer2_io_t *dio, struct buf *bp)
{
	long *counterp;

	if (bp->b_flags & B_DELWRI)
		return;

	switch(dio->btype) {
	case 0:
		return;
	case HAMMER2_BREF_TYPE_DATA:
		counterp = &hammer2_iod_file_write;
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_INODE:
		counterp = &hammer2_iod_meta_write;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
		counterp = &hammer2_iod_indr_write;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		counterp = &hammer2_iod_fmap_write;
		break;
	default:
		counterp = &hammer2_iod_volu_write;
		break;
	}
	*counterp += dio->psize;
}

void
hammer2_io_bkvasync(hammer2_io_t *dio)
{
	KKASSERT(dio->bp != NULL);
	bkvasync(dio->bp);
}

/*
 * Ref a dio that is already owned
 */
void
_hammer2_io_ref(hammer2_io_t *dio HAMMER2_IO_DEBUG_ARGS)
{
	DIO_RECORD(dio HAMMER2_IO_DEBUG_CALL);
	atomic_add_64(&dio->refs, 1);
}