/*
 * Copyright (c) 2013-2018 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "hammer2.h"

#define HAMMER2_DOP_READ	1
#define HAMMER2_DOP_NEW		2
#define HAMMER2_DOP_NEWNZ	3
#define HAMMER2_DOP_READQ	4
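
/*
 * Device I/O ops accepted by _hammer2_io_getblk():
 *
 * READ  - Acquire the buffer, reading it from the media if needed.
 * NEW   - Acquire the buffer and zero the requested range.
 * NEWNZ - Acquire the buffer without zeroing it.
 * READQ - As READ, but do not create the DIO if it does not already
 *	   exist (the call can return NULL).
 */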

/*
 * Implements an abstraction layer for synchronous and asynchronous
 * buffered device I/O.  It can serve as an OS abstraction, but its main
 * purpose is to allow larger buffers to back hammer2_chains that use
 * smaller allocations, without causing deadlocks.
 *
 * The DIOs also record temporary state with limited persistence.  This
 * feature is used to keep track of dedupable blocks.
 */
static int hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg);
static void dio_write_stats_update(hammer2_io_t *dio, struct buf *bp);

static int
hammer2_io_cmp(hammer2_io_t *io1, hammer2_io_t *io2)
{
	if (io1->pbase < io2->pbase)
		return(-1);
	if (io1->pbase > io2->pbase)
		return(1);
	return(0);
}
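
/*
 * Per-device DIOs are indexed in hmp->iotree, a red-black tree keyed
 * by pbase (the physical buffer base offset).  RB_GENERATE2 provides
 * RB_LOOKUP() directly on the pbase field.
 */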

RB_PROTOTYPE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp, off_t);
RB_GENERATE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp,
		off_t, pbase);

struct hammer2_cleanupcb_info {
	struct hammer2_io_tree tmptree;
	int	count;
};

#if 0
static __inline
uint64_t
hammer2_io_mask(hammer2_io_t *dio, hammer2_off_t off, u_int bytes)
{
	uint64_t mask;
	int i;

	if (bytes < 1024)	/* smaller chunks not supported */
		return 0;

	/*
	 * Calculate crc check mask for larger chunks
	 */
	i = (((off & ~HAMMER2_OFF_MASK_RADIX) - dio->pbase) &
	     HAMMER2_PBUFMASK) >> 10;
	if (i == 0 && bytes == HAMMER2_PBUFSIZE)
		return((uint64_t)-1);
	mask = ((uint64_t)1U << (bytes >> 10)) - 1;
	mask <<= i;

	return mask;
}
#endif

#ifdef HAMMER2_IO_DEBUG

static __inline void
DIO_RECORD(hammer2_io_t *dio HAMMER2_IO_DEBUG_ARGS)
{
	int i;

	i = atomic_fetchadd_int(&dio->debug_index, 1) & HAMMER2_IO_DEBUG_MASK;

	dio->debug_file[i] = file;
	dio->debug_line[i] = line;
	dio->debug_refs[i] = dio->refs;
	dio->debug_td[i] = curthread;
}

#else

#define DIO_RECORD(dio)

#endif

/*
 * Returns the DIO corresponding to the data|radix offset, creating it
 * if necessary.
 *
 * If createit is 0, NULL can be returned to indicate that the DIO does
 * not exist.  (btype) is ignored when createit is 0.
 */
static __inline
hammer2_io_t *
hammer2_io_alloc(hammer2_dev_t *hmp, hammer2_key_t data_off, uint8_t btype,
		 int createit, int *isgoodp)
{
	hammer2_io_t *dio;
	hammer2_io_t *xio;
	hammer2_key_t lbase;
	hammer2_key_t pbase;
	hammer2_key_t pmask;
	uint64_t refs;
	int lsize;
	int psize;

	psize = HAMMER2_PBUFSIZE;
	pmask = ~(hammer2_off_t)(psize - 1);
	lsize = 1 << (int)(data_off & HAMMER2_OFF_MASK_RADIX);
	lbase = data_off & ~HAMMER2_OFF_MASK_RADIX;
	pbase = lbase & pmask;

	if (pbase == 0 || ((lbase + lsize - 1) & pmask) != pbase) {
		kprintf("Illegal: %016jx %016jx+%08x / %016jx\n",
			pbase, lbase, lsize, pmask);
	}
	KKASSERT(pbase != 0 && ((lbase + lsize - 1) & pmask) == pbase);
	*isgoodp = 0;

	/*
	 * Access/Allocate the DIO, bumping dio->refs to prevent destruction.
	 *
	 * If DIO_GOOD is set, the ref prevents the buffer from being
	 * cleared out from under us; we can set *isgoodp and the caller
	 * can operate on the buffer without any further interaction.
	 */
	hammer2_spin_sh(&hmp->io_spin);
	dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, pbase);
	if (dio) {
		refs = atomic_fetchadd_64(&dio->refs, 1);
		if ((refs & HAMMER2_DIO_MASK) == 0) {
			atomic_add_int(&dio->hmp->iofree_count, -1);
		}
		if (refs & HAMMER2_DIO_GOOD)
			*isgoodp = 1;
		hammer2_spin_unsh(&hmp->io_spin);
	} else if (createit) {
		refs = 0;
		hammer2_spin_unsh(&hmp->io_spin);
		dio = kmalloc(sizeof(*dio), M_HAMMER2, M_INTWAIT | M_ZERO);
		dio->hmp = hmp;
		dio->pbase = pbase;
		dio->psize = psize;
		dio->btype = btype;
		dio->refs = refs + 1;
		dio->act = 5;
		hammer2_spin_ex(&hmp->io_spin);
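		/*
		 * Insert the new DIO.  If RB_INSERT reports a collision,
		 * another thread raced us; take a ref on the existing
		 * DIO and discard our local allocation.
		 */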
		xio = RB_INSERT(hammer2_io_tree, &hmp->iotree, dio);
		if (xio == NULL) {
			atomic_add_int(&hammer2_dio_count, 1);
			hammer2_spin_unex(&hmp->io_spin);
		} else {
			refs = atomic_fetchadd_64(&xio->refs, 1);
			if ((refs & HAMMER2_DIO_MASK) == 0)
				atomic_add_int(&xio->hmp->iofree_count, -1);
			if (refs & HAMMER2_DIO_GOOD)
				*isgoodp = 1;
			hammer2_spin_unex(&hmp->io_spin);
			kfree(dio, M_HAMMER2);
			dio = xio;
		}
	} else {
		hammer2_spin_unsh(&hmp->io_spin);
		return NULL;
	}
	dio->ticks = ticks;
	if (dio->act < 10)
		++dio->act;

	return dio;
}

/*
 * Acquire the requested dio.  If DIO_GOOD is not set we must instantiate
 * a buffer.  If it is set, the buffer already exists and is good to go.
 */
hammer2_io_t *
_hammer2_io_getblk(hammer2_dev_t *hmp, int btype, off_t lbase,
		   int lsize, int op HAMMER2_IO_DEBUG_ARGS)
{
	hammer2_io_t *dio;
	off_t peof;
	uint64_t orefs;
	uint64_t nrefs;
	int isgood;
	int error;
	int hce;
	int bflags;

	bflags = ((btype == HAMMER2_BREF_TYPE_DATA) ? B_NOTMETA : 0);
	bflags |= B_KVABIO;

	KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize);

	if (op == HAMMER2_DOP_READQ) {
		dio = hammer2_io_alloc(hmp, lbase, btype, 0, &isgood);
		if (dio == NULL)
			return NULL;
		op = HAMMER2_DOP_READ;
	} else {
		dio = hammer2_io_alloc(hmp, lbase, btype, 1, &isgood);
	}

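	/*
	 * dio->refs packs the reference count (HAMMER2_DIO_MASK) together
	 * with the state flags (GOOD, INPROG, WAITING, DIRTY, FLUSH) in a
	 * single 64-bit word so that both can be updated atomically.
	 */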
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();

		/*
		 * Buffer is already good, handle the op and return.
		 */
		if (orefs & HAMMER2_DIO_GOOD) {
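			/*
			 * If GOOD was not already set when we looked the
			 * DIO up, another cpu may have instantiated the
			 * buffer; fence so its stores are visible to us.
			 */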
			if (isgood == 0)
				cpu_mfence();
			bkvasync(dio->bp);

			switch(op) {
			case HAMMER2_DOP_NEW:
				bzero(hammer2_io_data(dio, lbase), lsize);
				/* fall through */
			case HAMMER2_DOP_NEWNZ:
				atomic_set_long(&dio->refs, HAMMER2_DIO_DIRTY);
				break;
			case HAMMER2_DOP_READ:
			default:
				/* nothing to do */
				break;
			}
			DIO_RECORD(dio HAMMER2_IO_DEBUG_CALL);
			return (dio);
		}

		/*
		 * Try to own the DIO
		 */
		if (orefs & HAMMER2_DIO_INPROG) {
			nrefs = orefs | HAMMER2_DIO_WAITING;
			tsleep_interlock(dio, 0);
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
				tsleep(dio, PINTERLOCKED, "h2dio", hz);
			}
			/* retry */
		} else {
			nrefs = orefs | HAMMER2_DIO_INPROG;
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
				break;
			}
		}
	}

	/*
	 * We break to here if GOOD is not set and we acquired INPROG for
	 * the I/O.
	 */
	KKASSERT(dio->bp == NULL);
	if (btype == HAMMER2_BREF_TYPE_DATA)
		hce = hammer2_cluster_data_read;
	else
		hce = hammer2_cluster_meta_read;

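	/*
	 * hce is the cluster read-ahead count taken from the
	 * hammer2_cluster_data_read/hammer2_cluster_meta_read tunables,
	 * expressed in HAMMER2_PBUFSIZE buffers; zero disables clustered
	 * reads entirely.
	 */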
	error = 0;
	if (dio->pbase == (lbase & ~HAMMER2_OFF_MASK_RADIX) &&
	    dio->psize == lsize) {
		switch(op) {
		case HAMMER2_DOP_NEW:
		case HAMMER2_DOP_NEWNZ:
			dio->bp = getblk(dio->hmp->devvp,
					 dio->pbase, dio->psize,
					 GETBLK_KVABIO, 0);
			if (op == HAMMER2_DOP_NEW) {
				bkvasync(dio->bp);
				bzero(dio->bp->b_data, dio->psize);
			}
			atomic_set_long(&dio->refs, HAMMER2_DIO_DIRTY);
			break;
		case HAMMER2_DOP_READ:
		default:
			KKASSERT(dio->bp == NULL);
			if (hce > 0) {
				/*
				 * Synchronous cluster I/O for now.
				 */
				peof = (dio->pbase + HAMMER2_SEGMASK64) &
				       ~HAMMER2_SEGMASK64;
				error = cluster_readx(dio->hmp->devvp,
						      peof, dio->pbase,
						      dio->psize, bflags,
						      dio->psize,
						      HAMMER2_PBUFSIZE*hce,
						      &dio->bp);
			} else {
				error = breadnx(dio->hmp->devvp, dio->pbase,
						dio->psize, bflags,
						NULL, NULL, 0, &dio->bp);
			}
		}
	} else {
		if (hce > 0) {
			/*
			 * Synchronous cluster I/O for now.
			 */
			peof = (dio->pbase + HAMMER2_SEGMASK64) &
			       ~HAMMER2_SEGMASK64;
			error = cluster_readx(dio->hmp->devvp,
					      peof, dio->pbase, dio->psize,
					      bflags,
					      dio->psize, HAMMER2_PBUFSIZE*hce,
					      &dio->bp);
		} else {
			error = breadnx(dio->hmp->devvp, dio->pbase,
					dio->psize, bflags,
					NULL, NULL, 0, &dio->bp);
		}
		if (dio->bp) {
			/*
			 * Handle NEW flags
			 */
			switch(op) {
			case HAMMER2_DOP_NEW:
				bkvasync(dio->bp);
				bzero(hammer2_io_data(dio, lbase), lsize);
				/* fall through */
			case HAMMER2_DOP_NEWNZ:
				atomic_set_long(&dio->refs, HAMMER2_DIO_DIRTY);
				break;
			case HAMMER2_DOP_READ:
			default:
				break;
			}

			/*
			 * Tell the kernel that the buffer is not meta-data
			 * when btype indicates file data.  This allows
			 * swapcache to distinguish between data and
			 * meta-data.
			 */
			switch(btype) {
			case HAMMER2_BREF_TYPE_DATA:
				dio->bp->b_flags |= B_NOTMETA;
				break;
			default:
				break;
			}
		}
	}

	if (dio->bp) {
		bkvasync(dio->bp);
		BUF_KERNPROC(dio->bp);
		dio->bp->b_flags &= ~B_AGE;
		/* dio->bp->b_debug_info2 = dio; */
	}
	dio->error = error;

	/*
	 * Clear INPROG and WAITING, set GOOD, and wake up anyone waiting.
	 */
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();
		nrefs = orefs & ~(HAMMER2_DIO_INPROG | HAMMER2_DIO_WAITING);
		if (error == 0)
			nrefs |= HAMMER2_DIO_GOOD;
		if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
			if (orefs & HAMMER2_DIO_WAITING)
				wakeup(dio);
			break;
		}
		cpu_pause();
	}

	/* XXX error handling */
	DIO_RECORD(dio HAMMER2_IO_DEBUG_CALL);

	return dio;
}

/*
 * Release our ref on *diop.
 *
 * On the 1->0 transition we clear DIO_GOOD, set DIO_INPROG, and dispose
 * of dio->bp.  Then we clear DIO_INPROG and DIO_WAITING.
 */
void
_hammer2_io_putblk(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	struct buf *bp;
	off_t pbase;
	int psize;
	int dio_limit;
	uint64_t orefs;
	uint64_t nrefs;

	dio = *diop;
	*diop = NULL;
	hmp = dio->hmp;
	DIO_RECORD(dio HAMMER2_IO_DEBUG_CALL);

	KKASSERT((dio->refs & HAMMER2_DIO_MASK) != 0);

	/*
	 * Drop refs.
	 *
	 * On the 1->0 transition clear GOOD and set INPROG, and break.
	 * On any other transition we can return early.
	 */
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();

		if ((orefs & HAMMER2_DIO_MASK) == 1 &&
		    (orefs & HAMMER2_DIO_INPROG) == 0) {
			/*
			 * Lastdrop case, we can set INPROG.  GOOD must be
			 * cleared to prevent the getblk shortcut.
			 */
			nrefs = orefs - 1;
			nrefs &= ~(HAMMER2_DIO_GOOD | HAMMER2_DIO_DIRTY);
			nrefs |= HAMMER2_DIO_INPROG;
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs))
				break;
		} else if ((orefs & HAMMER2_DIO_MASK) == 1) {
			/*
			 * Lastdrop case, INPROG already set.  We must
			 * wait for INPROG to clear.
			 */
			nrefs = orefs | HAMMER2_DIO_WAITING;
			tsleep_interlock(dio, 0);
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
				tsleep(dio, PINTERLOCKED, "h2dio", hz);
			}
			/* retry */
		} else {
			/*
			 * Normal drop case.
			 */
			nrefs = orefs - 1;
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs))
				return;
			/* retry */
		}
		cpu_pause();
		/* retry */
	}

	/*
	 * Lastdrop (1->0 transition).  INPROG has been set, GOOD and DIRTY
	 * have been cleared.  iofree_count has not yet been incremented;
	 * note that a racing accessor will decrement iofree_count, so we
	 * have to increment it regardless.
	 *
	 * We can now dispose of the buffer, and should do it before calling
	 * io_complete() in case there's a race against a new reference
	 * which causes io_complete() to chain and instantiate the bp again.
	 */
	pbase = dio->pbase;
	psize = dio->psize;
	bp = dio->bp;
	dio->bp = NULL;

	if ((orefs & HAMMER2_DIO_GOOD) && bp) {
		/*
		 * Non-errored disposal of bp
		 */
		if (orefs & HAMMER2_DIO_DIRTY) {
			dio_write_stats_update(dio, bp);

			/*
			 * Allow dirty buffers to accumulate and possibly
			 * be canceled (e.g. by a 'rm'); by default we
			 * burst-write them later.
			 *
			 * We generally do NOT want to issue an actual
			 * b[a]write() or cluster_write() here.  Due to
			 * the way chains are locked, buffers may be cycled
			 * in and out quite often and disposal here can cause
			 * multiple writes or write-read stalls.
			 *
			 * If FLUSH is set we do want to issue the actual
			 * write.  This typically occurs in the write-behind
			 * case when writing to large files.
			 */
			off_t peof;
			int hce;
			if (dio->refs & HAMMER2_DIO_FLUSH) {
				if ((hce = hammer2_cluster_write) != 0) {
					peof = (pbase + HAMMER2_SEGMASK64) &
					       ~HAMMER2_SEGMASK64;
					bp->b_flags |= B_CLUSTEROK;
					cluster_write(bp, peof, psize, hce);
				} else {
					bp->b_flags &= ~B_CLUSTEROK;
					bawrite(bp);
				}
			} else {
				bp->b_flags &= ~B_CLUSTEROK;
				bdwrite(bp);
			}
		} else if (bp->b_flags & (B_ERROR | B_INVAL | B_RELBUF)) {
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	} else if (bp) {
		/*
		 * Errored disposal of bp
		 */
		brelse(bp);
	}

	/*
	 * Update iofree_count before disposing of the dio
	 */
	hmp = dio->hmp;
	atomic_add_int(&hmp->iofree_count, 1);

	/*
	 * Clear INPROG, GOOD, and WAITING (GOOD should already be clear).
	 *
	 * Also clear FLUSH as it was handled above.
	 */
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();
		nrefs = orefs & ~(HAMMER2_DIO_INPROG | HAMMER2_DIO_GOOD |
				  HAMMER2_DIO_WAITING | HAMMER2_DIO_FLUSH);
		if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
			if (orefs & HAMMER2_DIO_WAITING)
				wakeup(dio);
			break;
		}
		cpu_pause();
	}

	/*
	 * We cache free buffers so re-use cases can use a shared lock, but
	 * if too many build up we have to clean them out.
	 */
	dio_limit = hammer2_dio_limit;
	if (dio_limit < 256)
		dio_limit = 256;
	if (dio_limit > 1024*1024)
		dio_limit = 1024*1024;
	if (hmp->iofree_count > dio_limit) {
		struct hammer2_cleanupcb_info info;

		RB_INIT(&info.tmptree);
		hammer2_spin_ex(&hmp->io_spin);
		if (hmp->iofree_count > dio_limit) {
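			/*
			 * The count was rechecked under the spinlock;
			 * trim roughly 1/5 of the cached DIOs.
			 */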
			info.count = hmp->iofree_count / 5;
			RB_SCAN(hammer2_io_tree, &hmp->iotree, NULL,
				hammer2_io_cleanup_callback, &info);
		}
		hammer2_spin_unex(&hmp->io_spin);
		hammer2_io_cleanup(hmp, &info.tmptree);
	}
}

/*
 * Clean up any DIOs whose (refs & (MASK | INPROG)) is zero.
 *
 * Called to clean up cached DIOs on umount after all activity has been
 * flushed, and from putblk when too many free DIOs have accumulated.
 */
static
int
hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg)
{
	struct hammer2_cleanupcb_info *info = arg;
	hammer2_io_t *xio;

	if ((dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0) {
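		/*
		 * dio->act is an activity rating bumped on each access
		 * (capped at 10) and decayed here by the number of
		 * seconds since the last access; only fully-decayed
		 * DIOs are recycled.
		 */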
		if (dio->act > 0) {
			int act;

			act = dio->act - (ticks - dio->ticks) / hz - 1;
			if (act > 0) {
				dio->act = act;
				return 0;
			}
			dio->act = 0;
		}
		KKASSERT(dio->bp == NULL);
		if (info->count > 0) {
			RB_REMOVE(hammer2_io_tree, &dio->hmp->iotree, dio);
			xio = RB_INSERT(hammer2_io_tree, &info->tmptree, dio);
			KKASSERT(xio == NULL);
			--info->count;
		}
	}
	return 0;
}

void
hammer2_io_cleanup(hammer2_dev_t *hmp, struct hammer2_io_tree *tree)
{
	hammer2_io_t *dio;

	while ((dio = RB_ROOT(tree)) != NULL) {
		RB_REMOVE(hammer2_io_tree, tree, dio);
		KKASSERT(dio->bp == NULL &&
		    (dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0);
		if (dio->refs & HAMMER2_DIO_DIRTY) {
			kprintf("hammer2_io_cleanup: Dirty buffer "
				"%016jx/%d (bp=%p)\n",
				dio->pbase, dio->psize, dio->bp);
		}
		kfree(dio, M_HAMMER2);
		atomic_add_int(&hammer2_dio_count, -1);
		atomic_add_int(&hmp->iofree_count, -1);
	}
}

/*
 * Returns a pointer to the requested data.
 */
char *
hammer2_io_data(hammer2_io_t *dio, off_t lbase)
{
	struct buf *bp;
	int off;

	bp = dio->bp;
	KKASSERT(bp != NULL);
	bkvasync(bp);
	off = (lbase & ~HAMMER2_OFF_MASK_RADIX) - bp->b_loffset;
	KKASSERT(off >= 0 && off < bp->b_bufsize);
	return(bp->b_data + off);
}

int
hammer2_io_new(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
	       hammer2_io_t **diop)
{
	*diop = hammer2_io_getblk(hmp, btype, lbase, lsize, HAMMER2_DOP_NEW);
	return ((*diop)->error);
}

int
hammer2_io_newnz(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
		 hammer2_io_t **diop)
{
	*diop = hammer2_io_getblk(hmp, btype, lbase, lsize, HAMMER2_DOP_NEWNZ);
	return ((*diop)->error);
}

int
_hammer2_io_bread(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
		hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
#ifdef HAMMER2_IO_DEBUG
	hammer2_io_t *dio;
#endif

	*diop = _hammer2_io_getblk(hmp, btype, lbase, lsize,
				   HAMMER2_DOP_READ HAMMER2_IO_DEBUG_CALL);
#ifdef HAMMER2_IO_DEBUG
	if ((dio = *diop) != NULL) {
		int i = (dio->debug_index - 1) & HAMMER2_IO_DEBUG_MASK;
		dio->debug_data[i] = debug_data;
	}
#endif
	return ((*diop)->error);
}
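
/*
 * Typical read-side usage of this API (a minimal sketch; chain locking
 * is assumed to be handled by the caller and error handling is
 * abbreviated):
 *
 *	hammer2_io_t *dio;
 *	char *data;
 *	int error;
 *
 *	error = hammer2_io_bread(hmp, btype, lbase, lsize, &dio);
 *	if (error == 0)
 *		data = hammer2_io_data(dio, lbase);  (valid for lsize bytes)
 *	hammer2_io_bqrelse(&dio);
 */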

hammer2_io_t *
_hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase,
		     int lsize HAMMER2_IO_DEBUG_ARGS)
{
	hammer2_io_t *dio;

	dio = _hammer2_io_getblk(hmp, 0, lbase, lsize,
				 HAMMER2_DOP_READQ HAMMER2_IO_DEBUG_CALL);
	return dio;
}

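/*
 * Dirty-release variants.  The flags set here tell the final putblk
 * how to dispose of the buffer: DIRTY alone defers the write via
 * bdwrite(), while DIRTY|FLUSH forces it out immediately via
 * cluster_write() or bawrite().
 */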
void
_hammer2_io_bawrite(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY |
				      HAMMER2_DIO_FLUSH);
	_hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
}

void
_hammer2_io_bdwrite(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
	_hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
}

int
_hammer2_io_bwrite(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY |
				      HAMMER2_DIO_FLUSH);
	_hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
	return (0);	/* XXX */
}

void
hammer2_io_setdirty(hammer2_io_t *dio)
{
	atomic_set_64(&dio->refs, HAMMER2_DIO_DIRTY);
}

/*
 * This routine is called when a MODIFIED chain is being DESTROYED,
 * in an attempt to allow the related buffer cache buffer to be
 * invalidated and discarded instead of flushing it to disk.
 *
 * At the moment this case is only really useful for file meta-data.
 * File data is already handled via the logical buffer cache associated
 * with the vnode, and will be discarded if it was never flushed to disk.
 * File meta-data may include inodes, directory entries, and indirect blocks.
 *
 * XXX
 * However, our DIO buffers are PBUFSIZE'd (64KB), and the area being
 * invalidated might be smaller.  Most of the meta-data structures above
 * are in the 'smaller' category.  For now, don't try to invalidate the
 * data areas.
 */
void
hammer2_io_inval(hammer2_io_t *dio, hammer2_off_t data_off, u_int bytes)
{
	/* NOP */
}

void
_hammer2_io_brelse(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	_hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
}

void
_hammer2_io_bqrelse(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	_hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
}

/*
 * Set dedup validation bits in a DIO.  We do not need the buffer cache
 * buffer for this.  This must be done concurrently with setting bits in
 * the freemap so as to interlock with bulkfree's clearing of those bits.
 */
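/*
 * Note: each dedup mask bit covers a 1KB sub-block of the 64KB DIO
 * buffer (compare the disabled hammer2_io_mask() above), so a single
 * 64-bit word describes the entire buffer.
 */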
void
hammer2_io_dedup_set(hammer2_dev_t *hmp, hammer2_blockref_t *bref)
{
	hammer2_io_t *dio;
	uint64_t mask;
	int lsize;
	int isgood;

	dio = hammer2_io_alloc(hmp, bref->data_off, bref->type, 1, &isgood);
	lsize = 1 << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	mask = hammer2_dedup_mask(dio, bref->data_off, lsize);
	atomic_clear_64(&dio->dedup_valid, mask);
	atomic_set_64(&dio->dedup_alloc, mask);
	hammer2_io_putblk(&dio);
}

/*
 * Clear dedup validation bits in a DIO.  This is typically done when
 * a modified chain is destroyed or by the bulkfree code.  No buffer
 * is needed for this operation.  If the DIO no longer exists it is
 * equivalent to the bits not being set.
 */
void
hammer2_io_dedup_delete(hammer2_dev_t *hmp, uint8_t btype,
			hammer2_off_t data_off, u_int bytes)
{
	hammer2_io_t *dio;
	uint64_t mask;
	int isgood;

	if ((data_off & ~HAMMER2_OFF_MASK_RADIX) == 0)
		return;
	if (btype != HAMMER2_BREF_TYPE_DATA)
		return;
	dio = hammer2_io_alloc(hmp, data_off, btype, 0, &isgood);
	if (dio) {
		if (data_off < dio->pbase ||
		    (data_off & ~HAMMER2_OFF_MASK_RADIX) + bytes >
		    dio->pbase + dio->psize) {
			panic("hammer2_io_dedup_delete: DATAOFF BAD "
			      "%016jx/%d %016jx\n",
			      data_off, bytes, dio->pbase);
		}
		mask = hammer2_dedup_mask(dio, data_off, bytes);
		atomic_clear_64(&dio->dedup_alloc, mask);
		atomic_clear_64(&dio->dedup_valid, mask);
		hammer2_io_putblk(&dio);
	}
}

/*
 * Assert that dedup allocation bits in a DIO are not set.  This operation
 * does not require a buffer.  The DIO does not need to exist.
 */
void
hammer2_io_dedup_assert(hammer2_dev_t *hmp, hammer2_off_t data_off, u_int bytes)
{
	hammer2_io_t *dio;
	int isgood;

	dio = hammer2_io_alloc(hmp, data_off, HAMMER2_BREF_TYPE_DATA,
			       0, &isgood);
	if (dio) {
		KASSERT((dio->dedup_alloc &
			 hammer2_dedup_mask(dio, data_off, bytes)) == 0,
			("hammer2_dedup_assert: %016jx/%d %016jx/%016jx",
			data_off,
			bytes,
			hammer2_dedup_mask(dio, data_off, bytes),
			dio->dedup_alloc));
		hammer2_io_putblk(&dio);
	}
}

static
void
dio_write_stats_update(hammer2_io_t *dio, struct buf *bp)
{
	long *counterp;

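	/*
	 * A buffer already marked B_DELWRI was counted when it was
	 * first queued as a delayed write; don't count it twice.
	 */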
	if (bp->b_flags & B_DELWRI)
		return;

	switch(dio->btype) {
	case 0:
		return;
	case HAMMER2_BREF_TYPE_DATA:
		counterp = &hammer2_iod_file_write;
		break;
	case HAMMER2_BREF_TYPE_DIRENT:
	case HAMMER2_BREF_TYPE_INODE:
		counterp = &hammer2_iod_meta_write;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
		counterp = &hammer2_iod_indr_write;
		break;
	case HAMMER2_BREF_TYPE_FREEMAP_NODE:
	case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
		counterp = &hammer2_iod_fmap_write;
		break;
	default:
		counterp = &hammer2_iod_volu_write;
		break;
	}
	*counterp += dio->psize;
}

void
hammer2_io_bkvasync(hammer2_io_t *dio)
{
	KKASSERT(dio->bp != NULL);
	bkvasync(dio->bp);
}

/*
 * Ref a dio that is already owned.
 */
void
_hammer2_io_ref(hammer2_io_t *dio HAMMER2_IO_DEBUG_ARGS)
{
	DIO_RECORD(dio HAMMER2_IO_DEBUG_CALL);
	atomic_add_64(&dio->refs, 1);
}
902