xref: /dragonfly/sys/vfs/hammer2/hammer2_io.c (revision 5ca0a96d)
/*
 * Copyright (c) 2013-2018 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "hammer2.h"

#define HAMMER2_DOP_READ	1
#define HAMMER2_DOP_NEW		2
#define HAMMER2_DOP_NEWNZ	3
#define HAMMER2_DOP_READQ	4
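
/*
 * Operation codes for the getblk paths below (summary inferred from the
 * switch cases in _hammer2_io_getblk()):
 *
 * DOP_READ  - acquire the DIO, reading the underlying buffer if needed.
 * DOP_NEW   - acquire the DIO, instantiate the buffer, zero the data,
 *	       and mark the DIO dirty.
 * DOP_NEWNZ - like DOP_NEW but without zeroing the data.
 * DOP_READQ - like DOP_READ but return NULL instead of creating the DIO
 *	       if it does not already exist.
 */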

/*
 * Implements an abstraction layer for synchronous and asynchronous
 * buffered device I/O.  It can serve as an OS abstraction, but its main
 * purpose is to allow large device buffers to back hammer2_chain
 * structures that use smaller allocations, without causing deadlocks.
 *
 * The DIOs also record temporary state with limited persistence.  This
 * feature is used to keep track of dedupable blocks.
 */
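
/*
 * Typical usage, as a minimal sketch.  It assumes the usual non-underscore
 * macro wrappers (hammer2_io_bread(), hammer2_io_bqrelse()) for the
 * underscore-prefixed implementations below:
 *
 *	hammer2_io_t *dio;
 *	char *data;
 *	int error;
 *
 *	error = hammer2_io_bread(hmp, btype, lbase, lsize, &dio);
 *	if (error == 0) {
 *		data = hammer2_io_data(dio, lbase);
 *		...operate on lsize bytes at data...
 *	}
 *	hammer2_io_bqrelse(&dio);
 *
 * The ref acquired by the getblk path must always be dropped with one of
 * the putblk variants.
 */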
static int hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg);
static void dio_write_stats_update(hammer2_io_t *dio, struct buf *bp);

static int
hammer2_io_cmp(hammer2_io_t *io1, hammer2_io_t *io2)
{
	if (io1->pbase < io2->pbase)
		return(-1);
	if (io1->pbase > io2->pbase)
		return(1);
	return(0);
}

RB_PROTOTYPE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp, off_t);
RB_GENERATE2(hammer2_io_tree, hammer2_io, rbnode, hammer2_io_cmp,
		off_t, pbase);

struct hammer2_cleanupcb_info {
	struct hammer2_io_tree tmptree;
	int	count;
};
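
/*
 * The DIO cache is trimmed in two phases: hammer2_io_cleanup_callback()
 * moves up to (count) idle DIOs from hmp->iotree to tmptree while the
 * io_spin lock is held, and hammer2_io_cleanup() then destroys the
 * collected DIOs after the lock has been released.
 */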

#if 0
static __inline
uint64_t
hammer2_io_mask(hammer2_io_t *dio, hammer2_off_t off, u_int bytes)
{
	uint64_t mask;
	int i;

	if (bytes < 1024)	/* smaller chunks not supported */
		return 0;

	/*
	 * Calculate crc check mask for larger chunks
	 */
	i = (((off & ~HAMMER2_OFF_MASK_RADIX) - dio->pbase) &
	     HAMMER2_PBUFMASK) >> 10;
	if (i == 0 && bytes == HAMMER2_PBUFSIZE)
		return((uint64_t)-1);
	mask = ((uint64_t)1U << (bytes >> 10)) - 1;
	mask <<= i;

	return mask;
}
#endif

#ifdef HAMMER2_IO_DEBUG

static __inline void
DIO_RECORD(hammer2_io_t *dio HAMMER2_IO_DEBUG_ARGS)
{
	int i;

	i = atomic_fetchadd_int(&dio->debug_index, 1) & HAMMER2_IO_DEBUG_MASK;

	dio->debug_file[i] = file;
	dio->debug_line[i] = line;
	dio->debug_refs[i] = dio->refs;
	dio->debug_td[i] = curthread;
}

#else

#define DIO_RECORD(dio)

#endif
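
/*
 * DIO_RECORD() logs recent DIO operations (caller file/line, a refs
 * snapshot, and the current thread) into a small per-DIO ring buffer
 * for post-mortem debugging; it compiles away to nothing when
 * HAMMER2_IO_DEBUG is not defined.
 */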

/*
 * Returns the DIO corresponding to the data|radix, creating it if necessary.
 *
 * If createit is 0, NULL can be returned indicating that the DIO does not
 * exist.  (btype) is ignored when createit is 0.
 */
static __inline
hammer2_io_t *
hammer2_io_alloc(hammer2_dev_t *hmp, hammer2_key_t data_off, uint8_t btype,
		 int createit, int *isgoodp)
{
	hammer2_io_t *dio;
	hammer2_io_t *xio;
	hammer2_key_t lbase;
	hammer2_key_t pbase;
	hammer2_key_t pmask;
	hammer2_volume_t *vol;
	uint64_t refs;
	int lsize;
	int psize;

	psize = HAMMER2_PBUFSIZE;
	pmask = ~(hammer2_off_t)(psize - 1);
	if ((int)(data_off & HAMMER2_OFF_MASK_RADIX))
		lsize = 1 << (int)(data_off & HAMMER2_OFF_MASK_RADIX);
	else
		lsize = 0;
	lbase = data_off & ~HAMMER2_OFF_MASK_RADIX;
	pbase = lbase & pmask;

	if (pbase == 0 || ((lbase + lsize - 1) & pmask) != pbase) {
		kprintf("Illegal: %016jx %016jx+%08x / %016jx\n",
			pbase, lbase, lsize, pmask);
	}
	KKASSERT(pbase != 0 && ((lbase + lsize - 1) & pmask) == pbase);
	*isgoodp = 0;

	/*
	 * Access/Allocate the DIO, bump dio->refs to prevent destruction.
	 *
	 * If DIO_GOOD is set, the ref prevents the buffer from being cleared
	 * out from under us; we can set *isgoodp and the caller can operate
	 * on the buffer without any further interaction.
	 */
	hammer2_spin_sh(&hmp->io_spin);
	dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, pbase);
	if (dio) {
		refs = atomic_fetchadd_64(&dio->refs, 1);
		if ((refs & HAMMER2_DIO_MASK) == 0) {
			atomic_add_int(&dio->hmp->iofree_count, -1);
		}
		if (refs & HAMMER2_DIO_GOOD)
			*isgoodp = 1;
		hammer2_spin_unsh(&hmp->io_spin);
	} else if (createit) {
		refs = 0;
		hammer2_spin_unsh(&hmp->io_spin);
		vol = hammer2_get_volume(hmp, pbase);
		dio = kmalloc(sizeof(*dio), M_HAMMER2, M_INTWAIT | M_ZERO);
		dio->hmp = hmp;
		dio->devvp = vol->dev->devvp;
		dio->dbase = vol->offset;
		KKASSERT((dio->dbase & HAMMER2_FREEMAP_LEVEL1_MASK) == 0);
		dio->pbase = pbase;
		dio->psize = psize;
		dio->btype = btype;
		dio->refs = refs + 1;
		dio->act = 5;
		hammer2_spin_ex(&hmp->io_spin);
		xio = RB_INSERT(hammer2_io_tree, &hmp->iotree, dio);
		if (xio == NULL) {
			atomic_add_int(&hammer2_dio_count, 1);
			hammer2_spin_unex(&hmp->io_spin);
		} else {
			refs = atomic_fetchadd_64(&xio->refs, 1);
			if ((refs & HAMMER2_DIO_MASK) == 0)
				atomic_add_int(&xio->hmp->iofree_count, -1);
			if (refs & HAMMER2_DIO_GOOD)
				*isgoodp = 1;
			hammer2_spin_unex(&hmp->io_spin);
			kfree(dio, M_HAMMER2);
			dio = xio;
		}
	} else {
		hammer2_spin_unsh(&hmp->io_spin);
		return NULL;
	}
	dio->ticks = ticks;
	if (dio->act < 10)
		++dio->act;

	return dio;
}

/*
 * Acquire the requested dio.  If DIO_GOOD is not set we must instantiate
 * a buffer.  If set the buffer already exists and is good to go.
 */
hammer2_io_t *
_hammer2_io_getblk(hammer2_dev_t *hmp, int btype, off_t lbase,
		   int lsize, int op HAMMER2_IO_DEBUG_ARGS)
{
	hammer2_io_t *dio;
	hammer2_off_t dev_pbase;
	off_t peof;
	uint64_t orefs;
	uint64_t nrefs;
	int isgood;
	int error;
	int hce;
	int bflags;

	bflags = ((btype == HAMMER2_BREF_TYPE_DATA) ? B_NOTMETA : 0);
	bflags |= B_KVABIO;

	KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize);

	if (op == HAMMER2_DOP_READQ) {
		dio = hammer2_io_alloc(hmp, lbase, btype, 0, &isgood);
		if (dio == NULL)
			return NULL;
		op = HAMMER2_DOP_READ;
	} else {
		dio = hammer2_io_alloc(hmp, lbase, btype, 1, &isgood);
	}
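
	/*
	 * Note: dio->refs packs the ref count (HAMMER2_DIO_MASK) together
	 * with the GOOD, INPROG, WAITING, and DIRTY state flags.  All
	 * transitions below CAS the whole 64-bit word against a snapshot
	 * and retry on collision, so no lock is required here.
	 */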
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();

		/*
		 * Buffer is already good, handle the op and return.
		 */
		if (orefs & HAMMER2_DIO_GOOD) {
			if (isgood == 0)
				cpu_mfence();
			bkvasync(dio->bp);

			switch(op) {
			case HAMMER2_DOP_NEW:
				bzero(hammer2_io_data(dio, lbase), lsize);
				/* fall through */
			case HAMMER2_DOP_NEWNZ:
				atomic_set_long(&dio->refs, HAMMER2_DIO_DIRTY);
				break;
			case HAMMER2_DOP_READ:
			default:
				/* nothing to do */
				break;
			}
			DIO_RECORD(dio HAMMER2_IO_DEBUG_CALL);
			return (dio);
		}

		/*
		 * Try to own the DIO
		 */
		if (orefs & HAMMER2_DIO_INPROG) {
			nrefs = orefs | HAMMER2_DIO_WAITING;
			tsleep_interlock(dio, 0);
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
				tsleep(dio, PINTERLOCKED, "h2dio", hz);
			}
			/* retry */
		} else {
			nrefs = orefs | HAMMER2_DIO_INPROG;
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
				break;
			}
		}
	}

	/*
	 * We break to here if GOOD is not set and we acquired INPROG for
	 * the I/O.
	 */
	KKASSERT(dio->bp == NULL);
	if (btype == HAMMER2_BREF_TYPE_DATA)
		hce = hammer2_cluster_data_read;
	else
		hce = hammer2_cluster_meta_read;

	error = 0;
	dev_pbase = dio->pbase - dio->dbase;
	if (dio->pbase == (lbase & ~HAMMER2_OFF_MASK_RADIX) &&
	    dio->psize == lsize) {
		switch(op) {
		case HAMMER2_DOP_NEW:
		case HAMMER2_DOP_NEWNZ:
			dio->bp = getblk(dio->devvp,
					 dev_pbase, dio->psize,
					 GETBLK_KVABIO, 0);
			if (op == HAMMER2_DOP_NEW) {
				bkvasync(dio->bp);
				bzero(dio->bp->b_data, dio->psize);
			}
			atomic_set_long(&dio->refs, HAMMER2_DIO_DIRTY);
			break;
		case HAMMER2_DOP_READ:
		default:
			KKASSERT(dio->bp == NULL);
			if (hce > 0) {
				/*
				 * Synchronous cluster I/O for now.
				 */
				peof = (dio->pbase + HAMMER2_SEGMASK64) &
				       ~HAMMER2_SEGMASK64;
				peof -= dio->dbase;
				error = cluster_readx(dio->devvp,
						      peof, dev_pbase,
						      dio->psize, bflags,
						      dio->psize,
						      HAMMER2_PBUFSIZE*hce,
						      &dio->bp);
			} else {
				error = breadnx(dio->devvp, dev_pbase,
						dio->psize, bflags,
						NULL, NULL, 0, &dio->bp);
			}
		}
	} else {
		if (hce > 0) {
			/*
			 * Synchronous cluster I/O for now.
			 */
			peof = (dio->pbase + HAMMER2_SEGMASK64) &
			       ~HAMMER2_SEGMASK64;
			peof -= dio->dbase;
			error = cluster_readx(dio->devvp,
					      peof, dev_pbase, dio->psize,
					      bflags,
					      dio->psize, HAMMER2_PBUFSIZE*hce,
					      &dio->bp);
		} else {
			error = breadnx(dio->devvp, dev_pbase,
					dio->psize, bflags,
					NULL, NULL, 0, &dio->bp);
		}
		if (dio->bp) {
			/*
			 * Handle NEW flags
			 */
			switch(op) {
			case HAMMER2_DOP_NEW:
				bkvasync(dio->bp);
				bzero(hammer2_io_data(dio, lbase), lsize);
				/* fall through */
			case HAMMER2_DOP_NEWNZ:
				atomic_set_long(&dio->refs, HAMMER2_DIO_DIRTY);
				break;
			case HAMMER2_DOP_READ:
			default:
				break;
			}

			/*
			 * Tell the kernel that the buffer cache is not
			 * meta-data based on the btype.  This allows
			 * swapcache to distinguish between data and
			 * meta-data.
			 */
			switch(btype) {
			case HAMMER2_BREF_TYPE_DATA:
				dio->bp->b_flags |= B_NOTMETA;
				break;
			default:
				break;
			}
		}
	}

	if (dio->bp) {
		bkvasync(dio->bp);
		BUF_KERNPROC(dio->bp);
		dio->bp->b_flags &= ~B_AGE;
		/* dio->bp->b_debug_info2 = dio; */
	}
	dio->error = error;

	/*
	 * Clear INPROG and WAITING, set GOOD, and wake up anyone waiting.
	 */
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();
		nrefs = orefs & ~(HAMMER2_DIO_INPROG | HAMMER2_DIO_WAITING);
		if (error == 0)
			nrefs |= HAMMER2_DIO_GOOD;
		if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
			if (orefs & HAMMER2_DIO_WAITING)
				wakeup(dio);
			break;
		}
		cpu_pause();
	}

	/* XXX error handling */
	DIO_RECORD(dio HAMMER2_IO_DEBUG_CALL);

	return dio;
}

/*
 * Release our ref on *diop.
 *
 * On the 1->0 transition we clear DIO_GOOD, set DIO_INPROG, and dispose
 * of dio->bp.  Then we clean up DIO_INPROG and DIO_WAITING.
 */
void
_hammer2_io_putblk(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	struct buf *bp;
	off_t pbase;
	int psize;
	int dio_limit;
	uint64_t orefs;
	uint64_t nrefs;

	dio = *diop;
	*diop = NULL;
	hmp = dio->hmp;
	DIO_RECORD(dio HAMMER2_IO_DEBUG_CALL);

	KKASSERT((dio->refs & HAMMER2_DIO_MASK) != 0);

	/*
	 * Drop refs.
	 *
	 * On the 1->0 transition clear GOOD and set INPROG, and break.
	 * On any other transition we can return early.
	 */
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();

		if ((orefs & HAMMER2_DIO_MASK) == 1 &&
		    (orefs & HAMMER2_DIO_INPROG) == 0) {
			/*
			 * Lastdrop case, we can set INPROG.  GOOD must be
			 * cleared to prevent the getblk shortcut.
			 */
			nrefs = orefs - 1;
			nrefs &= ~(HAMMER2_DIO_GOOD | HAMMER2_DIO_DIRTY);
			nrefs |= HAMMER2_DIO_INPROG;
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs))
				break;
		} else if ((orefs & HAMMER2_DIO_MASK) == 1) {
			/*
			 * Lastdrop case, INPROG already set.  We must
			 * wait for INPROG to clear.
			 */
			nrefs = orefs | HAMMER2_DIO_WAITING;
			tsleep_interlock(dio, 0);
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
				tsleep(dio, PINTERLOCKED, "h2dio", hz);
			}
			/* retry */
		} else {
			/*
			 * Normal drop case.
			 */
			nrefs = orefs - 1;
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs))
				return;
			/* retry */
		}
		cpu_pause();
		/* retry */
	}

	/*
	 * Lastdrop (1->0 transition).  INPROG has been set, GOOD and DIRTY
	 * have been cleared.  iofree_count has not yet been incremented;
	 * note that a racing accessor may decrement iofree_count, so we
	 * have to increment it regardless.
	 *
	 * We can now dispose of the buffer, and should do it before calling
	 * io_complete() in case there's a race against a new reference
	 * which causes io_complete() to chain and instantiate the bp again.
	 */
	pbase = dio->pbase;
	psize = dio->psize;
	bp = dio->bp;
	dio->bp = NULL;

	if ((orefs & HAMMER2_DIO_GOOD) && bp) {
		/*
		 * Non-errored disposal of bp
		 */
		if (orefs & HAMMER2_DIO_DIRTY) {
			dio_write_stats_update(dio, bp);

			/*
			 * Allow dirty buffers to accumulate and
			 * possibly be canceled (e.g. by a 'rm');
			 * by default we burst-write them later.
			 *
			 * We generally do NOT want to issue an actual
			 * b[a]write() or cluster_write() here.  Due to
			 * the way chains are locked, buffers may be cycled
			 * in and out quite often and disposal here can cause
			 * multiple writes or write-read stalls.
			 *
			 * If FLUSH is set we do want to issue the actual
			 * write.  This typically occurs in the write-behind
			 * case when writing to large files.
			 */
			off_t peof;
			int hce;
			if (dio->refs & HAMMER2_DIO_FLUSH) {
				if ((hce = hammer2_cluster_write) != 0) {
					peof = (pbase + HAMMER2_SEGMASK64) &
					       ~HAMMER2_SEGMASK64;
					peof -= dio->dbase;
					bp->b_flags |= B_CLUSTEROK;
					cluster_write(bp, peof, psize, hce);
				} else {
					bp->b_flags &= ~B_CLUSTEROK;
					bawrite(bp);
				}
			} else {
				bp->b_flags &= ~B_CLUSTEROK;
				bdwrite(bp);
			}
		} else if (bp->b_flags & (B_ERROR | B_INVAL | B_RELBUF)) {
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	} else if (bp) {
		/*
		 * Errored disposal of bp
		 */
		brelse(bp);
	}

	/*
	 * Update iofree_count before disposing of the dio
	 */
	hmp = dio->hmp;
	atomic_add_int(&hmp->iofree_count, 1);

	/*
	 * Clear INPROG, GOOD, and WAITING (GOOD should already be clear).
	 *
	 * Also clear FLUSH as it was handled above.
	 */
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();
		nrefs = orefs & ~(HAMMER2_DIO_INPROG | HAMMER2_DIO_GOOD |
				  HAMMER2_DIO_WAITING | HAMMER2_DIO_FLUSH);
		if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
			if (orefs & HAMMER2_DIO_WAITING)
				wakeup(dio);
			break;
		}
		cpu_pause();
	}

	/*
	 * We cache free buffers so re-use cases can use a shared lock, but
	 * if too many build up we have to clean them out.
	 */
	dio_limit = hammer2_dio_limit;
	if (dio_limit < 256)
		dio_limit = 256;
	if (dio_limit > 1024*1024)
		dio_limit = 1024*1024;
	if (hmp->iofree_count > dio_limit) {
		struct hammer2_cleanupcb_info info;

		RB_INIT(&info.tmptree);
		hammer2_spin_ex(&hmp->io_spin);
		if (hmp->iofree_count > dio_limit) {
			info.count = hmp->iofree_count / 5;
			RB_SCAN(hammer2_io_tree, &hmp->iotree, NULL,
				hammer2_io_cleanup_callback, &info);
		}
		hammer2_spin_unex(&hmp->io_spin);
		hammer2_io_cleanup(hmp, &info.tmptree);
	}
}

/*
 * Clean up any DIOs with (INPROG | refs) == 0.
 *
 * Called to clean up cached DIOs on umount after all activity has been
 * flushed.
 */
static
int
hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg)
{
	struct hammer2_cleanupcb_info *info = arg;
	hammer2_io_t *xio;

	if ((dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0) {
		if (dio->act > 0) {
			int act;

			/*
			 * Age the activity count: decay it by one, plus
			 * one for each full second since the last access,
			 * and retain the DIO if it is still active.
			 */
			act = dio->act - (ticks - dio->ticks) / hz - 1;
			if (act > 0) {
				dio->act = act;
				return 0;
			}
			dio->act = 0;
		}
		KKASSERT(dio->bp == NULL);
		if (info->count > 0) {
			RB_REMOVE(hammer2_io_tree, &dio->hmp->iotree, dio);
			xio = RB_INSERT(hammer2_io_tree, &info->tmptree, dio);
			KKASSERT(xio == NULL);
			--info->count;
		}
	}
	return 0;
}

void
hammer2_io_cleanup(hammer2_dev_t *hmp, struct hammer2_io_tree *tree)
{
	hammer2_io_t *dio;

	while ((dio = RB_ROOT(tree)) != NULL) {
		RB_REMOVE(hammer2_io_tree, tree, dio);
		KKASSERT(dio->bp == NULL &&
		    (dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0);
		if (dio->refs & HAMMER2_DIO_DIRTY) {
			kprintf("hammer2_io_cleanup: Dirty buffer "
				"%016jx/%d (bp=%p)\n",
				dio->pbase, dio->psize, dio->bp);
		}
		kfree(dio, M_HAMMER2);
		atomic_add_int(&hammer2_dio_count, -1);
		atomic_add_int(&hmp->iofree_count, -1);
	}
}

/*
 * Returns a pointer to the requested data.
 */
char *
hammer2_io_data(hammer2_io_t *dio, off_t lbase)
{
	struct buf *bp;
	int off;

	bp = dio->bp;
	KKASSERT(bp != NULL);
	bkvasync(bp);
	lbase -= dio->dbase;
	off = (lbase & ~HAMMER2_OFF_MASK_RADIX) - bp->b_loffset;
	KKASSERT(off >= 0 && off < bp->b_bufsize);
	return(bp->b_data + off);
}

int
hammer2_io_new(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
	       hammer2_io_t **diop)
{
	*diop = hammer2_io_getblk(hmp, btype, lbase, lsize, HAMMER2_DOP_NEW);
	return ((*diop)->error);
}

int
hammer2_io_newnz(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
		 hammer2_io_t **diop)
{
	*diop = hammer2_io_getblk(hmp, btype, lbase, lsize, HAMMER2_DOP_NEWNZ);
	return ((*diop)->error);
}

int
_hammer2_io_bread(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
		hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
#ifdef HAMMER2_IO_DEBUG
	hammer2_io_t *dio;
#endif

	*diop = _hammer2_io_getblk(hmp, btype, lbase, lsize,
				   HAMMER2_DOP_READ HAMMER2_IO_DEBUG_CALL);
#ifdef HAMMER2_IO_DEBUG
	if ((dio = *diop) != NULL) {
		int i = (dio->debug_index - 1) & HAMMER2_IO_DEBUG_MASK;
		dio->debug_data[i] = debug_data;
	}
#endif
	return ((*diop)->error);
}

hammer2_io_t *
_hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase,
		     int lsize HAMMER2_IO_DEBUG_ARGS)
{
	hammer2_io_t *dio;

	dio = _hammer2_io_getblk(hmp, 0, lbase, lsize,
				 HAMMER2_DOP_READQ HAMMER2_IO_DEBUG_CALL);
	return dio;
}

void
_hammer2_io_bawrite(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY |
				      HAMMER2_DIO_FLUSH);
	_hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
}

void
_hammer2_io_bdwrite(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
	_hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
}

int
_hammer2_io_bwrite(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY |
				      HAMMER2_DIO_FLUSH);
	_hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
	return (0);	/* XXX */
}

void
hammer2_io_setdirty(hammer2_io_t *dio)
{
	atomic_set_64(&dio->refs, HAMMER2_DIO_DIRTY);
}
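
/*
 * Note the flag semantics of the write variants above: bawrite() and
 * bwrite() set DIRTY | FLUSH so the final putblk issues the actual
 * write, while bdwrite() sets only DIRTY, leaving the buffer to be
 * burst-written later via the delayed-write path in putblk.
 */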

/*
 * This routine is called when a MODIFIED chain is being DESTROYED,
 * in an attempt to allow the related buffer cache buffer to be
 * invalidated and discarded instead of flushing it to disk.
 *
 * At the moment this case is only really useful for file meta-data.
 * File data is already handled via the logical buffer cache associated
 * with the vnode, and will be discarded if it was never flushed to disk.
 * File meta-data may include inodes, directory entries, and indirect blocks.
 *
 * XXX
 * However, our DIO buffers are PBUFSIZE'd (64KB), and the area being
 * invalidated might be smaller.  Most of the meta-data structures above
 * are in the 'smaller' category.  For now, don't try to invalidate the
 * data areas.
 */
void
hammer2_io_inval(hammer2_io_t *dio, hammer2_off_t data_off, u_int bytes)
{
	/* NOP */
}

void
_hammer2_io_brelse(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	_hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
}

void
_hammer2_io_bqrelse(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	_hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
}

/*
 * Set dedup validation bits in a DIO.  We do not need the buffer cache
 * buffer for this.  This must be done concurrently with setting bits in
 * the freemap so as to interlock with bulkfree's clearing of those bits.
 */
void
hammer2_io_dedup_set(hammer2_dev_t *hmp, hammer2_blockref_t *bref)
{
	hammer2_io_t *dio;
	uint64_t mask;
	int lsize;
	int isgood;

	dio = hammer2_io_alloc(hmp, bref->data_off, bref->type, 1, &isgood);
	if ((int)(bref->data_off & HAMMER2_OFF_MASK_RADIX))
		lsize = 1 << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	else
		lsize = 0;
	mask = hammer2_dedup_mask(dio, bref->data_off, lsize);
	atomic_clear_64(&dio->dedup_valid, mask);
	atomic_set_64(&dio->dedup_alloc, mask);
	hammer2_io_putblk(&dio);
}
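
/*
 * Note: hammer2_dedup_mask() is assumed to follow the same layout as the
 * disabled hammer2_io_mask() above: one bit per 1KB chunk of the 64KB
 * DIO buffer.  For example, a 16KB block starting 32KB into the buffer
 * would yield mask = (((uint64_t)1 << 16) - 1) << 32.
 */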

/*
 * Clear dedup validation bits in a DIO.  This is typically done when
 * a modified chain is destroyed or by the bulkfree code.  No buffer
 * is needed for this operation.  If the DIO no longer exists it is
 * equivalent to the bits not being set.
 */
void
hammer2_io_dedup_delete(hammer2_dev_t *hmp, uint8_t btype,
			hammer2_off_t data_off, u_int bytes)
{
	hammer2_io_t *dio;
	uint64_t mask;
	int isgood;

	if ((data_off & ~HAMMER2_OFF_MASK_RADIX) == 0)
		return;
	if (btype != HAMMER2_BREF_TYPE_DATA)
		return;
	dio = hammer2_io_alloc(hmp, data_off, btype, 0, &isgood);
	if (dio) {
		if (data_off < dio->pbase ||
		    (data_off & ~HAMMER2_OFF_MASK_RADIX) + bytes >
		    dio->pbase + dio->psize) {
			panic("hammer2_io_dedup_delete: DATAOFF BAD "
			      "%016jx/%d %016jx\n",
			      data_off, bytes, dio->pbase);
		}
		mask = hammer2_dedup_mask(dio, data_off, bytes);
		atomic_clear_64(&dio->dedup_alloc, mask);
		atomic_clear_64(&dio->dedup_valid, mask);
		hammer2_io_putblk(&dio);
	}
}

/*
 * Assert that dedup allocation bits in a DIO are not set.  This operation
 * does not require a buffer.  The DIO does not need to exist.
 */
void
hammer2_io_dedup_assert(hammer2_dev_t *hmp, hammer2_off_t data_off, u_int bytes)
{
	hammer2_io_t *dio;
	int isgood;

	dio = hammer2_io_alloc(hmp, data_off, HAMMER2_BREF_TYPE_DATA,
			       0, &isgood);
	if (dio) {
		KASSERT((dio->dedup_alloc &
			 hammer2_dedup_mask(dio, data_off, bytes)) == 0,
			("hammer2_dedup_assert: %016jx/%d %016jx/%016jx",
			data_off,
			bytes,
			hammer2_dedup_mask(dio, data_off, bytes),
			dio->dedup_alloc));
		hammer2_io_putblk(&dio);
	}
}

static
void
dio_write_stats_update(hammer2_io_t *dio, struct buf *bp)
{
	if (bp->b_flags & B_DELWRI)
		return;
	hammer2_adjwritecounter(dio->btype, dio->psize);
}

void
hammer2_io_bkvasync(hammer2_io_t *dio)
{
	KKASSERT(dio->bp != NULL);
	bkvasync(dio->bp);
}

/*
 * Ref a dio that is already owned
 */
void
_hammer2_io_ref(hammer2_io_t *dio HAMMER2_IO_DEBUG_ARGS)
{
	DIO_RECORD(dio HAMMER2_IO_DEBUG_CALL);
	atomic_add_64(&dio->refs, 1);
}
}
895