/*
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2022 Tomohiro Kusumi <tkusumi@netbsd.org>
 * Copyright (c) 2013-2023 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "hammer2.h"

#define HAMMER2_DOP_READ	1
#define HAMMER2_DOP_NEW		2
#define HAMMER2_DOP_NEWNZ	3
#define HAMMER2_DOP_READQ	4

/*
 * Implements an abstraction layer for synchronous and asynchronous
 * buffered device I/O.  Can be used as an OS abstraction, but the main
 * purpose is to allow larger buffers to be used against hammer2_chain's
 * using smaller allocations, without causing deadlocks.
 *
 * The DIOs also record temporary state with limited persistence.  This
 * feature is used to keep track of dedupable blocks.
 */
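/*
 * Illustrative lifecycle sketch (not compiled; assumes the usual
 * non-debug wrapper macros such as hammer2_io_bread() and
 * hammer2_io_bqrelse() from hammer2.h, with hammer2_io_data()
 * defined below):
 */
#if 0
	hammer2_io_t *dio;
	int error;

	/* read the larger device buffer backing (data_off, lsize) */
	error = hammer2_io_bread(hmp, HAMMER2_BREF_TYPE_DATA,
				 data_off, lsize, &dio);
	if (error == 0) {
		/* access the smaller sub-range within the larger buffer */
		char *data = hammer2_io_data(dio, data_off);
		/* ... */
	}
	/* drop our ref; the underlying buffer remains cached for re-use */
	hammer2_io_bqrelse(&dio);
#endif
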
static void dio_write_stats_update(hammer2_io_t *dio, struct m_buf *bp);

static hammer2_io_t *hammer2_io_hash_lookup(hammer2_dev_t *hmp,
			hammer2_off_t pbase, uint64_t *refsp);
static hammer2_io_t *hammer2_io_hash_enter(hammer2_dev_t *hmp,
			hammer2_io_t *dio, uint64_t *refsp);
static void hammer2_io_hash_cleanup(hammer2_dev_t *hmp, int dio_limit);

void
hammer2_io_hash_init(hammer2_dev_t *hmp)
{
	hammer2_io_hash_t *hash;
	int i;

	for (i = 0; i < HAMMER2_IOHASH_SIZE; ++i) {
		hash = &hmp->iohash[i];
		hammer2_spin_init(&hash->spin, "h2iohash");
	}
}

#ifdef HAMMER2_IO_DEBUG

static __inline void
DIO_RECORD(hammer2_io_t *dio HAMMER2_IO_DEBUG_ARGS)
{
	int i;

	i = atomic_fetchadd_int(&dio->debug_index, 1) & HAMMER2_IO_DEBUG_MASK;

	dio->debug_file[i] = file;
	dio->debug_line[i] = line;
	dio->debug_refs[i] = dio->refs;
	dio->debug_td[i] = curthread;
}

#else

#define DIO_RECORD(dio)

#endif

/*
 * Returns the DIO corresponding to the (data|radix) offset, creating it
 * if necessary.
 *
 * If createit is 0, NULL can be returned, indicating that the DIO does
 * not exist.  (btype) is ignored when createit is 0.
 */
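/*
 * Encoding example (illustrative): a 16KB block (radix 14) at media
 * offset 0x12340000 arrives as data_off 0x1234000e.  The low bits give
 * lsize = 1 << 14, masking off the radix gives lbase 0x12340000, and
 * pbase rounds lbase down to the containing HAMMER2_PBUFSIZE (64KB)
 * device buffer.
 */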
static
hammer2_io_t *
hammer2_io_alloc(hammer2_dev_t *hmp, hammer2_off_t data_off, uint8_t btype,
		 int createit, int *isgoodp)
{
	hammer2_io_t *dio;
	hammer2_io_t *xio;
	hammer2_off_t lbase;
	hammer2_off_t pbase;
	hammer2_off_t pmask;
	hammer2_vfsvolume_t *vol;
	uint64_t refs;
	int lsize;
	int psize;

	psize = HAMMER2_PBUFSIZE;
	pmask = ~(hammer2_off_t)(psize - 1);
	if ((int)(data_off & HAMMER2_OFF_MASK_RADIX))
		lsize = 1 << (int)(data_off & HAMMER2_OFF_MASK_RADIX);
	else
		lsize = 0;
	lbase = data_off & ~HAMMER2_OFF_MASK_RADIX;
	pbase = lbase & pmask;

	if (pbase == 0 || ((lbase + lsize - 1) & pmask) != pbase) {
		kprintf("Illegal: %016jx %016jx+%08x / %016jx\n",
			pbase, lbase, lsize, pmask);
	}
	KKASSERT(pbase != 0 && ((lbase + lsize - 1) & pmask) == pbase);
	*isgoodp = 0;

	/*
	 * Access/Allocate the DIO, bump dio->refs to prevent destruction.
	 *
	 * If DIO_GOOD is set the ref should prevent it from being cleared
	 * out from under us, we can set *isgoodp, and the caller can operate
	 * on the buffer without any further interaction.
	 */
	dio = hammer2_io_hash_lookup(hmp, pbase, &refs);
	if (dio) {
		if (refs & HAMMER2_DIO_GOOD)
			*isgoodp = 1;
	} else if (createit) {
		refs = 0;
		vol = hammer2_get_volume_from_hmp(hmp, pbase);
		dio = kmalloc_obj(sizeof(*dio), hmp->mio, M_INTWAIT | M_ZERO);
		dio->hmp = hmp;
		dio->devvp = vol->dev->devvp;
		dio->dbase = vol->offset;
		KKASSERT((dio->dbase & HAMMER2_FREEMAP_LEVEL1_MASK) == 0);
		dio->pbase = pbase;
		dio->psize = psize;
		dio->btype = btype;
		dio->refs = refs + 1;
		dio->act = 5;
		xio = hammer2_io_hash_enter(hmp, dio, &refs);
		if (xio == NULL) {
			atomic_add_int(&hammer2_dio_count, 1);
		} else {
			if (refs & HAMMER2_DIO_GOOD)
				*isgoodp = 1;
			kfree_obj(dio, hmp->mio);
			dio = xio;
		}
	} else {
		return NULL;
	}
	dio->ticks = ticks;
	if (dio->act < 10)
		++dio->act;

	return dio;
}

/*
 * Acquire the requested dio.  If DIO_GOOD is not set we must instantiate
 * a buffer.  If it is set, the buffer already exists and is good to go.
 */
hammer2_io_t *
_hammer2_io_getblk(hammer2_dev_t *hmp, int btype, off_t lbase,
		   int lsize, int op HAMMER2_IO_DEBUG_ARGS)
{
	hammer2_io_t *dio;
	hammer2_off_t dev_pbase;
	//off_t peof;
	uint64_t orefs;
	uint64_t nrefs;
	int isgood;
	int error;
	int hce;
	//int bflags;

	//bflags = ((btype == HAMMER2_BREF_TYPE_DATA) ? B_NOTMETA : 0);
	//bflags |= B_KVABIO;

	KKASSERT((1 << (int)(lbase & HAMMER2_OFF_MASK_RADIX)) == lsize);

	if (op == HAMMER2_DOP_READQ) {
		dio = hammer2_io_alloc(hmp, lbase, btype, 0, &isgood);
		if (dio == NULL)
			return NULL;
		op = HAMMER2_DOP_READ;
	} else {
		dio = hammer2_io_alloc(hmp, lbase, btype, 1, &isgood);
	}

	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();

		/*
		 * Buffer is already good, handle the op and return.
		 */
		if (orefs & HAMMER2_DIO_GOOD) {
			if (isgood == 0)
				cpu_mfence();
			bkvasync(dio->bp);

			switch(op) {
			case HAMMER2_DOP_NEW:
				bzero(hammer2_io_data(dio, lbase), lsize);
				/* fall through */
			case HAMMER2_DOP_NEWNZ:
				atomic_set_long(&dio->refs, HAMMER2_DIO_DIRTY);
				break;
			case HAMMER2_DOP_READ:
			default:
				/* nothing to do */
				break;
			}
			DIO_RECORD(dio HAMMER2_IO_DEBUG_CALL);
			return (dio);
		}

		/*
		 * Try to own the DIO
		 */
		if (orefs & HAMMER2_DIO_INPROG) {
			nrefs = orefs | HAMMER2_DIO_WAITING;
			tsleep_interlock(dio, 0);
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
				tsleep(dio, PINTERLOCKED, "h2dio", hz);
			}
			/* retry */
		} else {
			nrefs = orefs | HAMMER2_DIO_INPROG;
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
				break;
			}
		}
	}

	/*
	 * We break to here if GOOD is not set and we acquired INPROG for
	 * the I/O.
	 */
	KKASSERT(dio->bp == NULL);
	if (btype == HAMMER2_BREF_TYPE_DATA)
		hce = hammer2_cluster_data_read;
	else
		hce = hammer2_cluster_meta_read;

	error = 0;
	dev_pbase = dio->pbase - dio->dbase;
	if (dio->pbase == (lbase & ~HAMMER2_OFF_MASK_RADIX) &&
	    dio->psize == lsize) {
		switch(op) {
		case HAMMER2_DOP_NEW:
		case HAMMER2_DOP_NEWNZ:
			dio->bp = getblkx(dio->devvp,
					 dev_pbase, dio->psize,
					 GETBLK_KVABIO, 0);
			if (op == HAMMER2_DOP_NEW) {
				bkvasync(dio->bp);
				bzero(dio->bp->b_data, dio->psize);
			}
			atomic_set_long(&dio->refs, HAMMER2_DIO_DIRTY);
			break;
		case HAMMER2_DOP_READ:
		default:
			KKASSERT(dio->bp == NULL);
#if 0
			if (hce > 0) {
				/*
				 * Synchronous cluster I/O for now.
				 */
				peof = (dio->pbase + HAMMER2_SEGMASK64) &
				       ~HAMMER2_SEGMASK64;
				peof -= dio->dbase;
				error = cluster_readx(dio->devvp,
						     peof, dev_pbase,
						     dio->psize, bflags,
						     dio->psize,
						     HAMMER2_PBUFSIZE*hce,
						     &dio->bp);
			} else {
				error = breadnx(dio->devvp, dev_pbase,
						dio->psize, bflags,
					        NULL, NULL, 0, &dio->bp);
			}
#else
			error = breadx(dio->devvp, dev_pbase, dio->psize, &dio->bp);
#endif
			break;
		}
	} else {
#if 0
		if (hce > 0) {
			/*
			 * Synchronous cluster I/O for now.
			 */
			peof = (dio->pbase + HAMMER2_SEGMASK64) &
			       ~HAMMER2_SEGMASK64;
			peof -= dio->dbase;
			error = cluster_readx(dio->devvp,
					      peof, dev_pbase, dio->psize,
					      bflags,
					      dio->psize, HAMMER2_PBUFSIZE*hce,
					      &dio->bp);
		} else {
			error = breadnx(dio->devvp, dev_pbase,
				        dio->psize, bflags,
					NULL, NULL, 0, &dio->bp);
		}
#else
		error = breadx(dio->devvp, dev_pbase, dio->psize, &dio->bp);
#endif
		if (dio->bp) {
			/*
			 * Handle NEW flags
			 */
			switch(op) {
			case HAMMER2_DOP_NEW:
				bkvasync(dio->bp);
				bzero(hammer2_io_data(dio, lbase), lsize);
				/* fall through */
			case HAMMER2_DOP_NEWNZ:
				atomic_set_long(&dio->refs, HAMMER2_DIO_DIRTY);
				break;
			case HAMMER2_DOP_READ:
			default:
				break;
			}

			/*
			 * Based on the btype, tell the kernel that the
			 * buffer cache buffer is not meta-data.  This
			 * allows swapcache to distinguish between data
			 * and meta-data.
			 */
			switch(btype) {
			case HAMMER2_BREF_TYPE_DATA:
				//dio->bp->b_flags |= B_NOTMETA;
				break;
			default:
				break;
			}
		}
	}

	if (dio->bp) {
		bkvasync(dio->bp);
		BUF_KERNPROC(dio->bp);
		//dio->bp->b_flags &= ~B_AGE;
		/* dio->bp->b_debug_info2 = dio; */
	}
	dio->error = error;

	/*
	 * Clear INPROG and WAITING, set GOOD, and wake up anyone waiting.
	 */
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();
		nrefs = orefs & ~(HAMMER2_DIO_INPROG | HAMMER2_DIO_WAITING);
		if (error == 0)
			nrefs |= HAMMER2_DIO_GOOD;
		if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
			if (orefs & HAMMER2_DIO_WAITING)
				wakeup(dio);
			break;
		}
		cpu_pause();
	}

	/* XXX error handling */
	DIO_RECORD(dio HAMMER2_IO_DEBUG_CALL);

	return dio;
}

/*
 * Release our ref on *diop.
 *
 * On the 1->0 transition we clear DIO_GOOD, set DIO_INPROG, and dispose
 * of dio->bp.  Then we clean up DIO_INPROG and DIO_WAITING.
 */
void
_hammer2_io_putblk(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	hammer2_dev_t *hmp;
	hammer2_io_t *dio;
	struct m_buf *bp;
	off_t pbase;
	int psize;
	int dio_limit;
	uint64_t orefs;
	uint64_t nrefs;

	dio = *diop;
	*diop = NULL;
	hmp = dio->hmp;
	DIO_RECORD(dio HAMMER2_IO_DEBUG_CALL);

	KKASSERT((dio->refs & HAMMER2_DIO_MASK) != 0);

	/*
	 * Drop refs.
	 *
	 * On the 1->0 transition clear GOOD and set INPROG, and break.
	 * On any other transition we can return early.
	 */
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();

		if ((orefs & HAMMER2_DIO_MASK) == 1 &&
		    (orefs & HAMMER2_DIO_INPROG) == 0) {
			/*
			 * Lastdrop case, INPROG not yet set (we set it
			 * here).  GOOD must be cleared to prevent the
			 * getblk shortcut.
			 */
			nrefs = orefs - 1;
			nrefs &= ~(HAMMER2_DIO_GOOD | HAMMER2_DIO_DIRTY);
			nrefs |= HAMMER2_DIO_INPROG;
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs))
				break;
		} else if ((orefs & HAMMER2_DIO_MASK) == 1) {
			/*
			 * Lastdrop case, INPROG already set.  We must
			 * wait for INPROG to clear.
			 */
			nrefs = orefs | HAMMER2_DIO_WAITING;
			tsleep_interlock(dio, 0);
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
				tsleep(dio, PINTERLOCKED, "h2dio", hz);
			}
			/* retry */
		} else {
			/*
			 * Normal drop case.
			 */
			nrefs = orefs - 1;
			if (atomic_cmpset_64(&dio->refs, orefs, nrefs))
				return;
			/* retry */
		}
		cpu_pause();
		/* retry */
	}

	/*
	 * Lastdrop (1->0 transition).  INPROG has been set, and GOOD and
	 * DIRTY have been cleared.  iofree_count has not yet been
	 * incremented; note that a racing accessor may decrement
	 * iofree_count, so we must increment it regardless.
	 * We can now dispose of the buffer.
	 */
	pbase = dio->pbase;
	psize = dio->psize;
	bp = dio->bp;
	dio->bp = NULL;

	if ((orefs & HAMMER2_DIO_GOOD) && bp) {
		/*
		 * Non-errored disposal of bp
		 */
		if (orefs & HAMMER2_DIO_DIRTY) {
			dio_write_stats_update(dio, bp);

			/*
			 * This allows dirty buffers to accumulate and
			 * possibly be canceled (e.g. by a 'rm'); by
			 * default we burst-write them later.
			 *
			 * We generally do NOT want to issue an actual
			 * b[a]write() or cluster_write() here.  Due to
			 * the way chains are locked, buffers may be cycled
			 * in and out quite often and disposal here can cause
			 * multiple writes or write-read stalls.
			 *
			 * If FLUSH is set we do want to issue the actual
			 * write.  This typically occurs in the write-behind
			 * case when writing to large files.
			 */
			//off_t peof;
			//int hce;
			if (dio->refs & HAMMER2_DIO_FLUSH) {
#if 0
				if ((hce = hammer2_cluster_write) != 0) {
					peof = (pbase + HAMMER2_SEGMASK64) &
					       ~HAMMER2_SEGMASK64;
					peof -= dio->dbase;
					bp->b_flags |= B_CLUSTEROK;
					cluster_write(bp, peof, psize, hce);
				} else {
					bp->b_flags &= ~B_CLUSTEROK;
					bawrite(bp);
				}
#else
				bawrite(bp);
#endif
			} else {
				//bp->b_flags &= ~B_CLUSTEROK;
				bdwrite(bp);
			}
#if 0
		} else if (bp->b_flags & (B_ERROR | B_INVAL | B_RELBUF)) {
			brelse(bp);
#endif
		} else {
			bqrelse(bp);
		}
	} else if (bp) {
		/*
		 * Errored disposal of bp
		 */
		brelse(bp);
	}

	/*
	 * Update iofree_count before disposing of the dio
	 */
	hmp = dio->hmp;
	atomic_add_int(&hmp->iofree_count, 1);

	/*
	 * Clear INPROG, GOOD, and WAITING (GOOD should already be clear).
	 *
	 * Also clear FLUSH as it was handled above.
	 */
	for (;;) {
		orefs = dio->refs;
		cpu_ccfence();
		nrefs = orefs & ~(HAMMER2_DIO_INPROG | HAMMER2_DIO_GOOD |
				  HAMMER2_DIO_WAITING | HAMMER2_DIO_FLUSH);
		if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) {
			if (orefs & HAMMER2_DIO_WAITING)
				wakeup(dio);
			break;
		}
		cpu_pause();
	}

	/*
	 * We cache free buffers so re-use cases can use a shared lock, but
	 * if too many build up we have to clean them out.
	 */
	dio_limit = hammer2_dio_limit;
	if (dio_limit < 256)
		dio_limit = 256;
	if (dio_limit > 1024*1024)
		dio_limit = 1024*1024;
	if (hmp->iofree_count > dio_limit)
		hammer2_io_hash_cleanup(hmp, dio_limit);
}

/*
 * Returns a pointer to the requested data.
 */
char *
hammer2_io_data(hammer2_io_t *dio, off_t lbase)
{
	struct m_buf *bp;
	int off;

	bp = dio->bp;
	KKASSERT(bp != NULL);
	bkvasync(bp);
	lbase -= dio->dbase;
	off = (lbase & ~HAMMER2_OFF_MASK_RADIX) - bp->b_loffset;
	KKASSERT(off >= 0 && off < bp->b_bufsize);
	return(bp->b_data + off);
}
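
/*
 * Offset example (illustrative): with a 64KB DIO whose bp was read at
 * the device offset corresponding to pbase 0x12340000, a request for
 * lbase 0x12348000 resolves to off 0x8000, i.e. the 32KB point within
 * bp->b_data (the dio->dbase volume offset cancels out of the math).
 */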

int
hammer2_io_new(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
	       hammer2_io_t **diop)
{
	*diop = hammer2_io_getblk(hmp, btype, lbase, lsize, HAMMER2_DOP_NEW);
	return ((*diop)->error);
}

int
hammer2_io_newnz(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
		 hammer2_io_t **diop)
{
	*diop = hammer2_io_getblk(hmp, btype, lbase, lsize, HAMMER2_DOP_NEWNZ);
	return ((*diop)->error);
}

int
_hammer2_io_bread(hammer2_dev_t *hmp, int btype, off_t lbase, int lsize,
		hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
#ifdef HAMMER2_IO_DEBUG
	hammer2_io_t *dio;
#endif

	*diop = _hammer2_io_getblk(hmp, btype, lbase, lsize,
				   HAMMER2_DOP_READ HAMMER2_IO_DEBUG_CALL);
#ifdef HAMMER2_IO_DEBUG
	if ((dio = *diop) != NULL) {
#if 0
		int i = (dio->debug_index - 1) & HAMMER2_IO_DEBUG_MASK;
		dio->debug_data[i] = debug_data;
#endif
	}
#endif
	return ((*diop)->error);
}

hammer2_io_t *
_hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase,
		     int lsize HAMMER2_IO_DEBUG_ARGS)
{
	hammer2_io_t *dio;

	dio = _hammer2_io_getblk(hmp, 0, lbase, lsize,
				 HAMMER2_DOP_READQ HAMMER2_IO_DEBUG_CALL);
	return dio;
}

void
_hammer2_io_bawrite(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY |
				      HAMMER2_DIO_FLUSH);
	_hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
}

void
_hammer2_io_bdwrite(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY);
	_hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
}

int
_hammer2_io_bwrite(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	atomic_set_64(&(*diop)->refs, HAMMER2_DIO_DIRTY |
				      HAMMER2_DIO_FLUSH);
	_hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
	return (0);	/* XXX */
}
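
/*
 * Illustrative write-path sketch (not compiled; hammer2_io_new() is
 * defined above, while hammer2_io_bdwrite() is assumed to be the usual
 * non-debug wrapper macro from hammer2.h):
 */
#if 0
	hammer2_io_t *dio;

	/* get a zeroed buffer for newly allocated media at (data_off, lsize) */
	hammer2_io_new(hmp, HAMMER2_BREF_TYPE_DATA, data_off, lsize, &dio);
	bcopy(src, hammer2_io_data(dio, data_off), lsize);
	/* DIRTY is set; release with a delayed write (burst-written later) */
	hammer2_io_bdwrite(&dio);
#endif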

void
hammer2_io_setdirty(hammer2_io_t *dio)
{
	atomic_set_64(&dio->refs, HAMMER2_DIO_DIRTY);
}

/*
 * This routine is called when a MODIFIED chain is being DESTROYED,
 * in an attempt to allow the related buffer cache buffer to be
 * invalidated and discarded instead of flushing it to disk.
 *
 * At the moment this case is only really useful for file meta-data.
 * File data is already handled via the logical buffer cache associated
 * with the vnode, and will be discarded if it was never flushed to disk.
 * File meta-data may include inodes, directory entries, and indirect blocks.
 *
 * XXX
 * However, our DIO buffers are PBUFSIZE'd (64KB), and the area being
 * invalidated might be smaller.  Most of the meta-data structures above
 * are in the 'smaller' category.  For now, don't try to invalidate the
 * data areas.
 */
void
hammer2_io_inval(hammer2_io_t *dio, hammer2_off_t data_off, u_int bytes)
{
	/* NOP */
}

void
_hammer2_io_brelse(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	_hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
}

void
_hammer2_io_bqrelse(hammer2_io_t **diop HAMMER2_IO_DEBUG_ARGS)
{
	_hammer2_io_putblk(diop HAMMER2_IO_DEBUG_CALL);
}

/*
 * Set dedup validation bits in a DIO.  We do not need the buffer cache
 * buffer for this.  This must be done concurrently with setting bits in
 * the freemap so as to interlock with bulkfree's clearing of those bits.
 */
void
hammer2_io_dedup_set(hammer2_dev_t *hmp, hammer2_blockref_t *bref)
{
	hammer2_io_t *dio;
	uint64_t mask;
	int lsize;
	int isgood;

	dio = hammer2_io_alloc(hmp, bref->data_off, bref->type, 1, &isgood);
	if ((int)(bref->data_off & HAMMER2_OFF_MASK_RADIX))
		lsize = 1 << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
	else
		lsize = 0;
	mask = hammer2_dedup_mask(dio, bref->data_off, lsize);
	atomic_clear_64(&dio->dedup_valid, mask);
	atomic_set_64(&dio->dedup_alloc, mask);
	hammer2_io_putblk(&dio);
}
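
/*
 * Mask example (illustrative, assuming hammer2_dedup_mask() assigns one
 * bit per 1/64th (1KB) of the 64KB device buffer): a 16KB block at
 * offset 0x4000 within its DIO would cover bits 16..31, i.e. mask
 * 0x00000000FFFF0000.
 */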

/*
 * Clear dedup validation bits in a DIO.  This is typically done when
 * a modified chain is destroyed or by the bulkfree code.  No buffer
 * is needed for this operation.  If the DIO no longer exists it is
 * equivalent to the bits not being set.
 */
void
hammer2_io_dedup_delete(hammer2_dev_t *hmp, uint8_t btype,
			hammer2_off_t data_off, u_int bytes)
{
	hammer2_io_t *dio;
	uint64_t mask;
	int isgood;

	if ((data_off & ~HAMMER2_OFF_MASK_RADIX) == 0)
		return;
	if (btype != HAMMER2_BREF_TYPE_DATA)
		return;
	dio = hammer2_io_alloc(hmp, data_off, btype, 0, &isgood);
	if (dio) {
		if (data_off < dio->pbase ||
		    (data_off & ~HAMMER2_OFF_MASK_RADIX) + bytes >
		    dio->pbase + dio->psize) {
			panic("hammer2_io_dedup_delete: DATAOFF BAD "
			      "%016jx/%d %016jx\n",
			      data_off, bytes, dio->pbase);
		}
		mask = hammer2_dedup_mask(dio, data_off, bytes);
		atomic_clear_64(&dio->dedup_alloc, mask);
		atomic_clear_64(&dio->dedup_valid, mask);
		hammer2_io_putblk(&dio);
	}
}

/*
 * Assert that dedup allocation bits in a DIO are not set.  This operation
 * does not require a buffer.  The DIO does not need to exist.
 */
void
hammer2_io_dedup_assert(hammer2_dev_t *hmp, hammer2_off_t data_off, u_int bytes)
{
	hammer2_io_t *dio;
	int isgood;

	dio = hammer2_io_alloc(hmp, data_off, HAMMER2_BREF_TYPE_DATA,
			       0, &isgood);
	if (dio) {
		KASSERT((dio->dedup_alloc &
			  hammer2_dedup_mask(dio, data_off, bytes)) == 0,
			("hammer2_dedup_assert: %016jx/%d %016jx/%016jx",
			data_off,
			bytes,
			hammer2_dedup_mask(dio, data_off, bytes),
			dio->dedup_alloc));
		hammer2_io_putblk(&dio);
	}
}

static
void
dio_write_stats_update(hammer2_io_t *dio, struct m_buf *bp)
{
	/*
	if (bp->b_flags & B_DELWRI)
		return;
	*/
	hammer2_adjwritecounter(dio->btype, dio->psize);
}

void
hammer2_io_bkvasync(hammer2_io_t *dio)
{
	KKASSERT(dio->bp != NULL);
	bkvasync(dio->bp);
}

/*
 * Ref a dio that is already owned
 */
void
_hammer2_io_ref(hammer2_io_t *dio HAMMER2_IO_DEBUG_ARGS)
{
	DIO_RECORD(dio HAMMER2_IO_DEBUG_CALL);
	atomic_add_64(&dio->refs, 1);
}

static __inline hammer2_io_hash_t *
hammer2_io_hashv(hammer2_dev_t *hmp, hammer2_off_t pbase)
{
	int hv;

	hv = (int)pbase + (int)(pbase >> 16);
	return (&hmp->iohash[hv & HAMMER2_IOHASH_MASK]);
}
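
/*
 * Hash example (derived from the code above): pbase is always 64KB
 * aligned, so its low 16 bits are zero; folding in (pbase >> 16) mixes
 * the buffer index into the low bits.  E.g. pbase 0x12340000 yields
 * hv = 0x12340000 + 0x1234 = 0x12341234, which is then masked into
 * the table.
 */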

/*
 * Lookup and reference the requested dio
 */
static hammer2_io_t *
hammer2_io_hash_lookup(hammer2_dev_t *hmp, hammer2_off_t pbase, uint64_t *refsp)
{
	hammer2_io_hash_t *hash;
	hammer2_io_t *dio;
	uint64_t refs;

	*refsp = 0;
	hash = hammer2_io_hashv(hmp, pbase);
	hammer2_spin_sh(&hash->spin);
	for (dio = hash->base; dio; dio = dio->next) {
		if (dio->pbase == pbase) {
			refs = atomic_fetchadd_64(&dio->refs, 1);
			if ((refs & HAMMER2_DIO_MASK) == 0)
				atomic_add_int(&dio->hmp->iofree_count, -1);
			*refsp = refs;
			break;
		}
	}
	hammer2_spin_unsh(&hash->spin);

	return dio;
}

/*
 * Enter a dio into the hash.  If the pbase already exists in the hash,
 * the xio in the hash is referenced and returned.  If dio is successfully
 * entered into the hash, NULL is returned.  This resolves the race where
 * two threads concurrently allocate a DIO for the same pbase: the loser
 * frees its local copy and adopts the winner's.
 */
static hammer2_io_t *
hammer2_io_hash_enter(hammer2_dev_t *hmp, hammer2_io_t *dio, uint64_t *refsp)
{
	hammer2_io_t *xio;
	hammer2_io_t **xiop;
	hammer2_io_hash_t *hash;
	uint64_t refs;

	*refsp = 0;
	hash = hammer2_io_hashv(hmp, dio->pbase);
	hammer2_spin_ex(&hash->spin);
	for (xiop = &hash->base; (xio = *xiop) != NULL; xiop = &xio->next) {
		if (xio->pbase == dio->pbase) {
			refs = atomic_fetchadd_64(&xio->refs, 1);
			if ((refs & HAMMER2_DIO_MASK) == 0)
				atomic_add_int(&xio->hmp->iofree_count, -1);
			*refsp = refs;
			goto done;
		}
	}
	dio->next = NULL;
	*xiop = dio;
done:
	hammer2_spin_unex(&hash->spin);

	return xio;
}

/*
 * Clean out a limited number of freeable DIOs
 */
static void
hammer2_io_hash_cleanup(hammer2_dev_t *hmp, int dio_limit)
{
	hammer2_io_hash_t *hash;
	hammer2_io_t *dio;
	hammer2_io_t **diop;
	hammer2_io_t **cleanapp;
	hammer2_io_t *cleanbase;
	int count;
	int maxscan;
	int i;

	count = hmp->iofree_count - dio_limit + 32;
	if (count <= 0)
		return;
	cleanbase = NULL;
	cleanapp = &cleanbase;

	i = hmp->io_iterator++;
	maxscan = HAMMER2_IOHASH_SIZE;
	while (count > 0 && maxscan--) {
		hash = &hmp->iohash[i & HAMMER2_IOHASH_MASK];
		hammer2_spin_ex(&hash->spin);
		diop = &hash->base;
		while ((dio = *diop) != NULL) {
			if ((dio->refs & (HAMMER2_DIO_MASK |
					  HAMMER2_DIO_INPROG)) != 0)
			{
				diop = &dio->next;
				continue;
			}
			if (dio->act > 0) {
				int act;

				act = dio->act - (ticks - dio->ticks) / hz - 1;
				dio->act = (act < 0) ? 0 : act;
			}
			if (dio->act) {
				diop = &dio->next;
				continue;
			}
			KKASSERT(dio->bp == NULL);
			*diop = dio->next;
			dio->next = NULL;
			*cleanapp = dio;
			cleanapp = &dio->next;
			--count;
			/* diop remains unchanged */
			atomic_add_int(&hmp->iofree_count, -1);
		}
		hammer2_spin_unex(&hash->spin);
		i = hmp->io_iterator++;
	}

	/*
	 * Get rid of dios on clean list without holding any locks
	 */
	while ((dio = cleanbase) != NULL) {
		cleanbase = dio->next;
		dio->next = NULL;
		KKASSERT(dio->bp == NULL &&
		    (dio->refs & (HAMMER2_DIO_MASK |
				  HAMMER2_DIO_INPROG)) == 0);
		if (dio->refs & HAMMER2_DIO_DIRTY) {
			kprintf("hammer2_io_cleanup: Dirty buffer "
				"%016jx/%d (bp=%p)\n",
				dio->pbase, dio->psize, dio->bp);
		}
		kfree_obj(dio, hmp->mio);
		atomic_add_int(&hammer2_dio_count, -1);
	}
}

/*
 * Destroy all DIOs associated with the media
 */
void
hammer2_io_hash_cleanup_all(hammer2_dev_t *hmp)
{
	hammer2_io_hash_t *hash;
	hammer2_io_t *dio;
	int i;

	for (i = 0; i < HAMMER2_IOHASH_SIZE; ++i) {
		hash = &hmp->iohash[i];

		while ((dio = hash->base) != NULL) {
			hash->base = dio->next;
			dio->next = NULL;
			KKASSERT(dio->bp == NULL &&
			    (dio->refs & (HAMMER2_DIO_MASK |
					  HAMMER2_DIO_INPROG)) == 0);
			if (dio->refs & HAMMER2_DIO_DIRTY) {
				kprintf("hammer2_io_cleanup: Dirty buffer "
					"%016jx/%d (bp=%p)\n",
					dio->pbase, dio->psize, dio->bp);
			}
			kfree_obj(dio, hmp->mio);
			atomic_add_int(&hammer2_dio_count, -1);
			atomic_add_int(&hmp->iofree_count, -1);
		}
	}
}