xref: /dragonfly/sys/kern/vfs_cluster.c (revision d5f516c3)
1 /*-
2  * Copyright (c) 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * Modifications/enhancements:
5  * 	Copyright (c) 1995 John S. Dyson.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *	This product includes software developed by the University of
18  *	California, Berkeley and its contributors.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  *
35  *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
36  * $FreeBSD: src/sys/kern/vfs_cluster.c,v 1.92.2.9 2001/11/18 07:10:59 dillon Exp $
37  * $DragonFly: src/sys/kern/vfs_cluster.c,v 1.11 2004/07/14 03:10:17 hmp Exp $
38  */
39 
40 #include "opt_debug_cluster.h"
41 
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/kernel.h>
45 #include <sys/proc.h>
46 #include <sys/buf.h>
47 #include <sys/vnode.h>
48 #include <sys/malloc.h>
49 #include <sys/mount.h>
50 #include <sys/resourcevar.h>
51 #include <sys/vmmeter.h>
52 #include <vm/vm.h>
53 #include <vm/vm_object.h>
54 #include <vm/vm_page.h>
55 #include <sys/sysctl.h>
56 #include <sys/buf2.h>
57 #include <vm/vm_page2.h>
58 
59 #if defined(CLUSTERDEBUG)
60 #include <sys/sysctl.h>
61 static int	rcluster = 0;
62 SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "");
63 #endif
64 
65 static MALLOC_DEFINE(M_SEGMENT, "cluster_save buffer", "cluster_save buffer");
66 
67 static struct cluster_save *
68 	cluster_collectbufs (struct vnode *vp, struct buf *last_bp);
69 static struct buf *
70 	cluster_rbuild (struct vnode *vp, u_quad_t filesize, daddr_t lbn,
71 			    daddr_t blkno, long size, int run, struct buf *fbp);
72 
73 static int write_behind = 1;
74 SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0, "");
75 
76 extern vm_page_t	bogus_page;
77 
78 extern int cluster_pbuf_freecnt;
79 
80 /*
81  * Maximum number of blocks for read-ahead.
82  */
83 #define MAXRA 32
84 
85 /*
86  * This replaces bread(), adding clustered read-ahead for sequential access.
87  */
88 int
89 cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno,
90 	long size, long totread, int seqcount, struct buf **bpp)
91 {
92 	struct buf *bp, *rbp, *reqbp;
93 	daddr_t blkno, origblkno;
94 	int error, num_ra;
95 	int i;
96 	int maxra, racluster;
97 	long origtotread;
98 
99 	error = 0;
100 
101 	/*
102 	 * Try to limit the amount of read-ahead by a few
103 	 * ad-hoc parameters.  This needs work!!!
104 	 */
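	/*
	 * racluster is the number of filesystem blocks that fit in one
	 * device I/O (mnt_iosize_max); maxra is additionally capped at
	 * MAXRA and at one eighth of nbuf.
	 */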
105 	racluster = vp->v_mount->mnt_iosize_max / size;
106 	maxra = 2 * racluster + (totread / size);
107 	if (maxra > MAXRA)
108 		maxra = MAXRA;
109 	if (maxra > nbuf/8)
110 		maxra = nbuf/8;
111 
112 	/*
113 	 * get the requested block
114 	 */
115 	*bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0);
116 	origblkno = lblkno;
117 	origtotread = totread;
118 
119 	/*
120 	 * if it is in the cache, then check to see if the reads have been
121 	 * sequential.  If they have, then try some read-ahead, otherwise
122 	 * back-off on prospective read-aheads.
123 	 */
124 	if (bp->b_flags & B_CACHE) {
125 		if (!seqcount) {
126 			return 0;
127 		} else if ((bp->b_flags & B_RAM) == 0) {
128 			return 0;
129 		} else {
130 			int s;
131 			struct buf *tbp;
132 			bp->b_flags &= ~B_RAM;
133 			/*
134 			 * Hold splbio() across the incore() scan so the
135 			 * buffers we examine cannot change identity beneath
136 			 * us while we set B_RAM.  We keep the spl outside
137 			 * the loop for efficiency.
138 			 */
139 			s = splbio();
140 			for (i = 1; i < maxra; i++) {
141 
142 				if (!(tbp = incore(vp, lblkno+i))) {
143 					break;
144 				}
145 
146 				/*
147 				 * Set another read-ahead mark so we know
148 				 * to check again.
149 				 */
150 				if (((i % racluster) == (racluster - 1)) ||
151 					(i == (maxra - 1)))
152 					tbp->b_flags |= B_RAM;
153 			}
154 			splx(s);
155 			if (i >= maxra) {
156 				return 0;
157 			}
158 			lblkno += i;
159 		}
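		/*
		 * The requested block was already cached, so no synchronous
		 * read is needed; clear bp/reqbp and fall through to the
		 * read-ahead logic below.
		 */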
160 		reqbp = bp = NULL;
161 	} else {
162 		off_t firstread = bp->b_offset;
163 
164 		KASSERT(bp->b_offset != NOOFFSET,
165 		    ("cluster_read: no buffer offset"));
166 		if (firstread + totread > filesize)
167 			totread = filesize - firstread;
168 		if (totread > size) {
169 			int nblks = 0;
170 			int ncontigafter;
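			/*
			 * Count the logical blocks spanned by the request,
			 * clamp to a single device I/O (racluster), then ask
			 * VOP_BMAP() how many of them are contiguous on disk.
			 */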
171 			while (totread > 0) {
172 				nblks++;
173 				totread -= size;
174 			}
175 			if (nblks == 1)
176 				goto single_block_read;
177 			if (nblks > racluster)
178 				nblks = racluster;
179 
180 			error = VOP_BMAP(vp, lblkno, NULL,
181 				&blkno, &ncontigafter, NULL);
182 			if (error)
183 				goto single_block_read;
184 			if (blkno == -1)
185 				goto single_block_read;
186 			if (ncontigafter == 0)
187 				goto single_block_read;
188 			if (ncontigafter + 1 < nblks)
189 				nblks = ncontigafter + 1;
190 
191 			bp = cluster_rbuild(vp, filesize, lblkno,
192 				blkno, size, nblks, bp);
193 			lblkno += (bp->b_bufsize / size);
194 		} else {
195 single_block_read:
196 			/*
197 			 * Not worth clustering; just read this one block and
198 			 * mark it B_RAM so later sequential access is detected.
199 			 */
200 			bp->b_flags |= B_READ | B_RAM;
201 			lblkno += 1;
202 		}
203 	}
204 
205 	/*
206 	 * if we have been doing sequential I/O, then do some read-ahead
207 	 */
208 	rbp = NULL;
209 	if (seqcount && (lblkno < (origblkno + seqcount))) {
210 		/*
211 		 * we now build the read-ahead buffer if it is desirable.
212 		 */
213 		if (((u_quad_t)(lblkno + 1) * size) <= filesize &&
214 		    !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) &&
215 		    blkno != -1) {
216 			int nblksread;
217 			int ntoread = num_ra + 1;
218 			nblksread = (origtotread + size - 1) / size;
219 			if (seqcount < nblksread)
220 				seqcount = nblksread;
221 			if (seqcount < ntoread)
222 				ntoread = seqcount;
223 			if (num_ra) {
224 				rbp = cluster_rbuild(vp, filesize, lblkno,
225 					blkno, size, ntoread, NULL);
226 			} else {
227 				rbp = getblk(vp, lblkno, size, 0, 0);
228 				rbp->b_flags |= B_READ | B_ASYNC | B_RAM;
229 				rbp->b_blkno = blkno;
230 			}
231 		}
232 	}
233 
234 	/*
235 	 * handle the synchronous read
236 	 */
237 	if (bp) {
238 #if defined(CLUSTERDEBUG)
239 		if (rcluster)
240 			printf("S(%ld,%ld,%d) ",
241 			    (long)bp->b_lblkno, bp->b_bcount, seqcount);
242 #endif
243 		if ((bp->b_flags & B_CLUSTER) == 0) {
244 			vfs_busy_pages(bp, 0);
245 		}
246 		bp->b_flags &= ~(B_ERROR|B_INVAL);
247 		if (bp->b_flags & (B_ASYNC|B_CALL))
248 			BUF_KERNPROC(bp);
249 		error = VOP_STRATEGY(vp, bp);
250 	}
251 
252 	/*
253 	 * and if we have read-aheads, do them too
254 	 */
255 	if (rbp) {
256 		if (error) {
257 			rbp->b_flags &= ~(B_ASYNC | B_READ);
258 			brelse(rbp);
259 		} else if (rbp->b_flags & B_CACHE) {
260 			rbp->b_flags &= ~(B_ASYNC | B_READ);
261 			bqrelse(rbp);
262 		} else {
263 #if defined(CLUSTERDEBUG)
264 			if (rcluster) {
265 				if (bp)
266 					printf("A+(%ld,%ld,%ld,%d) ",
267 					    (long)rbp->b_lblkno, rbp->b_bcount,
268 					    (long)(rbp->b_lblkno - origblkno),
269 					    seqcount);
270 				else
271 					printf("A(%ld,%ld,%ld,%d) ",
272 					    (long)rbp->b_lblkno, rbp->b_bcount,
273 					    (long)(rbp->b_lblkno - origblkno),
274 					    seqcount);
275 			}
276 #endif
277 
278 			if ((rbp->b_flags & B_CLUSTER) == 0) {
279 				vfs_busy_pages(rbp, 0);
280 			}
281 			rbp->b_flags &= ~(B_ERROR|B_INVAL);
282 			if (rbp->b_flags & (B_ASYNC|B_CALL))
283 				BUF_KERNPROC(rbp);
284 			(void) VOP_STRATEGY(vp, rbp);
285 		}
286 	}
287 	if (reqbp)
288 		return (biowait(reqbp));
289 	else
290 		return (error);
291 }
292 
293 /*
294  * If blocks are contiguous on disk, use this to provide clustered
295  * read ahead.  We will read as many blocks as possible sequentially
296  * and then parcel them up into logical blocks in the buffer hash table.
297  */
298 static struct buf *
299 cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
300 	daddr_t blkno, long size, int run, struct buf *fbp)
301 {
302 	struct buf *bp, *tbp;
303 	daddr_t bn;
304 	int i, inc, j;
305 
306 	KASSERT(size == vp->v_mount->mnt_stat.f_iosize,
307 	    ("cluster_rbuild: size %ld != f_iosize %ld\n",
308 	    size, vp->v_mount->mnt_stat.f_iosize));
309 
310 	/*
311 	 * avoid a division
312 	 */
313 	while ((u_quad_t) size * (lbn + run) > filesize) {
314 		--run;
315 	}
316 
317 	if (fbp) {
318 		tbp = fbp;
319 		tbp->b_flags |= B_READ;
320 	} else {
321 		tbp = getblk(vp, lbn, size, 0, 0);
322 		if (tbp->b_flags & B_CACHE)
323 			return tbp;
324 		tbp->b_flags |= B_ASYNC | B_READ | B_RAM;
325 	}
326 
327 	tbp->b_blkno = blkno;
328 	if ((tbp->b_flags & B_MALLOC) ||
329 	    ((tbp->b_flags & B_VMIO) == 0) || (run <= 1))
330 		return tbp;
331 
332 	bp = trypbuf(&cluster_pbuf_freecnt);
333 	if (bp == NULL)
334 		return tbp;
335 
336 	/*
337 	 * We are synthesizing a buffer out of vm_page_t's, but
338 	 * if the block size is not page aligned then the starting
339 	 * address may not be either.  Inherit the b_data offset
340 	 * from the original buffer.
341 	 */
342 	bp->b_data = (char *)((vm_offset_t)bp->b_data |
343 	    ((vm_offset_t)tbp->b_data & PAGE_MASK));
344 	bp->b_flags = B_ASYNC | B_READ | B_CALL | B_CLUSTER | B_VMIO;
345 	bp->b_iodone = cluster_callback;
346 	bp->b_blkno = blkno;
347 	bp->b_lblkno = lbn;
348 	bp->b_offset = tbp->b_offset;
349 	KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset"));
350 	pbgetvp(vp, bp);
351 
352 	TAILQ_INIT(&bp->b_cluster.cluster_head);
353 
354 	bp->b_bcount = 0;
355 	bp->b_bufsize = 0;
356 	bp->b_xio.xio_npages = 0;
357 
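	/*
	 * Walk forward one filesystem block at a time ('inc' device blocks
	 * on disk), folding each eligible buffer and its pages into the
	 * cluster pbuf.
	 */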
358 	inc = btodb(size);
359 	for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
360 		if (i != 0) {
361 			if ((bp->b_xio.xio_npages * PAGE_SIZE) +
362 			    round_page(size) > vp->v_mount->mnt_iosize_max) {
363 				break;
364 			}
365 
366 			/*
367 			 * Shortcut some checks and try to avoid buffers that
368 			 * would block in the lock.  The same checks have to
369 			 * be made again after we officially get the buffer.
370 			 */
371 			if ((tbp = incore(vp, lbn + i)) != NULL) {
372 				if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT))
373 					break;
374 				BUF_UNLOCK(tbp);
375 
376 				for (j = 0; j < tbp->b_xio.xio_npages; j++) {
377 					if (tbp->b_xio.xio_pages[j]->valid)
378 						break;
379 				}
380 
381 				if (j != tbp->b_xio.xio_npages)
382 					break;
383 
384 				if (tbp->b_bcount != size)
385 					break;
386 			}
387 
388 			tbp = getblk(vp, lbn + i, size, 0, 0);
389 
390 			/*
391 			 * Stop scanning if the buffer is fully valid
392 			 * (marked B_CACHE), or locked (may be doing a
393 			 * background write), or if the buffer is not
394 			 * VMIO backed.  The clustering code can only deal
395 			 * with VMIO-backed buffers.
396 			 */
397 			if ((tbp->b_flags & (B_CACHE|B_LOCKED)) ||
398 			    (tbp->b_flags & B_VMIO) == 0) {
399 				bqrelse(tbp);
400 				break;
401 			}
402 
403 			/*
404 			 * The buffer must be completely invalid in order to
405 			 * take part in the cluster.  If it is partially valid
406 			 * then we stop.
407 			 */
408 			for (j = 0; j < tbp->b_xio.xio_npages; j++) {
409 				if (tbp->b_xio.xio_pages[j]->valid)
410 					break;
411 			}
412 			if (j != tbp->b_xio.xio_npages) {
413 				bqrelse(tbp);
414 				break;
415 			}
416 
417 			/*
418 			 * Set a read-ahead mark as appropriate
419 			 */
420 			if ((fbp && (i == 1)) || (i == (run - 1)))
421 				tbp->b_flags |= B_RAM;
422 
423 			/*
424 			 * Set the buffer up for an async read (XXX should
425 			 * we do this only if we do not wind up brelse()ing?).
426 			 * Set the block number if it isn't set, otherwise
427 			 * if it is make sure it matches the block number we
428 			 * expect.
429 			 */
430 			tbp->b_flags |= B_READ | B_ASYNC;
431 			if (tbp->b_blkno == tbp->b_lblkno) {
432 				tbp->b_blkno = bn;
433 			} else if (tbp->b_blkno != bn) {
434 				brelse(tbp);
435 				break;
436 			}
437 		}
438 		/*
439 		 * XXX fbp from caller may not be B_ASYNC, but we are going
440 		 * to biodone() it in cluster_callback() anyway
441 		 */
442 		BUF_KERNPROC(tbp);
443 		TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
444 			tbp, b_cluster.cluster_entry);
445 		for (j = 0; j < tbp->b_xio.xio_npages; j += 1) {
446 			vm_page_t m;
447 			m = tbp->b_xio.xio_pages[j];
448 			vm_page_io_start(m);
449 			vm_object_pip_add(m->object, 1);
450 			if ((bp->b_xio.xio_npages == 0) ||
451 				(bp->b_xio.xio_pages[bp->b_xio.xio_npages-1] != m)) {
452 				bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
453 				bp->b_xio.xio_npages++;
454 			}
455 			if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
456 				tbp->b_xio.xio_pages[j] = bogus_page;
457 		}
458 		/*
459 		 * Don't inherit tbp->b_bufsize, as it may be larger due to
460 		 * a non-page-aligned size.  Instead aggregate both counts
461 		 * using 'size', as cluster_wbuild() does.
465 		 */
466 		if (tbp->b_bcount != size)
467 		    printf("warning: tbp->b_bcount wrong %ld vs %ld\n", tbp->b_bcount, size);
468 		if (tbp->b_bufsize != size)
469 		    printf("warning: tbp->b_bufsize wrong %ld vs %ld\n", tbp->b_bufsize, size);
470 		bp->b_bcount += size;
471 		bp->b_bufsize += size;
472 	}
473 
474 	/*
475 	 * Fully valid pages in the cluster are already good and do not need
476 	 * to be re-read from disk.  Replace the page with bogus_page
477 	 */
478 	for (j = 0; j < bp->b_xio.xio_npages; j++) {
479 		if ((bp->b_xio.xio_pages[j]->valid & VM_PAGE_BITS_ALL) ==
480 		    VM_PAGE_BITS_ALL) {
481 			bp->b_xio.xio_pages[j] = bogus_page;
482 		}
483 	}
484 	if (bp->b_bufsize > bp->b_kvasize)
485 		panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)",
486 		    bp->b_bufsize, bp->b_kvasize);
487 	bp->b_kvasize = bp->b_bufsize;
488 
489 	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
490 		(vm_page_t *)bp->b_xio.xio_pages, bp->b_xio.xio_npages);
491 	return (bp);
492 }
493 
494 /*
495  * Cleanup after a clustered read or write.
496  * The cluster pbuf is taken apart and completion (including any error)
497  * is propagated to each of the component buffers.
499  */
500 void
501 cluster_callback(struct buf *bp)
502 {
503 	struct buf *nbp, *tbp;
504 	int error = 0;
505 
506 	/*
507 	 * Must propagate errors to all the components.
508 	 */
509 	if (bp->b_flags & B_ERROR)
510 		error = bp->b_error;
511 
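	/*
	 * Tear down the pbuf's KVA mapping of the component pages before
	 * completing the individual buffers.
	 */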
512 	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_xio.xio_npages);
513 	/*
514 	 * Move memory from the large cluster buffer into the component
515 	 * buffers and mark IO as done on these.
516 	 */
517 	for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
518 		tbp; tbp = nbp) {
519 		nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
520 		if (error) {
521 			tbp->b_flags |= B_ERROR;
522 			tbp->b_error = error;
523 		} else {
524 			tbp->b_dirtyoff = tbp->b_dirtyend = 0;
525 			tbp->b_flags &= ~(B_ERROR|B_INVAL);
526 			/*
527 			 * XXX the bdwrite()/bqrelse() issued during
528 			 * cluster building clears B_RELBUF (see bqrelse()
529 			 * comment).  If direct I/O was specified, we have
530 			 * to restore it here to allow the buffer and VM
531 			 * to be freed.
532 			 */
533 			if (tbp->b_flags & B_DIRECT)
534 				tbp->b_flags |= B_RELBUF;
535 		}
536 		biodone(tbp);
537 	}
538 	relpbuf(bp, &cluster_pbuf_freecnt);
539 }
540 
541 /*
542  *	cluster_wbuild_wb:
543  *
544  *	Implement modified write build for cluster.
545  *
546  *		write_behind = 0	write behind disabled
547  *		write_behind = 1	write behind normal (default)
548  *		write_behind = 2	write behind backed-off
549  */
550 
551 static __inline int
552 cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len)
553 {
554 	int r = 0;
555 
556 	switch (write_behind) {
557 	case 2:
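		/*
		 * Backed-off mode: shift the window back by 'len' blocks so
		 * the write-behind trails the blocks currently being
		 * dirtied.
		 */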
558 		if (start_lbn < len)
559 			break;
560 		start_lbn -= len;
561 		/* fall through */
562 	case 1:
563 		r = cluster_wbuild(vp, size, start_lbn, len);
564 		/* fall through */
565 	default:
567 		break;
568 	}
569 	return(r);
570 }
571 
572 /*
573  * Do clustered write for FFS.
574  *
575  * Three cases:
576  * Four cases:
577  *	Write is sequential:
578  *	2.	beginning of cluster - begin cluster
579  *	3.	middle of a cluster - add to cluster
580  *	4.	end of a cluster - asynchronously write cluster
581  */
582 void
583 cluster_write(struct buf *bp, u_quad_t filesize, int seqcount)
584 {
585 	struct vnode *vp;
586 	daddr_t lbn;
587 	int maxclen, cursize;
588 	int lblocksize;
589 	int async;
590 
591 	vp = bp->b_vp;
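	/*
	 * Regular files cluster on the filesystem's logical block size and
	 * honor MNT_ASYNC; other vnode types fall back to the buffer's own
	 * size and are never treated as async.
	 */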
592 	if (vp->v_type == VREG) {
593 		async = vp->v_mount->mnt_flag & MNT_ASYNC;
594 		lblocksize = vp->v_mount->mnt_stat.f_iosize;
595 	} else {
596 		async = 0;
597 		lblocksize = bp->b_bufsize;
598 	}
599 	lbn = bp->b_lblkno;
600 	KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset"));
601 
602 	/* Initialize vnode to beginning of file. */
603 	if (lbn == 0)
604 		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
605 
606 	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
607 	    (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
608 		maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1;
609 		if (vp->v_clen != 0) {
610 			/*
611 			 * Next block is not sequential.
612 			 *
613 			 * If we are not writing at end of file, the process
614 			 * has seeked to another point in the file since its last
615 			 * write, or we have reached our maximum cluster size,
616 			 * then push the previous cluster. Otherwise try
617 			 * reallocating to make it sequential.
618 			 *
619 			 * Change in algorithm: only push the previous cluster if
620 			 * it was sequential from the point of view of the
621 			 * seqcount heuristic, otherwise leave the buffer
622 			 * intact so we can potentially optimize the I/O
623 			 * later on in the buf_daemon or update daemon
624 			 * flush.
625 			 */
626 			cursize = vp->v_lastw - vp->v_cstart + 1;
627 			if (((u_quad_t) bp->b_offset + lblocksize) != filesize ||
628 			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
629 				if (!async && seqcount > 0) {
630 					cluster_wbuild_wb(vp, lblocksize,
631 						vp->v_cstart, cursize);
632 				}
633 			} else {
634 				struct buf **bpp, **endbp;
635 				struct cluster_save *buflist;
636 
637 				buflist = cluster_collectbufs(vp, bp);
638 				endbp = &buflist->bs_children
639 				    [buflist->bs_nchildren - 1];
640 				if (VOP_REALLOCBLKS(vp, buflist)) {
641 					/*
642 					 * Failed, push the previous cluster
643 					 * if *really* writing sequentially
644 					 * in the logical file (seqcount > 1),
645 					 * otherwise delay it in the hopes that
646 					 * the low level disk driver can
647 					 * optimize the write ordering.
648 					 */
649 					for (bpp = buflist->bs_children;
650 					     bpp < endbp; bpp++)
651 						brelse(*bpp);
652 					free(buflist, M_SEGMENT);
653 					if (seqcount > 1) {
654 						cluster_wbuild_wb(vp,
655 						    lblocksize, vp->v_cstart,
656 						    cursize);
657 					}
658 				} else {
659 					/*
660 					 * Succeeded, keep building cluster.
661 					 */
662 					for (bpp = buflist->bs_children;
663 					     bpp <= endbp; bpp++)
664 						bdwrite(*bpp);
665 					free(buflist, M_SEGMENT);
666 					vp->v_lastw = lbn;
667 					vp->v_lasta = bp->b_blkno;
668 					return;
669 				}
670 			}
671 		}
672 		/*
673 		 * Consider beginning a cluster. If at end of file, make
674 		 * cluster as large as possible, otherwise find size of
675 		 * existing cluster.
676 		 */
677 		if ((vp->v_type == VREG) &&
678 			((u_quad_t) bp->b_offset + lblocksize) != filesize &&
679 		    (bp->b_blkno == bp->b_lblkno) &&
680 		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
681 		     bp->b_blkno == -1)) {
682 			bawrite(bp);
683 			vp->v_clen = 0;
684 			vp->v_lasta = bp->b_blkno;
685 			vp->v_cstart = lbn + 1;
686 			vp->v_lastw = lbn;
687 			return;
688 		}
689 		vp->v_clen = maxclen;
690 		if (!async && maxclen == 0) {	/* I/O not contiguous */
691 			vp->v_cstart = lbn + 1;
692 			bawrite(bp);
693 		} else {	/* Wait for rest of cluster */
694 			vp->v_cstart = lbn;
695 			bdwrite(bp);
696 		}
697 	} else if (lbn == vp->v_cstart + vp->v_clen) {
698 		/*
699 		 * At end of cluster, write it out if seqcount tells us we
700 		 * are operating sequentially, otherwise let the buf or
701 		 * update daemon handle it.
702 		 */
703 		bdwrite(bp);
704 		if (seqcount > 1)
705 			cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
706 		vp->v_clen = 0;
707 		vp->v_cstart = lbn + 1;
708 	} else if (vm_page_count_severe()) {
709 		/*
710 		 * We are low on memory, get it going NOW
711 		 */
712 		bawrite(bp);
713 	} else {
714 		/*
715 		 * In the middle of a cluster, so just delay the I/O for now.
716 		 */
717 		bdwrite(bp);
718 	}
719 	vp->v_lastw = lbn;
720 	vp->v_lasta = bp->b_blkno;
721 }
722 
723 
724 /*
725  * This is an awful lot like cluster_rbuild...wish they could be combined.
726  * Starting at start_lbn, scan up to 'len' blocks for dirty buffers that
727  * can be clustered and issue them as one or more clustered writes.
728  * Returns the number of bytes actually queued for write.
729  */
730 int
731 cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len)
732 {
733 	struct buf *bp, *tbp;
734 	int i, j, s;
735 	int totalwritten = 0;
736 	int dbsize = btodb(size);
737 
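	/*
	 * Scan the range one logical block at a time.  Each dirty,
	 * clusterable buffer found seeds a new cluster; dbsize (device
	 * blocks per logical block) is used to check that candidate
	 * buffers are physically contiguous.
	 */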
738 	while (len > 0) {
739 		s = splbio();
740 		/*
741 		 * If the buffer is not delayed-write (i.e. dirty), or it
742 		 * is delayed-write but either locked or inval, it cannot
743 		 * partake in the clustered write.
744 		 */
745 		if (((tbp = gbincore(vp, start_lbn)) == NULL) ||
746 		  ((tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) != B_DELWRI) ||
747 		  BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) {
748 			++start_lbn;
749 			--len;
750 			splx(s);
751 			continue;
752 		}
753 		bremfree(tbp);
754 		tbp->b_flags &= ~B_DONE;
755 		splx(s);
756 
757 		/*
758 		 * Extra memory in the buffer, punt on this buffer.
759 		 * XXX we could handle this in most cases, but we would
760 		 * have to push the extra memory down to after our max
761 		 * possible cluster size and then potentially pull it back
762 		 * up if the cluster was terminated prematurely--too much
763 		 * hassle.
764 		 */
765 		if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) ||
766 		  (tbp->b_bcount != tbp->b_bufsize) ||
767 		  (tbp->b_bcount != size) ||
768 		  (len == 1) ||
769 		  ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
770 			totalwritten += tbp->b_bufsize;
771 			bawrite(tbp);
772 			++start_lbn;
773 			--len;
774 			continue;
775 		}
776 
777 		/*
778 		 * We got a pbuf to make the cluster in,
779 		 * so initialize it.
780 		 */
781 		TAILQ_INIT(&bp->b_cluster.cluster_head);
782 		bp->b_bcount = 0;
783 		bp->b_bufsize = 0;
784 		bp->b_xio.xio_npages = 0;
785 		bp->b_blkno = tbp->b_blkno;
786 		bp->b_lblkno = tbp->b_lblkno;
787 		bp->b_offset = tbp->b_offset;
788 
789 		/*
790 		 * We are synthesizing a buffer out of vm_page_t's, but
791 		 * if the block size is not page aligned then the starting
792 		 * address may not be either.  Inherit the b_data offset
793 		 * from the original buffer.
794 		 */
795 		bp->b_data = (char *)((vm_offset_t)bp->b_data |
796 		    ((vm_offset_t)tbp->b_data & PAGE_MASK));
797 		bp->b_flags |= B_CALL | B_CLUSTER |
798 			(tbp->b_flags & (B_VMIO | B_NEEDCOMMIT | B_NOWDRAIN));
799 		bp->b_iodone = cluster_callback;
800 		pbgetvp(vp, bp);
801 		/*
802 		 * From this location in the file, scan forward to see
803 		 * if there are buffers with adjacent data that need to
804 		 * be written as well.
805 		 */
806 		for (i = 0; i < len; ++i, ++start_lbn) {
807 			if (i != 0) { /* If not the first buffer */
808 				s = splbio();
809 				/*
810 				 * If the adjacent data is not even in core it
811 				 * can't need to be written.
812 				 */
813 				if ((tbp = gbincore(vp, start_lbn)) == NULL) {
814 					splx(s);
815 					break;
816 				}
817 
818 				/*
819 				 * If it IS in core, but has different
820 				 * characteristics, or is locked (which
821 				 * means it could be undergoing a background
822 				 * I/O or be in a weird state), then don't
823 				 * cluster with it.
824 				 */
825 				if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
826 				    B_INVAL | B_DELWRI | B_NEEDCOMMIT))
827 				  != (B_DELWRI | B_CLUSTEROK |
828 				    (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
829 				    (tbp->b_flags & B_LOCKED) ||
830 				    BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT)) {
831 					splx(s);
832 					break;
833 				}
834 
835 				/*
836 				 * Check that the combined cluster
837 				 * would make sense with regard to pages
838 				 * and would not be too large
839 				 */
840 				if ((tbp->b_bcount != size) ||
841 				  ((bp->b_blkno + (dbsize * i)) !=
842 				    tbp->b_blkno) ||
843 				  ((tbp->b_xio.xio_npages + bp->b_xio.xio_npages) >
844 				    (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) {
845 					BUF_UNLOCK(tbp);
846 					splx(s);
847 					break;
848 				}
849 				/*
850 				 * Ok, it's passed all the tests,
851 				 * so remove it from the free list
852 				 * and mark it busy. We will use it.
853 				 */
854 				bremfree(tbp);
855 				tbp->b_flags &= ~B_DONE;
856 				splx(s);
857 			} /* end of code for non-first buffers only */
858 			/* check for latent dependencies to be handled */
859 			if ((LIST_FIRST(&tbp->b_dep)) != NULL &&
860 			    bioops.io_start)
861 				(*bioops.io_start)(tbp);
862 			/*
863 			 * If the IO is via the VM then we do some
864 			 * special VM hackery (yuck).  Since the buffer's
865 			 * block size may not be page-aligned it is possible
866 			 * for a page to be shared between two buffers.  We
867 			 * have to get rid of the duplication when building
868 			 * the cluster.
869 			 */
870 			if (tbp->b_flags & B_VMIO) {
871 				vm_page_t m;
872 
873 				if (i != 0) { /* if not first buffer */
874 					for (j = 0; j < tbp->b_xio.xio_npages; j += 1) {
875 						m = tbp->b_xio.xio_pages[j];
876 						if (m->flags & PG_BUSY) {
877 							bqrelse(tbp);
878 							goto finishcluster;
879 						}
880 					}
881 				}
882 
883 				for (j = 0; j < tbp->b_xio.xio_npages; j += 1) {
884 					m = tbp->b_xio.xio_pages[j];
885 					vm_page_io_start(m);
886 					vm_object_pip_add(m->object, 1);
887 					if ((bp->b_xio.xio_npages == 0) ||
888 					  (bp->b_xio.xio_pages[bp->b_xio.xio_npages - 1] != m)) {
889 						bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
890 						bp->b_xio.xio_npages++;
891 					}
892 				}
893 			}
894 			bp->b_bcount += size;
895 			bp->b_bufsize += size;
896 
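			/*
			 * The component buffer's pages are now merged into
			 * the cluster pbuf.  Mark the buffer clean, flag the
			 * I/O async, and account for the pending write before
			 * chaining it onto the cluster list.
			 */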
897 			s = splbio();
898 			bundirty(tbp);
899 			tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR);
900 			tbp->b_flags |= B_ASYNC;
901 			reassignbuf(tbp, tbp->b_vp);	/* put on clean list */
902 			++tbp->b_vp->v_numoutput;
903 			splx(s);
904 			BUF_KERNPROC(tbp);
905 			TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
906 				tbp, b_cluster.cluster_entry);
907 		}
908 	finishcluster:
909 		pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
910 			(vm_page_t *) bp->b_xio.xio_pages, bp->b_xio.xio_npages);
911 		if (bp->b_bufsize > bp->b_kvasize)
912 			panic(
913 			    "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
914 			    bp->b_bufsize, bp->b_kvasize);
915 		bp->b_kvasize = bp->b_bufsize;
916 		totalwritten += bp->b_bufsize;
917 		bp->b_dirtyoff = 0;
918 		bp->b_dirtyend = bp->b_bufsize;
919 		bawrite(bp);
920 
921 		len -= i;
922 	}
923 	return totalwritten;
924 }
925 
926 /*
927  * Collect together all the buffers in a cluster, plus the caller's
928  * buffer (last_bp) as one additional final entry.
929  */
930 static struct cluster_save *
931 cluster_collectbufs(struct vnode *vp, struct buf *last_bp)
932 {
933 	struct cluster_save *buflist;
934 	struct buf *bp;
935 	daddr_t lbn;
936 	int i, len;
937 
938 	len = vp->v_lastw - vp->v_cstart + 1;
939 	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
940 	    M_SEGMENT, M_WAITOK);
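	/*
	 * The allocation holds the cluster_save header followed immediately
	 * by the child buffer pointer array (len blocks plus last_bp).
	 */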
941 	buflist->bs_nchildren = 0;
942 	buflist->bs_children = (struct buf **) (buflist + 1);
943 	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) {
944 		(void) bread(vp, lbn, last_bp->b_bcount, &bp);
945 		buflist->bs_children[i] = bp;
946 		if (bp->b_blkno == bp->b_lblkno)
947 			VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
948 				NULL, NULL);
949 	}
950 	buflist->bs_children[i] = bp = last_bp;
951 	if (bp->b_blkno == bp->b_lblkno)
952 		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno,
953 			NULL, NULL);
954 	buflist->bs_nchildren = i + 1;
955 	return (buflist);
956 }
957