xref: /original-bsd/sys/kern/vfs_cluster.c (revision bacd16ee)
/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * %sccs.include.redist.c%
 *
 *	@(#)vfs_cluster.c	8.1 (Berkeley) 06/10/93
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <libkern/libkern.h>

/*
 * Local declarations
 */
struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
	    daddr_t, long, int));
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
	    daddr_t, daddr_t, long, int, long));
void	    cluster_wbuild __P((struct vnode *, struct buf *, long size,
	    daddr_t start_lbn, int len, daddr_t lbn));

/*
 * We could optimize this by keeping track of where the last read-ahead
 * was, but it would involve adding fields to the vnode.  For now, let's
 * just get it working.
 *
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 *
 * There are 4 or 5 cases depending on how you count:
 *	Desired block is in the cache:
 *	    1 Not sequential access (0 I/Os).
 *	    2 Access is sequential, do read-ahead (1 ASYNC).
 *	Desired block is not in cache:
 *	    3 Not sequential access (1 SYNC).
 *	    4 Sequential access, next block is contiguous (1 SYNC).
 *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
 *
 * There are potentially two buffers that require I/O.
 * 	bp is the block requested.
 *	rbp is the read-ahead block.
 *	If either is NULL, then you don't have to do the I/O.
 */
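/*
 * For example, a process reading a cold file sequentially will usually
 * hit case 3 on its first request, case 4 or 5 on the next few requests
 * as read-ahead is scheduled, and case 2 once the read-ahead blocks
 * begin to be found in the cache.
 */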
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rbp;
	daddr_t blkno, ioblkno;
	long flags;
	int error, num_ra, alreadyincore;

#ifdef DIAGNOSTIC
	if (size == 0)
		panic("cluster_read: size = 0");
#endif

	error = 0;
	flags = B_READ;
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
	if (bp->b_flags & (B_CACHE | B_DONE | B_DELWRI)) {
		/*
		 * Desired block is in cache; do any readahead ASYNC.
		 * Case 1, 2.
		 */
		trace(TR_BREADHIT, pack(vp, size), lblkno);
		flags |= B_ASYNC;
		ioblkno = lblkno +
		    (lblkno < vp->v_ralen ? vp->v_ralen >> 1 : vp->v_ralen);
		alreadyincore = (int)incore(vp, ioblkno);
		bp = NULL;
	} else {
		/* Block wasn't in cache, case 3, 4, 5. */
		trace(TR_BREADMISS, pack(vp, size), lblkno);
		ioblkno = lblkno;
		bp->b_flags |= flags;
		alreadyincore = 0;
		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
	}
	/*
	 * XXX
	 * Replace 1 with a window size based on some permutation of
	 * maxcontig and rot_delay.  This will let you figure out how
	 * many blocks you should read-ahead (case 2, 4, 5).
	 *
	 * If the access isn't sequential, cut the window size in half.
	 */
	rbp = NULL;
	if (lblkno != vp->v_lastr + 1 && lblkno != 0)
		vp->v_ralen = max(vp->v_ralen >> 1, 1);
	else if ((ioblkno + 1) * size < filesize && !alreadyincore &&
	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra))) {
		/*
		 * Reading sequentially, and the next block is not in the
		 * cache.  We are going to try reading ahead. If this is
		 * the first read of a file, then limit read-ahead to a
		 * single block, else read as much as we're allowed.
		 */
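		/*
		 * The read-ahead window doubles each time VOP_BMAP reports a
		 * contiguous run longer than the current window, up to
		 * MAXPHYS / size blocks (e.g. 8 blocks for an 8K-block file
		 * system with a 64K MAXPHYS); otherwise it is set to the
		 * size of the run actually available (num_ra + 1 blocks).
		 */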
		if (num_ra > vp->v_ralen) {
			num_ra = vp->v_ralen;
			vp->v_ralen = min(MAXPHYS / size, vp->v_ralen << 1);
		} else
			vp->v_ralen = num_ra + 1;

		if (num_ra)				/* case 2, 4 */
			rbp = cluster_rbuild(vp, filesize,
			    bp, ioblkno, blkno, size, num_ra, flags);
		else if (lblkno != 0 && ioblkno == lblkno) {
			/* Case 5: check how many blocks to read ahead */
			++ioblkno;
			if ((ioblkno + 1) * size > filesize ||
			    (error = VOP_BMAP(vp,
			    ioblkno, NULL, &blkno, &num_ra)))
				goto skip_readahead;
			flags |= B_ASYNC;
			if (num_ra)
				rbp = cluster_rbuild(vp, filesize,
				    NULL, ioblkno, blkno, size, num_ra, flags);
			else {
				rbp = getblk(vp, ioblkno, size, 0, 0);
				rbp->b_flags |= flags;
				rbp->b_blkno = blkno;
			}
		} else if (lblkno != 0) {
			/* case 2; read ahead single block */
			rbp = getblk(vp, ioblkno, size, 0, 0);
			rbp->b_flags |= flags;
			rbp->b_blkno = blkno;
		} else if (bp)				/* case 1, 3, block 0 */
			bp->b_blkno = blkno;
		/* Case 1 on block 0; not really doing sequential I/O */

		if (rbp == bp)		/* case 4 */
			rbp = NULL;
		else if (rbp) {			/* case 2, 5 */
			trace(TR_BREADMISSRA,
			    pack(vp, (num_ra + 1) * size), ioblkno);
			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		}
	}

	/* XXX Kirk, do we need to make sure the bp has creds? */
skip_readahead:
	if (bp)
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else
			error = VOP_STRATEGY(bp);

	if (rbp)
		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else
			(void) VOP_STRATEGY(rbp);

	if (bp)
		return(biowait(bp));
	return(error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
	struct vnode *vp;
	u_quad_t filesize;
	struct buf *bp;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	long flags;
{
	struct cluster_save *b_save;
	struct buf *tbp;
	daddr_t bn;
	int i, inc;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != fs iosize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
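	/*
	 * If the last block of the run would lie past the end of the
	 * file, shorten the run; with no read-ahead left, just hand
	 * back the single requested block.
	 */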
	if (size * (lbn + run + 1) > filesize)
		--run;
	if (run == 0) {
		if (!bp) {
			bp = getblk(vp, lbn, size, 0, 0);
			bp->b_blkno = blkno;
			bp->b_flags |= flags;
		}
		return(bp);
	}

	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
	if (bp->b_flags & (B_DONE | B_DELWRI))
		return (bp);

	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bufsize = b_save->bs_bcount = size;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

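	/*
	 * inc is the number of DEV_BSIZE sectors per logical block (e.g.
	 * an 8K block is 16 512-byte sectors), so bn steps through the
	 * disk addresses of the blocks following blkno.
	 */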
	inc = size / DEV_BSIZE;
	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
		if (incore(vp, lbn + i)) {
			if (i == 1) {
				bp->b_saveaddr = b_save->bs_saveaddr;
				bp->b_flags &= ~B_CALL;
				bp->b_iodone = NULL;
				allocbuf(bp, size);
				free(b_save, M_SEGMENT);
			} else
				allocbuf(bp, size * i);
			break;
		}
		tbp = getblk(vp, lbn + i, 0, 0, 0);
		tbp->b_bcount = tbp->b_bufsize = size;
		tbp->b_blkno = bn;
		tbp->b_flags |= flags | B_READ | B_ASYNC;
		++b_save->bs_nchildren;
		b_save->bs_children[i - 1] = tbp;
	}
	if (!(bp->b_flags & B_ASYNC))
		vp->v_ralen = max(vp->v_ralen - 1, 1);
	return(bp);
}

/*
 * Either get a new buffer or grow the existing one.
 */
struct buf *
cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
	struct vnode *vp;
	struct buf *bp;
	long flags;
	daddr_t blkno;
	daddr_t lblkno;
	long size;
	int run;
{
	if (!bp) {
		bp = getblk(vp, lblkno, size, 0, 0);
		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			bp->b_blkno = blkno;
			return(bp);
		}
	}
	allocbuf(bp, run * size);
	bp->b_blkno = blkno;
	bp->b_iodone = cluster_callback;
	bp->b_flags |= flags | B_CALL;
	return(bp);
}

/*
 * Cleanup after a clustered read or write.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct cluster_save *b_save;
	struct buf **tbp;
	long bsize;
	caddr_t cp;

	b_save = (struct cluster_save *)(bp->b_saveaddr);
	bp->b_saveaddr = b_save->bs_saveaddr;

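	/*
	 * The cluster was transferred through one contiguous buffer; walk
	 * the saved children, move each child's share of the memory back
	 * to it, and complete the child's I/O.
	 */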
	cp = bp->b_un.b_addr + b_save->bs_bufsize;
	for (tbp = b_save->bs_children; b_save->bs_nchildren--; ++tbp) {
		pagemove(cp, (*tbp)->b_un.b_addr, (*tbp)->b_bufsize);
		cp += (*tbp)->b_bufsize;
		bp->b_bufsize -= (*tbp)->b_bufsize;
		biodone(*tbp);
	}
#ifdef DIAGNOSTIC
	if (bp->b_bufsize != b_save->bs_bufsize)
		panic ("cluster_callback: more space to reclaim");
#endif
	bp->b_bcount = bp->b_bufsize;
	bp->b_iodone = NULL;
	free(b_save, M_SEGMENT);
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else
		wakeup((caddr_t)bp);
}

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int clen;

	vp = bp->b_vp;
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

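	/*
	 * A write can join the cluster in progress only if it is both
	 * logically sequential (lbn follows v_lastw) and physically
	 * contiguous on disk (b_blkno follows v_lasta); otherwise any
	 * pending cluster is pushed out and a new one is considered.
	 */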
	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + bp->b_bcount / DEV_BSIZE)) {
		if (vp->v_clen != 0)
			/*
			 * Write is not sequential.
			 */
			cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart,
			    vp->v_lastw - vp->v_cstart + 1, lbn);
		/*
		 * Consider beginning a cluster.
		 */
		if ((lbn + 1) * bp->b_bcount == filesize)
			/* End of file, make cluster as large as possible */
			clen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
		else if (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &clen)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		} else
			clen = 0;
		vp->v_clen = clen;
		if (clen == 0) {		/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {			/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
		    vp->v_clen + 1, lbn);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the
		 * I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}

/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the cluster being built.
 */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
	struct vnode *vp;
	struct buf *last_bp;
	long size;
	daddr_t start_lbn;
	int len;
	daddr_t	lbn;
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp;
	caddr_t	cp;
	int i, s;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_wbuild: size %d != fs iosize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
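	/*
	 * Skip over any blocks at the head of the range that are not in
	 * core (there is nothing to cluster) and over lbn itself, the
	 * block on which I/O is currently being performed.
	 */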
redo:
	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
		++start_lbn;
		--len;
	}

	/* Too few blocks left to cluster; just write what we have. */
	if (len <= 1) {
		if (last_bp) {
			bawrite(last_bp);
		} else if (len) {
			bp = getblk(vp, start_lbn, size, 0, 0);
			bawrite(bp);
		}
		return;
	}

	bp = getblk(vp, start_lbn, size, 0, 0);
	if (!(bp->b_flags & B_DELWRI)) {
		++start_lbn;
		--len;
		brelse(bp);
		goto redo;
	}

	--len;
	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bcount = bp->b_bcount;
	b_save->bs_bufsize = bp->b_bufsize;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	bp->b_flags |= B_CALL;
	bp->b_iodone = cluster_callback;
	cp = bp->b_un.b_addr + bp->b_bufsize;
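	/*
	 * Absorb each delayed-write block that follows into the parent
	 * buffer; cp tracks where the next child's data will be placed.
	 */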
	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
		if (!incore(vp, start_lbn) || start_lbn == lbn)
			break;

		if (last_bp == NULL || start_lbn != last_bp->b_lblkno) {
			tbp = getblk(vp, start_lbn, size, 0, 0);
#ifdef DIAGNOSTIC
			if (tbp->b_bcount != tbp->b_bufsize)
				panic("cluster_wbuild: Buffer too big");
#endif
			if (!(tbp->b_flags & B_DELWRI)) {
				brelse(tbp);
				break;
			}
		} else
			tbp = last_bp;

		++b_save->bs_nchildren;

		/* Move memory from children to parent */
		if (tbp->b_blkno != (bp->b_blkno + bp->b_bufsize / DEV_BSIZE)) {
			printf("Clustered Block: %d addr %x bufsize: %d\n",
			    bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
			printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
			    tbp->b_blkno);
			panic("Clustered write to wrong blocks");
		}

		pagemove(tbp->b_un.b_daddr, cp, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;

		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= B_ASYNC;
		s = splbio();
		reassignbuf(tbp, tbp->b_vp);		/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		b_save->bs_children[i] = tbp;

		cp += tbp->b_bufsize;
	}

	if (i == 0) {
		/* None to cluster */
		bp->b_saveaddr = b_save->bs_saveaddr;
		bp->b_flags &= ~B_CALL;
		bp->b_iodone = NULL;
		free(b_save, M_SEGMENT);
	}
	bawrite(bp);
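	/*
	 * If the scan stopped before the whole range was consumed,
	 * start over and build another cluster from the remainder.
	 */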
	if (i < len) {
		len -= i + 1;
		start_lbn += 1;
		goto redo;
	}
}
509