/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * %sccs.include.redist.c%
 *
 *	@(#)vfs_cluster.c	8.3 (Berkeley) 10/14/93
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <libkern/libkern.h>

/*
 * Local declarations
 */
struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
	    daddr_t, long, int));
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
	    daddr_t, daddr_t, long, int, long));
void	    cluster_wbuild __P((struct vnode *, struct buf *, long,
	    daddr_t, int, daddr_t));

/*
 * We could optimize this by keeping track of where the last read-ahead
 * was, but it would involve adding fields to the vnode.  For now, let's
 * just get it working.
 *
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 *
 * There are 4 or 5 cases depending on how you count:
 *	Desired block is in the cache:
 *	    1 Not sequential access (0 I/Os).
 *	    2 Access is sequential, do read-ahead (1 ASYNC).
 *	Desired block is not in cache:
 *	    3 Not sequential access (1 SYNC).
 *	    4 Sequential access, next block is contiguous (1 SYNC).
 *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
 *
 * There are potentially two buffers that require I/O.
 * 	bp is the block requested.
 *	rbp is the read-ahead block.
 *	If either is NULL, then you don't have to do the I/O.
 */
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rbp;
	daddr_t blkno, ioblkno;
	long flags;
	int error, num_ra, alreadyincore;

#ifdef DIAGNOSTIC
	if (size == 0)
		panic("cluster_read: size = 0");
#endif

	error = 0;
	flags = B_READ;
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
	if (bp->b_flags & (B_CACHE | B_DONE | B_DELWRI)) {
		/*
		 * Desired block is in cache; do any readahead ASYNC.
		 * Case 1, 2.
		 */
		trace(TR_BREADHIT, pack(vp, size), lblkno);
		flags |= B_ASYNC;
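		/*
		 * If we are still within the first read-ahead window
		 * (lblkno < v_ralen), aim the read-ahead only half a
		 * window ahead; otherwise aim it a full window ahead.
		 */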
		ioblkno = lblkno +
		    (lblkno < vp->v_ralen ? vp->v_ralen >> 1 : vp->v_ralen);
		alreadyincore = (int)incore(vp, ioblkno);
		bp = NULL;
	} else {
		/* Block wasn't in cache, case 3, 4, 5. */
		trace(TR_BREADMISS, pack(vp, size), lblkno);
		ioblkno = lblkno;
		bp->b_flags |= flags;
		alreadyincore = 0;
		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
	}
	/*
	 * XXX
	 * Replace 1 with a window size based on some permutation of
	 * maxcontig and rot_delay.  This will let you figure out how
	 * many blocks you should read-ahead (case 2, 4, 5).
	 *
	 * If the access isn't sequential, cut the window size in half.
	 */
	rbp = NULL;
	if (lblkno != vp->v_lastr + 1 && lblkno != 0)
		vp->v_ralen = max(vp->v_ralen >> 1, 1);
	else if ((ioblkno + 1) * size < filesize && !alreadyincore &&
	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) &&
	    blkno != -1) {
		/*
		 * Reading sequentially, and the next block is not in the
		 * cache.  We are going to try reading ahead. If this is
		 * the first read of a file, then limit read-ahead to a
		 * single block, else read as much as we're allowed.
		 */
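		/*
		 * num_ra is the number of contiguous blocks VOP_BMAP found
		 * beyond ioblkno.  If it exceeds the current window, clamp
		 * it to the window and double the window (bounded by
		 * MAXPHYS / size blocks) for next time; otherwise set the
		 * window to the length of the contiguous run plus one.
		 */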
		if (num_ra > vp->v_ralen) {
			num_ra = vp->v_ralen;
			vp->v_ralen = min(MAXPHYS / size, vp->v_ralen << 1);
		} else
			vp->v_ralen = num_ra + 1;

		if (num_ra)				/* case 2, 4 */
			rbp = cluster_rbuild(vp, filesize,
			    bp, ioblkno, blkno, size, num_ra, flags);
		else if (lblkno != 0 && ioblkno == lblkno) {
			/* Case 5: check how many blocks to read ahead */
			++ioblkno;
			if ((ioblkno + 1) * size > filesize ||
			    (error = VOP_BMAP(vp, ioblkno, NULL, &blkno,
			    &num_ra)) || blkno == -1)
				goto skip_readahead;
			flags |= B_ASYNC;
			if (num_ra)
				rbp = cluster_rbuild(vp, filesize,
				    NULL, ioblkno, blkno, size, num_ra, flags);
			else {
				rbp = getblk(vp, ioblkno, size, 0, 0);
				rbp->b_flags |= flags;
				rbp->b_blkno = blkno;
			}
		} else if (lblkno != 0) {
			/* case 2; read ahead single block */
			rbp = getblk(vp, ioblkno, size, 0, 0);
			rbp->b_flags |= flags;
			rbp->b_blkno = blkno;
		} else if (bp)				/* case 1, 3, block 0 */
			bp->b_blkno = blkno;
		/* Case 1 on block 0; not really doing sequential I/O */

		if (rbp == bp)		/* case 4 */
			rbp = NULL;
		else if (rbp) {			/* case 2, 5 */
			trace(TR_BREADMISSRA,
			    pack(vp, (num_ra + 1) * size), ioblkno);
			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		}
	}

	/* XXX Kirk, do we need to make sure the bp has creds? */
skip_readahead:
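	/*
	 * Start the I/O on the requested block (if it missed the cache)
	 * and on any read-ahead block.  Only the requested block is
	 * waited for below; the read-ahead completes asynchronously, and
	 * is released instead if an error occurred or it is already valid.
	 */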
	if (bp)
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else
			error = VOP_STRATEGY(bp);

	if (rbp)
		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else
			(void) VOP_STRATEGY(rbp);

	if (bp)
		return(biowait(bp));
	return(error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
	struct vnode *vp;
	u_quad_t filesize;
	struct buf *bp;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	long flags;
{
	struct cluster_save *b_save;
	struct buf *tbp;
	daddr_t bn;
	int i, inc;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %ld != fs iosize %ld\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
	if (size * (lbn + run + 1) > filesize)
		--run;
	if (run == 0) {
		if (!bp) {
			bp = getblk(vp, lbn, size, 0, 0);
			bp->b_blkno = blkno;
			bp->b_flags |= flags;
		}
		return(bp);
	}

	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
	if (bp->b_flags & (B_DONE | B_DELWRI))
		return (bp);

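	/*
	 * Set up a cluster_save so cluster_callback can hand each child
	 * buffer its share of the data when the single big I/O completes.
	 * The array of child pointers lives directly after the header in
	 * the same allocation.
	 */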
	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bufsize = b_save->bs_bcount = size;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

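	/*
	 * Walk the contiguous blocks that follow, attaching a child buffer
	 * for each one.  If one of them turns out to be in the cache
	 * already, stop the cluster there and shrink the big buffer back
	 * to just the blocks we will actually read.
	 */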
	inc = size / DEV_BSIZE;
	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
		if (incore(vp, lbn + i)) {
			if (i == 1) {
				bp->b_saveaddr = b_save->bs_saveaddr;
				bp->b_flags &= ~B_CALL;
				bp->b_iodone = NULL;
				allocbuf(bp, size);
				free(b_save, M_SEGMENT);
			} else
				allocbuf(bp, size * i);
			break;
		}
		tbp = getblk(vp, lbn + i, 0, 0, 0);
		tbp->b_bcount = tbp->b_bufsize = size;
		tbp->b_blkno = bn;
		tbp->b_flags |= flags | B_READ | B_ASYNC;
		++b_save->bs_nchildren;
		b_save->bs_children[i - 1] = tbp;
	}
	if (!(bp->b_flags & B_ASYNC))
		vp->v_ralen = max(vp->v_ralen - 1, 1);
	return(bp);
}

/*
 * Either get a new buffer or grow the existing one.
 */
struct buf *
cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
	struct vnode *vp;
	struct buf *bp;
	long flags;
	daddr_t blkno;
	daddr_t lblkno;
	long size;
	int run;
{
	if (!bp) {
		bp = getblk(vp, lblkno, size, 0, 0);
		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			bp->b_blkno = blkno;
			return(bp);
		}
	}
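	/*
	 * Grow the buffer to span the entire cluster and arrange for
	 * cluster_callback to parcel the data back out to the individual
	 * blocks when the I/O completes.
	 */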
	allocbuf(bp, run * size);
	bp->b_blkno = blkno;
	bp->b_iodone = cluster_callback;
	bp->b_flags |= flags | B_CALL;
	return(bp);
}

/*
 * Cleanup after a clustered read or write.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct cluster_save *b_save;
	struct buf **tbp;
	long bsize;
	caddr_t cp;

	b_save = (struct cluster_save *)(bp->b_saveaddr);
	bp->b_saveaddr = b_save->bs_saveaddr;

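	/*
	 * The data for the whole cluster lives in the one big buffer.
	 * Move each child's portion of that memory back into the child
	 * buffer and finish the child's I/O; the parent keeps only its
	 * own first block.
	 */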
	cp = (char *)bp->b_data + b_save->bs_bufsize;
	for (tbp = b_save->bs_children; b_save->bs_nchildren--; ++tbp) {
		pagemove(cp, (*tbp)->b_data, (*tbp)->b_bufsize);
		cp += (*tbp)->b_bufsize;
		bp->b_bufsize -= (*tbp)->b_bufsize;
		biodone(*tbp);
	}
#ifdef DIAGNOSTIC
	if (bp->b_bufsize != b_save->bs_bufsize)
		panic("cluster_callback: more space to reclaim");
#endif
	bp->b_bcount = bp->b_bufsize;
	bp->b_iodone = NULL;
	free(b_save, M_SEGMENT);
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else
		wakeup((caddr_t)bp);
}

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int clen;

	vp = bp->b_vp;
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

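	/*
	 * The write continues the current cluster only if it is the next
	 * logical block after the last one written and its disk address
	 * immediately follows the last block written.  Anything else
	 * flushes the cluster in progress (if any) and starts over.
	 */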
	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + bp->b_bcount / DEV_BSIZE)) {
		if (vp->v_clen != 0)
			/*
			 * Write is not sequential.
			 */
			cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart,
			    vp->v_lastw - vp->v_cstart + 1, lbn);
		/*
		 * Consider beginning a cluster.
		 */
		if ((lbn + 1) * bp->b_bcount == filesize)
			/* End of file, make cluster as large as possible */
			clen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
		else if (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &clen) ||
			    bp->b_blkno == -1) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = clen;
		if (clen == 0) {		/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {			/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
		    vp->v_clen + 1, lbn);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the
		 * I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}

/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block.
 */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
	struct vnode *vp;
	struct buf *last_bp;
	long size;
	daddr_t start_lbn;
	int len;
	daddr_t	lbn;
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp;
	caddr_t	cp;
	int i, s;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_wbuild: size %ld != fs iosize %ld\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
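	/*
	 * Skip over any leading blocks that are not in the cache, and
	 * never start the cluster at lbn itself: the caller is still
	 * working on that buffer.
	 */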
redo:
	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
		++start_lbn;
		--len;
	}

	/* One block (or none) left in the range; just write it out. */
	if (len <= 1) {
		if (last_bp) {
			bawrite(last_bp);
		} else if (len) {
			bp = getblk(vp, start_lbn, size, 0, 0);
			bawrite(bp);
		}
		return;
	}

	bp = getblk(vp, start_lbn, size, 0, 0);
	if (!(bp->b_flags & B_DELWRI)) {
		++start_lbn;
		--len;
		brelse(bp);
		goto redo;
	}

	--len;
	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bcount = bp->b_bcount;
	b_save->bs_bufsize = bp->b_bufsize;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	bp->b_flags |= B_CALL;
	bp->b_iodone = cluster_callback;
	cp = (char *)bp->b_data + bp->b_bufsize;
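	/*
	 * Gather the delayed-write buffers that follow, stealing each
	 * one's pages onto the tail of the big buffer.  Stop at the first
	 * block that is missing from the cache, is not dirty, or is the
	 * block the caller is still working on.
	 */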
	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
		if (!incore(vp, start_lbn) || start_lbn == lbn)
			break;

		if (last_bp == NULL || start_lbn != last_bp->b_lblkno) {
			tbp = getblk(vp, start_lbn, size, 0, 0);
#ifdef DIAGNOSTIC
			if (tbp->b_bcount != tbp->b_bufsize)
				panic("cluster_wbuild: Buffer too big");
#endif
			if (!(tbp->b_flags & B_DELWRI)) {
				brelse(tbp);
				break;
			}
		} else
			tbp = last_bp;

		++b_save->bs_nchildren;

		/* Move memory from children to parent */
		if (tbp->b_blkno != (bp->b_blkno + bp->b_bufsize / DEV_BSIZE)) {
			printf("Clustered Block: %d addr %x bufsize: %d\n",
			    bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
			printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
			    tbp->b_blkno);
			panic("Clustered write to wrong blocks");
		}

		pagemove(tbp->b_data, cp, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;

		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= B_ASYNC;
		s = splbio();
		reassignbuf(tbp, tbp->b_vp);		/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		b_save->bs_children[i] = tbp;

		cp += tbp->b_bufsize;
	}

	if (i == 0) {
		/* None to cluster */
		bp->b_saveaddr = b_save->bs_saveaddr;
		bp->b_flags &= ~B_CALL;
		bp->b_iodone = NULL;
		free(b_save, M_SEGMENT);
	}
	bawrite(bp);
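	/*
	 * If the scan stopped short of the full range, go back and try to
	 * build another cluster from the blocks that remain.
	 */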
	if (i < len) {
		len -= i + 1;
		start_lbn += 1;
		goto redo;
	}
}