/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * %sccs.include.redist.c%
 *
 *	@(#)vfs_cluster.c	8.3 (Berkeley) 10/14/93
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <libkern/libkern.h>

/*
 * Local declarations
 */
struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
	    daddr_t, long, int));
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
	    daddr_t, daddr_t, long, int, long));
void	    cluster_wbuild __P((struct vnode *, struct buf *, long,
	    daddr_t, int, daddr_t));

/*
 * We could optimize this by keeping track of where the last read-ahead
 * was, but it would involve adding fields to the vnode.  For now, let's
 * just get it working.
 *
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 *
 * There are 4 or 5 cases depending on how you count:
 *	Desired block is in the cache:
 *	    1 Not sequential access (0 I/Os).
 *	    2 Access is sequential, do read-ahead (1 ASYNC).
 *	Desired block is not in cache:
 *	    3 Not sequential access (1 SYNC).
 *	    4 Sequential access, next block is contiguous (1 SYNC).
 *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
 *
 * There are potentially two buffers that require I/O.
 *	bp is the block requested.
 *	rbp is the read-ahead block.
 *	If either is NULL, then you don't have to do the I/O.
 */
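/*
 * Illustrative sketch: a filesystem read routine would typically call
 * cluster_read() where it previously called bread(), along the lines of
 *
 *	error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, &bp);
 *
 * and then copy out of bp->b_data and brelse() the buffer as usual; any
 * read-ahead buffer started here completes asynchronously on its own.
 */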
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rbp;
	daddr_t blkno, ioblkno;
	long flags;
	int error, num_ra, alreadyincore;

#ifdef DIAGNOSTIC
	if (size == 0)
		panic("cluster_read: size = 0");
#endif

	error = 0;
	flags = B_READ;
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
	if (bp->b_flags & (B_CACHE | B_DONE | B_DELWRI)) {
		/*
		 * Desired block is in cache; do any readahead ASYNC.
		 * Case 1, 2.
		 */
		trace(TR_BREADHIT, pack(vp, size), lblkno);
		flags |= B_ASYNC;
		ioblkno = lblkno +
		    (lblkno < vp->v_ralen ? vp->v_ralen >> 1 : vp->v_ralen);
		alreadyincore = (int)incore(vp, ioblkno);
		bp = NULL;
	} else {
		/* Block wasn't in cache, case 3, 4, 5. */
		trace(TR_BREADMISS, pack(vp, size), lblkno);
		ioblkno = lblkno;
		bp->b_flags |= flags;
		alreadyincore = 0;
		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
	}
	/*
	 * XXX
	 * Replace 1 with a window size based on some permutation of
	 * maxcontig and rot_delay.  This will let you figure out how
	 * many blocks you should read-ahead (case 2, 4, 5).
	 *
	 * If the access isn't sequential, cut the window size in half.
	 */
	rbp = NULL;
	if (lblkno != vp->v_lastr + 1 && lblkno != 0)
		vp->v_ralen = max(vp->v_ralen >> 1, 1);
	else if ((ioblkno + 1) * size < filesize && !alreadyincore &&
	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) &&
	    blkno != -1) {
		/*
		 * Reading sequentially, and the next block is not in the
		 * cache.  We are going to try reading ahead.  If this is
		 * the first read of a file, then limit read-ahead to a
		 * single block, else read as much as we're allowed.
		 */
		if (num_ra > vp->v_ralen) {
			num_ra = vp->v_ralen;
			vp->v_ralen = min(MAXPHYS / size, vp->v_ralen << 1);
		} else
			vp->v_ralen = num_ra + 1;

		if (num_ra)				/* case 2, 4 */
			rbp = cluster_rbuild(vp, filesize,
			    bp, ioblkno, blkno, size, num_ra, flags);
		else if (lblkno != 0 && ioblkno == lblkno) {
			/* Case 5: check how many blocks to read ahead */
			++ioblkno;
			if ((ioblkno + 1) * size > filesize ||
			    (error = VOP_BMAP(vp, ioblkno, NULL, &blkno,
			    &num_ra)) || blkno == -1)
				goto skip_readahead;
			flags |= B_ASYNC;
			if (num_ra)
				rbp = cluster_rbuild(vp, filesize,
				    NULL, ioblkno, blkno, size, num_ra, flags);
			else {
				rbp = getblk(vp, ioblkno, size, 0, 0);
				rbp->b_flags |= flags;
				rbp->b_blkno = blkno;
			}
		} else if (lblkno != 0) {
			/* case 2; read ahead single block */
			rbp = getblk(vp, ioblkno, size, 0, 0);
			rbp->b_flags |= flags;
			rbp->b_blkno = blkno;
		} else if (bp)				/* case 1, 3, block 0 */
			bp->b_blkno = blkno;
		/* Case 1 on block 0; not really doing sequential I/O */

		if (rbp == bp)			/* case 4 */
			rbp = NULL;
		else if (rbp) {			/* case 2, 5 */
			trace(TR_BREADMISSRA,
			    pack(vp, (num_ra + 1) * size), ioblkno);
			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		}
	}

	/* XXX Kirk, do we need to make sure the bp has creds? */
skip_readahead:
	if (bp)
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else
			error = VOP_STRATEGY(bp);

	if (rbp)
		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else
			(void) VOP_STRATEGY(rbp);

	if (bp)
		return(biowait(bp));
	return(error);
}
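/*
 * Worked example of the read-ahead window (illustrative; assumes 8K
 * filesystem blocks and a 64K MAXPHYS): v_ralen is capped at
 * MAXPHYS / size = 8 blocks.  Each sequential miss whose contiguous
 * run (num_ra from VOP_BMAP) exceeds the current window doubles the
 * window, so it grows 1, 2, 4, 8; any nonsequential access halves it
 * again via the v_ralen >> 1 above.
 */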
/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
	struct vnode *vp;
	u_quad_t filesize;
	struct buf *bp;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	long flags;
{
	struct cluster_save *b_save;
	struct buf *tbp;
	daddr_t bn;
	int i, inc;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
	if (size * (lbn + run + 1) > filesize)
		--run;
	if (run == 0) {
		if (!bp) {
			bp = getblk(vp, lbn, size, 0, 0);
			bp->b_blkno = blkno;
			bp->b_flags |= flags;
		}
		return(bp);
	}

	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
	if (bp->b_flags & (B_DONE | B_DELWRI))
		return (bp);

	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bufsize = b_save->bs_bcount = size;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	inc = size / DEV_BSIZE;
	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
		if (incore(vp, lbn + i)) {
			if (i == 1) {
				bp->b_saveaddr = b_save->bs_saveaddr;
				bp->b_flags &= ~B_CALL;
				bp->b_iodone = NULL;
				allocbuf(bp, size);
				free(b_save, M_SEGMENT);
			} else
				allocbuf(bp, size * i);
			break;
		}
		tbp = getblk(vp, lbn + i, 0, 0, 0);
		tbp->b_bcount = tbp->b_bufsize = size;
		tbp->b_blkno = bn;
		tbp->b_flags |= flags | B_READ | B_ASYNC;
		++b_save->bs_nchildren;
		b_save->bs_children[i - 1] = tbp;
	}
	if (!(bp->b_flags & B_ASYNC))
		vp->v_ralen = max(vp->v_ralen - 1, 1);
	return(bp);
}

/*
 * Either get a new buffer or grow the existing one.
 */
struct buf *
cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
	struct vnode *vp;
	struct buf *bp;
	long flags;
	daddr_t blkno;
	daddr_t lblkno;
	long size;
	int run;
{
	if (!bp) {
		bp = getblk(vp, lblkno, size, 0, 0);
		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			bp->b_blkno = blkno;
			return(bp);
		}
	}
	allocbuf(bp, run * size);
	bp->b_blkno = blkno;
	bp->b_iodone = cluster_callback;
	bp->b_flags |= flags | B_CALL;
	return(bp);
}

/*
 * Cleanup after a clustered read or write.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct cluster_save *b_save;
	struct buf **tbp;
	long bsize;
	caddr_t cp;

	b_save = (struct cluster_save *)(bp->b_saveaddr);
	bp->b_saveaddr = b_save->bs_saveaddr;

	cp = (char *)bp->b_data + b_save->bs_bufsize;
	for (tbp = b_save->bs_children; b_save->bs_nchildren--; ++tbp) {
		pagemove(cp, (*tbp)->b_data, (*tbp)->b_bufsize);
		cp += (*tbp)->b_bufsize;
		bp->b_bufsize -= (*tbp)->b_bufsize;
		biodone(*tbp);
	}
#ifdef DIAGNOSTIC
	if (bp->b_bufsize != b_save->bs_bufsize)
		panic("cluster_callback: more space to reclaim");
#endif
	bp->b_bcount = bp->b_bufsize;
	bp->b_iodone = NULL;
	free(b_save, M_SEGMENT);
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else
		wakeup((caddr_t)bp);
}
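/*
 * Illustrative example of the parceling above: on a clustered read of
 * four 8K blocks, the parent bp spans 32K of contiguous disk.  At
 * completion the parent keeps its first 8K (bs_bufsize) and pagemove()
 * donates each following 8K chunk of bp->b_data to the corresponding
 * child buffer before biodone() marks that child valid in the cache.
 */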
/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int clen;

	vp = bp->b_vp;
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + bp->b_bcount / DEV_BSIZE)) {
		if (vp->v_clen != 0)
			/*
			 * Write is not sequential.
			 */
			cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart,
			    vp->v_lastw - vp->v_cstart + 1, lbn);
		/*
		 * Consider beginning a cluster.
		 */
		if ((lbn + 1) * bp->b_bcount == filesize)
			/* End of file, make cluster as large as possible */
			clen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
		else if (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &clen) ||
		    bp->b_blkno == -1) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = clen;
		if (clen == 0) {		/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {			/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
		    vp->v_clen + 1, lbn);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the
		 * I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}
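/*
 * Illustrative sketch: a filesystem write path would call
 * cluster_write() where it would otherwise bawrite()/bdwrite() the
 * buffer itself, roughly
 *
 *	cluster_write(bp, ip->i_size);
 *
 * leaving it to the code above to delay the buffer, begin a new
 * cluster, or flush the accumulated cluster via cluster_wbuild().
 */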
/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block.
 */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
	struct vnode *vp;
	struct buf *last_bp;
	long size;
	daddr_t start_lbn;
	int len;
	daddr_t lbn;
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp;
	caddr_t	cp;
	int i, s;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_wbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
redo:
	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
		++start_lbn;
		--len;
	}

	/* Get more memory for current buffer */
	if (len <= 1) {
		if (last_bp) {
			bawrite(last_bp);
		} else if (len) {
			bp = getblk(vp, start_lbn, size, 0, 0);
			bawrite(bp);
		}
		return;
	}

	bp = getblk(vp, start_lbn, size, 0, 0);
	if (!(bp->b_flags & B_DELWRI)) {
		++start_lbn;
		--len;
		brelse(bp);
		goto redo;
	}

	--len;
	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bcount = bp->b_bcount;
	b_save->bs_bufsize = bp->b_bufsize;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	bp->b_flags |= B_CALL;
	bp->b_iodone = cluster_callback;
	cp = (char *)bp->b_data + bp->b_bufsize;
	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
		if (!incore(vp, start_lbn) || start_lbn == lbn)
			break;

		if (last_bp == NULL || start_lbn != last_bp->b_lblkno) {
			tbp = getblk(vp, start_lbn, size, 0, 0);
#ifdef DIAGNOSTIC
			if (tbp->b_bcount != tbp->b_bufsize)
				panic("cluster_wbuild: Buffer too big");
#endif
			if (!(tbp->b_flags & B_DELWRI)) {
				brelse(tbp);
				break;
			}
		} else
			tbp = last_bp;

		++b_save->bs_nchildren;

		/* Move memory from children to parent */
		if (tbp->b_blkno != (bp->b_blkno + bp->b_bufsize / DEV_BSIZE)) {
			printf("Clustered Block: %d addr %x bufsize: %d\n",
			    bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
			printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
			    tbp->b_blkno);
			panic("Clustered write to wrong blocks");
		}

		pagemove(tbp->b_data, cp, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;

		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= B_ASYNC;
		s = splbio();
		reassignbuf(tbp, tbp->b_vp);		/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		b_save->bs_children[i] = tbp;

		cp += tbp->b_bufsize;
	}

	if (i == 0) {
		/* None to cluster */
		bp->b_saveaddr = b_save->bs_saveaddr;
		bp->b_flags &= ~B_CALL;
		bp->b_iodone = NULL;
		free(b_save, M_SEGMENT);
	}
	bawrite(bp);
	if (i < len) {
		len -= i + 1;
		start_lbn += 1;
		goto redo;
	}
}
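/*
 * Worked example (illustrative; assumes 8K blocks): if VOP_BMAP at
 * lbn 0 reports a run of 7 more contiguous blocks, writes to lbn 0-6
 * are simply bdwrite()n while the cluster builds.  The write to lbn 7
 * satisfies lbn == v_cstart + v_clen, so cluster_wbuild() coalesces
 * the seven delayed buffers into one parent with pagemove() and issues
 * a single 56K bawrite(); the buffer for lbn 7 itself is excluded by
 * the start_lbn == lbn check and is written with its own bawrite().
 * cluster_callback() later returns the moved pages to the children.
 */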