/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * %sccs.include.redist.c%
 *
 *	@(#)vfs_cluster.c	8.1 (Berkeley) 06/10/93
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <libkern/libkern.h>

/*
 * Local declarations
 */
struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
	    daddr_t, long, int));
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
	    daddr_t, daddr_t, long, int, long));
void	    cluster_wbuild __P((struct vnode *, struct buf *, long size,
	    daddr_t start_lbn, int len, daddr_t lbn));

/*
 * We could optimize this by keeping track of where the last read-ahead
 * was, but it would involve adding fields to the vnode.  For now, let's
 * just get it working.
 *
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 *
 * There are 4 or 5 cases depending on how you count:
 *	Desired block is in the cache:
 *	    1 Not sequential access (0 I/Os).
 *	    2 Access is sequential, do read-ahead (1 ASYNC).
 *	Desired block is not in cache:
 *	    3 Not sequential access (1 SYNC).
 *	    4 Sequential access, next block is contiguous (1 SYNC).
 *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
 *
 * There are potentially two buffers that require I/O.
 *	bp is the block requested.
 *	rbp is the read-ahead block.
 *	If either is NULL, then you don't have to do the I/O.
 */
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rbp;
	daddr_t blkno, ioblkno;
	long flags;
	int error, num_ra, alreadyincore;

#ifdef DIAGNOSTIC
	if (size == 0)
		panic("cluster_read: size = 0");
#endif

	error = 0;
	flags = B_READ;
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
	if (bp->b_flags & (B_CACHE | B_DONE | B_DELWRI)) {
		/*
		 * Desired block is in cache; do any readahead ASYNC.
		 * Case 1, 2.
		 */
		trace(TR_BREADHIT, pack(vp, size), lblkno);
		flags |= B_ASYNC;
		ioblkno = lblkno +
		    (lblkno < vp->v_ralen ? vp->v_ralen >> 1 : vp->v_ralen);
		alreadyincore = (int)incore(vp, ioblkno);
		bp = NULL;
	} else {
		/* Block wasn't in cache, case 3, 4, 5. */
		trace(TR_BREADMISS, pack(vp, size), lblkno);
		ioblkno = lblkno;
		bp->b_flags |= flags;
		alreadyincore = 0;
		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
	}
	/*
	 * XXX
	 * Replace 1 with a window size based on some permutation of
	 * maxcontig and rot_delay.  This will let you figure out how
	 * many blocks you should read-ahead (case 2, 4, 5).
	 *
	 * If the access isn't sequential, cut the window size in half.
	 */
	rbp = NULL;
	if (lblkno != vp->v_lastr + 1 && lblkno != 0)
		vp->v_ralen = max(vp->v_ralen >> 1, 1);
	else if ((ioblkno + 1) * size < filesize && !alreadyincore &&
	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra))) {
		/*
		 * Reading sequentially, and the next block is not in the
		 * cache.  We are going to try reading ahead.
		 * If this is the first read of a file, then limit read-ahead
		 * to a single block, else read as much as we're allowed.
		 */
		if (num_ra > vp->v_ralen) {
			num_ra = vp->v_ralen;
			vp->v_ralen = min(MAXPHYS / size, vp->v_ralen << 1);
		} else
			vp->v_ralen = num_ra + 1;

		if (num_ra)				/* case 2, 4 */
			rbp = cluster_rbuild(vp, filesize,
			    bp, ioblkno, blkno, size, num_ra, flags);
		else if (lblkno != 0 && ioblkno == lblkno) {
			/* Case 5: check how many blocks to read ahead */
			++ioblkno;
			if ((ioblkno + 1) * size > filesize ||
			    (error = VOP_BMAP(vp,
			    ioblkno, NULL, &blkno, &num_ra)))
				goto skip_readahead;
			flags |= B_ASYNC;
			if (num_ra)
				rbp = cluster_rbuild(vp, filesize,
				    NULL, ioblkno, blkno, size, num_ra, flags);
			else {
				rbp = getblk(vp, ioblkno, size, 0, 0);
				rbp->b_flags |= flags;
				rbp->b_blkno = blkno;
			}
		} else if (lblkno != 0) {
			/* case 2; read ahead single block */
			rbp = getblk(vp, ioblkno, size, 0, 0);
			rbp->b_flags |= flags;
			rbp->b_blkno = blkno;
		} else if (bp)				/* case 1, 3, block 0 */
			bp->b_blkno = blkno;
		/* Case 1 on block 0; not really doing sequential I/O */

		if (rbp == bp)				/* case 4 */
			rbp = NULL;
		else if (rbp) {				/* case 2, 5 */
			trace(TR_BREADMISSRA,
			    pack(vp, (num_ra + 1) * size), ioblkno);
			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		}
	}

	/* XXX Kirk, do we need to make sure the bp has creds? */
skip_readahead:
	if (bp)
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else
			error = VOP_STRATEGY(bp);

	if (rbp)
		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else
			(void) VOP_STRATEGY(rbp);

	if (bp)
		return(biowait(bp));
	return(error);
}
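
/*
 * Note on the read-ahead window used above: v_ralen is the per-vnode
 * read-ahead window, counted in blocks.  While reads stay sequential and
 * VOP_BMAP keeps reporting contiguous blocks, cluster_read() grows the
 * window (doubling it when the underlying blocks allow), capping it at
 * MAXPHYS / size (e.g. 8 blocks for an 8K block size if MAXPHYS is 64K);
 * any non-sequential read halves it again, never letting it drop below 1.
 */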

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
	struct vnode *vp;
	u_quad_t filesize;
	struct buf *bp;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	long flags;
{
	struct cluster_save *b_save;
	struct buf *tbp;
	daddr_t bn;
	int i, inc;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
	if (size * (lbn + run + 1) > filesize)
		--run;
	if (run == 0) {
		if (!bp) {
			bp = getblk(vp, lbn, size, 0, 0);
			bp->b_blkno = blkno;
			bp->b_flags |= flags;
		}
		return(bp);
	}

	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
	if (bp->b_flags & (B_DONE | B_DELWRI))
		return (bp);

	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bufsize = b_save->bs_bcount = size;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	inc = size / DEV_BSIZE;
	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
		if (incore(vp, lbn + i)) {
			if (i == 1) {
				bp->b_saveaddr = b_save->bs_saveaddr;
				bp->b_flags &= ~B_CALL;
				bp->b_iodone = NULL;
				allocbuf(bp, size);
				free(b_save, M_SEGMENT);
			} else
				allocbuf(bp, size * i);
			break;
		}
		tbp = getblk(vp, lbn + i, 0, 0, 0);
		tbp->b_bcount = tbp->b_bufsize = size;
		tbp->b_blkno = bn;
		tbp->b_flags |= flags | B_READ | B_ASYNC;
		++b_save->bs_nchildren;
		b_save->bs_children[i - 1] = tbp;
	}
	if (!(bp->b_flags & B_ASYNC))
		vp->v_ralen = max(vp->v_ralen - 1, 1);
	return(bp);
}

/*
 * Either get a new buffer or grow the existing one.
 */
struct buf *
cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
	struct vnode *vp;
	struct buf *bp;
	long flags;
	daddr_t blkno;
	daddr_t lblkno;
	long size;
	int run;
{
	if (!bp) {
		bp = getblk(vp, lblkno, size, 0, 0);
		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			bp->b_blkno = blkno;
			return(bp);
		}
	}
	allocbuf(bp, run * size);
	bp->b_blkno = blkno;
	bp->b_iodone = cluster_callback;
	bp->b_flags |= flags | B_CALL;
	return(bp);
}

/*
 * Cleanup after a clustered read or write.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct cluster_save *b_save;
	struct buf **tbp;
	long bsize;
	caddr_t cp;

	b_save = (struct cluster_save *)(bp->b_saveaddr);
	bp->b_saveaddr = b_save->bs_saveaddr;

	cp = bp->b_un.b_addr + b_save->bs_bufsize;
	for (tbp = b_save->bs_children; b_save->bs_nchildren--; ++tbp) {
		pagemove(cp, (*tbp)->b_un.b_addr, (*tbp)->b_bufsize);
		cp += (*tbp)->b_bufsize;
		bp->b_bufsize -= (*tbp)->b_bufsize;
		biodone(*tbp);
	}
#ifdef DIAGNOSTIC
	if (bp->b_bufsize != b_save->bs_bufsize)
		panic("cluster_callback: more space to reclaim");
#endif
	bp->b_bcount = bp->b_bufsize;
	bp->b_iodone = NULL;
	free(b_save, M_SEGMENT);
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else
		wakeup((caddr_t)bp);
}
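
/*
 * Note on the cluster plumbing above: cluster_newbuf() grows the first
 * buffer of a clustered read with allocbuf() so that it covers the whole
 * transfer, cluster_rbuild() records the remaining blocks as child
 * buffers in a cluster_save hung off b_saveaddr, and when the single
 * large I/O completes cluster_callback() (shared with the write path
 * below) uses pagemove() to give each child its portion of the parent's
 * memory before biodone()'ing it.
 */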

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int clen;

	vp = bp->b_vp;
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + bp->b_bcount / DEV_BSIZE)) {
		if (vp->v_clen != 0)
			/*
			 * Write is not sequential.
			 */
			cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart,
			    vp->v_lastw - vp->v_cstart + 1, lbn);
		/*
		 * Consider beginning a cluster.
		 */
		if ((lbn + 1) * bp->b_bcount == filesize)
			/* End of file, make cluster as large as possible */
			clen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
		else if (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &clen)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		} else
			clen = 0;
		vp->v_clen = clen;
		if (clen == 0) {		/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {			/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
		    vp->v_clen + 1, lbn);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the
		 * I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}
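
/*
 * Worked example (illustrative only): suppose a file is written
 * sequentially one block at a time and VOP_BMAP reports clen = 3 at
 * logical block 0.  cluster_write() then sets v_cstart = 0 and
 * v_clen = 3, delays blocks 0 through 2 with bdwrite(), and when block
 * 3 (== v_cstart + v_clen) arrives it calls cluster_wbuild() to push
 * blocks 0-3 to disk as a single write.  A write that breaks the
 * sequence instead flushes whatever has accumulated and starts over.
 */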

/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block.
 */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
	struct vnode *vp;
	struct buf *last_bp;
	long size;
	daddr_t start_lbn;
	int len;
	daddr_t lbn;
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp;
	caddr_t cp;
	int i, s;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_wbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
redo:
	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
		++start_lbn;
		--len;
	}

	/* Get more memory for current buffer */
	if (len <= 1) {
		if (last_bp) {
			bawrite(last_bp);
		} else if (len) {
			bp = getblk(vp, start_lbn, size, 0, 0);
			bawrite(bp);
		}
		return;
	}

	bp = getblk(vp, start_lbn, size, 0, 0);
	if (!(bp->b_flags & B_DELWRI)) {
		++start_lbn;
		--len;
		brelse(bp);
		goto redo;
	}

	--len;
	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bcount = bp->b_bcount;
	b_save->bs_bufsize = bp->b_bufsize;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	bp->b_flags |= B_CALL;
	bp->b_iodone = cluster_callback;
	cp = bp->b_un.b_addr + bp->b_bufsize;
	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
		if (!incore(vp, start_lbn) || start_lbn == lbn)
			break;

		if (last_bp == NULL || start_lbn != last_bp->b_lblkno) {
			tbp = getblk(vp, start_lbn, size, 0, 0);
#ifdef DIAGNOSTIC
			if (tbp->b_bcount != tbp->b_bufsize)
				panic("cluster_wbuild: Buffer too big");
#endif
			if (!(tbp->b_flags & B_DELWRI)) {
				brelse(tbp);
				break;
			}
		} else
			tbp = last_bp;

		++b_save->bs_nchildren;

		/* Move memory from children to parent */
		if (tbp->b_blkno != (bp->b_blkno + bp->b_bufsize / DEV_BSIZE)) {
			printf("Clustered Block: %d addr %x bufsize: %d\n",
			    bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
			printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
			    tbp->b_blkno);
			panic("Clustered write to wrong blocks");
		}

		pagemove(tbp->b_un.b_daddr, cp, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;

		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= B_ASYNC;
		s = splbio();
		reassignbuf(tbp, tbp->b_vp);		/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		b_save->bs_children[i] = tbp;

		cp += tbp->b_bufsize;
	}

	if (i == 0) {
		/* None to cluster */
		bp->b_saveaddr = b_save->bs_saveaddr;
		bp->b_flags &= ~B_CALL;
		bp->b_iodone = NULL;
		free(b_save, M_SEGMENT);
	}
	bawrite(bp);
	if (i < len) {
		len -= i + 1;
		start_lbn += 1;
		goto redo;
	}
}
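
/*
 * Note: cluster_wbuild() skips over blocks that are not in core, as well
 * as lbn itself (the block on which I/O is currently being performed),
 * and jumps back to the redo label whenever a run of delayed-write
 * buffers is broken, so a single call may issue several smaller
 * clustered writes rather than one large one.
 */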