/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * %sccs.include.redist.c%
 *
 *	@(#)vfs_cluster.c	8.5 (Berkeley) 01/12/94
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <libkern/libkern.h>

/*
 * Local declarations
 */
struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
	    daddr_t, long, int));
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
	    daddr_t, daddr_t, long, int, long));
void	    cluster_wbuild __P((struct vnode *, struct buf *, long,
	    daddr_t, int, daddr_t));

#ifdef DIAGNOSTIC
/*
 * Set to 1 if reads of block zero should cause readahead to be done.
 * Set to 0 to treat a read of block zero as a non-sequential read.
 *
 * Setting to one assumes that most reads of block zero of files are due to
 * sequential passes over the files (e.g. cat, sum) where additional blocks
 * will soon be needed.  Setting to zero assumes that the majority are
 * surgical strikes to get particular info (e.g. size, file) where readahead
 * blocks will not be used and, in fact, push out other potentially useful
 * blocks from the cache.  The former seems intuitive, but some quick tests
 * showed that the latter performed better from a system-wide point of view.
 */
int	doclusterraz = 0;
#define ISSEQREAD(vp, blk) \
	(((blk) != 0 || doclusterraz) && \
	 ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#else
#define ISSEQREAD(vp, blk) \
	((blk) != 0 && ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#endif
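
/*
 * For reference, the clustering state kept in the vnode (an assumption
 * based on sys/vnode.h; see struct vnode for the definitive declarations):
 *	v_lastr		last logical block read; used to detect sequential reads
 *	v_ralen		current read-ahead window, in blocks
 *	v_maxra		last block in the most recent read-ahead
 *	v_cstart	starting block of the write cluster being built
 *	v_clen		maximum length of the current write cluster
 *	v_lastw		last logical block written
 *	v_lasta		disk address of the last block written
 */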

/*
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 *
 * There are 4 or 5 cases depending on how you count:
 *	Desired block is in the cache:
 *	    1 Not sequential access (0 I/Os).
 *	    2 Access is sequential, do read-ahead (1 ASYNC).
 *	Desired block is not in cache:
 *	    3 Not sequential access (1 SYNC).
 *	    4 Sequential access, next block is contiguous (1 SYNC).
 *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
 *
 * There are potentially two buffers that require I/O.
 *	bp is the block requested.
 *	rbp is the read-ahead block.
 *	If either is NULL, then you don't have to do the I/O.
 */
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rbp;
	daddr_t blkno, ioblkno;
	long flags;
	int error, num_ra, alreadyincore;

#ifdef DIAGNOSTIC
	if (size == 0)
		panic("cluster_read: size = 0");
#endif

	error = 0;
	flags = B_READ;
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
	if (bp->b_flags & B_CACHE) {
		/*
		 * Desired block is in cache; do any readahead ASYNC.
		 * Case 1, 2.
		 */
		trace(TR_BREADHIT, pack(vp, size), lblkno);
		flags |= B_ASYNC;
		ioblkno = lblkno + (vp->v_ralen ? vp->v_ralen : 1);
		alreadyincore = (int)incore(vp, ioblkno);
		bp = NULL;
	} else {
		/* Block wasn't in cache, case 3, 4, 5. */
		trace(TR_BREADMISS, pack(vp, size), lblkno);
		bp->b_flags |= B_READ;
		ioblkno = lblkno;
		alreadyincore = 0;
		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
	}
	/*
	 * XXX
	 * Replace 1 with a window size based on some permutation of
	 * maxcontig and rot_delay.  This will let you figure out how
	 * many blocks you should read-ahead (case 2, 4, 5).
	 *
	 * If the access isn't sequential, reset the window to 1.
	 * Note that a read to the same block is considered sequential.
	 * This catches the case where the file is being read sequentially,
	 * but at smaller than the filesystem block size.
	 */
	rbp = NULL;
	if (!ISSEQREAD(vp, lblkno)) {
		vp->v_ralen = 0;
		vp->v_maxra = lblkno;
	} else if ((ioblkno + 1) * size <= filesize && !alreadyincore &&
	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) &&
	    blkno != -1) {
		/*
		 * Reading sequentially, and the next block is not in the
		 * cache.  We are going to try reading ahead.
		 */
		if (num_ra) {
			/*
			 * If our desired readahead block had been read
			 * in a previous readahead but is no longer in
			 * core, then we may be reading ahead too far
			 * or are not using our readahead very rapidly.
			 * In this case we scale back the window.
			 */
			if (!alreadyincore && ioblkno <= vp->v_maxra)
				vp->v_ralen = max(vp->v_ralen >> 1, 1);
			/*
			 * There are more sequential blocks than our current
			 * window allows, scale up.  Ideally we want to get
			 * in sync with the filesystem maxcontig value.
			 */
			else if (num_ra > vp->v_ralen && lblkno != vp->v_lastr)
				vp->v_ralen = vp->v_ralen ?
				    min(num_ra, vp->v_ralen << 1) : 1;

			if (num_ra > vp->v_ralen)
				num_ra = vp->v_ralen;
		}

		if (num_ra)				/* case 2, 4 */
			rbp = cluster_rbuild(vp, filesize,
			    bp, ioblkno, blkno, size, num_ra, flags);
		else if (ioblkno == lblkno) {
			bp->b_blkno = blkno;
			/* Case 5: check how many blocks to read ahead */
			++ioblkno;
			if ((ioblkno + 1) * size > filesize ||
			    incore(vp, ioblkno) || (error = VOP_BMAP(vp,
			    ioblkno, NULL, &blkno, &num_ra)) || blkno == -1)
				goto skip_readahead;
			/*
			 * Adjust readahead as above
			 */
			if (num_ra) {
				if (!alreadyincore && ioblkno <= vp->v_maxra)
					vp->v_ralen = max(vp->v_ralen >> 1, 1);
				else if (num_ra > vp->v_ralen &&
				    lblkno != vp->v_lastr)
					vp->v_ralen = vp->v_ralen ?
					    min(num_ra, vp->v_ralen << 1) : 1;
				if (num_ra > vp->v_ralen)
					num_ra = vp->v_ralen;
			}
			flags |= B_ASYNC;
			if (num_ra)
				rbp = cluster_rbuild(vp, filesize,
				    NULL, ioblkno, blkno, size, num_ra, flags);
			else {
				rbp = getblk(vp, ioblkno, size, 0, 0);
				rbp->b_flags |= flags;
				rbp->b_blkno = blkno;
			}
		} else {
			/* case 2; read ahead single block */
			rbp = getblk(vp, ioblkno, size, 0, 0);
			rbp->b_flags |= flags;
			rbp->b_blkno = blkno;
		}

		if (rbp == bp)			/* case 4 */
			rbp = NULL;
		else if (rbp) {			/* case 2, 5 */
			trace(TR_BREADMISSRA,
			    pack(vp, (num_ra + 1) * size), ioblkno);
			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		}
	}

	/* XXX Kirk, do we need to make sure the bp has creds? */
skip_readahead:
	if (bp)
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else
			error = VOP_STRATEGY(bp);

	if (rbp)
		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else
			(void) VOP_STRATEGY(rbp);

	/*
	 * Recalculate our maximum readahead
	 */
	if (rbp == NULL)
		rbp = bp;
	if (rbp)
		vp->v_maxra = rbp->b_lblkno + (rbp->b_bufsize / size) - 1;

	if (bp)
		return(biowait(bp));
	return(error);
}
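
/*
 * Illustrative caller sketch (an assumption, not part of this file; the
 * exact form in ufs_readwrite.c may differ): a file system read routine
 * substitutes cluster_read() for bread() when clustering is enabled:
 *
 *	if (doclusterread)
 *		error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, &bp);
 *	else
 *		error = bread(vp, lbn, size, NOCRED, &bp);
 */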

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
	struct vnode *vp;
	u_quad_t filesize;
	struct buf *bp;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	long flags;
{
	struct cluster_save *b_save;
	struct buf *tbp;
	daddr_t bn;
	int i, inc;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != filesize %d\n",
		    size, vp->v_mount->mnt_stat.f_iosize);
#endif
	if (size * (lbn + run + 1) > filesize)
		--run;
	if (run == 0) {
		if (!bp) {
			bp = getblk(vp, lbn, size, 0, 0);
			bp->b_blkno = blkno;
			bp->b_flags |= flags;
		}
		return(bp);
	}

	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
	if (bp->b_flags & (B_DONE | B_DELWRI))
		return (bp);

	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bufsize = b_save->bs_bcount = size;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	inc = btodb(size);
	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
		if (incore(vp, lbn + i)) {
			if (i == 1) {
				bp->b_saveaddr = b_save->bs_saveaddr;
				bp->b_flags &= ~B_CALL;
				bp->b_iodone = NULL;
				allocbuf(bp, size);
				free(b_save, M_SEGMENT);
			} else
				allocbuf(bp, size * i);
			break;
		}
		tbp = getblk(vp, lbn + i, 0, 0, 0);
		/*
		 * getblk may return some memory in the buffer if there were
		 * no empty buffers to shed it to.  If there is currently
		 * memory in the buffer, we move it down size bytes to make
		 * room for the valid pages that cluster_callback will insert.
		 * We do this now so we don't have to do it at interrupt time
		 * in the callback routine.
		 */
		if (tbp->b_bufsize != 0) {
			caddr_t bdata = (char *)tbp->b_data;

			if (tbp->b_bufsize + size > MAXBSIZE)
				panic("cluster_rbuild: too much memory");
			if (tbp->b_bufsize > size) {
				/*
				 * XXX if the source and destination regions
				 * overlap we have to copy backward to avoid
				 * clobbering any valid pages (i.e. pagemove
				 * implementations typically can't handle
				 * overlap).
				 */
				bdata += tbp->b_bufsize;
				while (bdata > (char *)tbp->b_data) {
					bdata -= CLBYTES;
					pagemove(bdata, bdata + size, CLBYTES);
				}
			} else
				pagemove(bdata, bdata + size, tbp->b_bufsize);
		}
		tbp->b_blkno = bn;
		tbp->b_flags |= flags | B_READ | B_ASYNC;
		++b_save->bs_nchildren;
		b_save->bs_children[i - 1] = tbp;
	}
	return(bp);
}
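
/*
 * Worked example of the backward copy in cluster_rbuild() above, with
 * illustrative numbers (not taken from the source): assume size = 8K,
 * CLBYTES = 4K, and tbp->b_bufsize = 12K.  The source region [0K,12K)
 * and destination [8K,20K) overlap in [8K,12K), so the data is moved
 * one CLBYTES cluster at a time from the top down: 8K->16K, 4K->12K,
 * then 0K->8K.  No page is overwritten before it has been moved.
 */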

/*
 * Either get a new buffer or grow the existing one.
 */
struct buf *
cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
	struct vnode *vp;
	struct buf *bp;
	long flags;
	daddr_t blkno;
	daddr_t lblkno;
	long size;
	int run;
{
	if (!bp) {
		bp = getblk(vp, lblkno, size, 0, 0);
		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			bp->b_blkno = blkno;
			return(bp);
		}
	}
	allocbuf(bp, run * size);
	bp->b_blkno = blkno;
	bp->b_iodone = cluster_callback;
	bp->b_flags |= flags | B_CALL;
	return(bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct cluster_save *b_save;
	struct buf **bpp, *tbp;
	long bsize;
	caddr_t cp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.
	 */
	if (bp->b_flags & B_ERROR)
		error = bp->b_error;

	b_save = (struct cluster_save *)(bp->b_saveaddr);
	bp->b_saveaddr = b_save->bs_saveaddr;

	bsize = b_save->bs_bufsize;
	cp = (char *)bp->b_data + bsize;
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.
	 */
	for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) {
		tbp = *bpp;
		pagemove(cp, tbp->b_data, bsize);
		tbp->b_bufsize += bsize;
		tbp->b_bcount = bsize;
		if (error) {
			tbp->b_flags |= B_ERROR;
			tbp->b_error = error;
		}
		biodone(tbp);
		bp->b_bufsize -= bsize;
		cp += bsize;
	}
	/*
	 * If there was excess memory in the cluster buffer,
	 * slide it up adjacent to the remaining valid data.
	 */
	if (bp->b_bufsize != bsize) {
		if (bp->b_bufsize < bsize)
			panic("cluster_callback: too little memory");
		pagemove(cp, (char *)bp->b_data + bsize, bp->b_bufsize - bsize);
	}
	bp->b_bcount = bsize;
	bp->b_iodone = NULL;
	free(b_save, M_SEGMENT);
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}
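
/*
 * Note on the bookkeeping used above: cluster_rbuild() and
 * cluster_wbuild() allocate the struct cluster_save header and the
 * bs_children array in a single malloc, which is why bs_children is
 * set to (struct buf **)(b_save + 1).  bs_saveaddr preserves the
 * parent buffer's previous b_saveaddr so that cluster_callback() can
 * restore it before the buffer is released.
 */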

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int clen;

	vp = bp->b_vp;
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(bp->b_bcount))) {
		if (vp->v_clen != 0)
			/*
			 * Write is not sequential.
			 */
			cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart,
			    vp->v_lastw - vp->v_cstart + 1, lbn);
		/*
		 * Consider beginning a cluster.
		 */
		if ((lbn + 1) * bp->b_bcount == filesize)
			/* End of file, make cluster as large as possible */
			clen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
		else if (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &clen) ||
		    bp->b_blkno == -1) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = clen;
		if (clen == 0) {		/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {			/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
		    vp->v_clen + 1, lbn);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the
		 * I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}
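
/*
 * Illustrative caller sketch (an assumption, not part of this file; the
 * exact form in ufs_readwrite.c may differ): the FFS write routine hands
 * each completed logical block to cluster_write() when clustering is
 * enabled, otherwise starting the write itself:
 *
 *	if (doclusterwrite)
 *		cluster_write(bp, ip->i_size);
 *	else
 *		bawrite(bp);
 */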

/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block (if last_bp == NULL).
 */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
	struct vnode *vp;
	struct buf *last_bp;
	long size;
	daddr_t start_lbn;
	int len;
	daddr_t lbn;
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp;
	caddr_t cp;
	int i, s;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_wbuild: size %d != filesize %d\n",
		    size, vp->v_mount->mnt_stat.f_iosize);
#endif
redo:
	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
		++start_lbn;
		--len;
	}

	/* Get more memory for current buffer */
	if (len <= 1) {
		if (last_bp) {
			bawrite(last_bp);
		} else if (len) {
			bp = getblk(vp, start_lbn, size, 0, 0);
			bawrite(bp);
		}
		return;
	}

	bp = getblk(vp, start_lbn, size, 0, 0);
	if (!(bp->b_flags & B_DELWRI)) {
		++start_lbn;
		--len;
		brelse(bp);
		goto redo;
	}

	/*
	 * Extra memory in the buffer, punt on this buffer.
	 * XXX we could handle this in most cases, but we would have to
	 * push the extra memory down to after our max possible cluster
	 * size and then potentially pull it back up if the cluster was
	 * terminated prematurely--too much hassle.
	 */
	if (bp->b_bcount != bp->b_bufsize) {
		++start_lbn;
		--len;
		bawrite(bp);
		goto redo;
	}

	--len;
	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bcount = bp->b_bcount;
	b_save->bs_bufsize = bp->b_bufsize;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	bp->b_flags |= B_CALL;
	bp->b_iodone = cluster_callback;
	cp = (char *)bp->b_data + size;
	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
		/*
		 * Block is not in core or the non-sequential block
		 * ending our cluster was part of the cluster (in which
		 * case we don't want to write it twice).
		 */
		if (!incore(vp, start_lbn) ||
		    (last_bp == NULL && start_lbn == lbn))
			break;

		/*
		 * Get the desired block buffer (unless it is the final
		 * sequential block whose buffer was passed in explicitly
		 * as last_bp).
		 */
		if (last_bp == NULL || start_lbn != lbn) {
			tbp = getblk(vp, start_lbn, size, 0, 0);
			if (!(tbp->b_flags & B_DELWRI)) {
				brelse(tbp);
				break;
			}
		} else
			tbp = last_bp;

		++b_save->bs_nchildren;

		/* Move memory from children to parent */
		if (tbp->b_blkno != (bp->b_blkno + btodb(bp->b_bufsize))) {
			printf("Clustered Block: %d addr %x bufsize: %d\n",
			    bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
			printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
			    tbp->b_blkno);
			panic("Clustered write to wrong blocks");
		}

		pagemove(tbp->b_data, cp, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;

		tbp->b_bufsize -= size;
		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= (B_ASYNC | B_AGE);
		s = splbio();
		reassignbuf(tbp, tbp->b_vp);		/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		b_save->bs_children[i] = tbp;

		cp += size;
	}

	if (i == 0) {
		/* None to cluster */
		bp->b_saveaddr = b_save->bs_saveaddr;
		bp->b_flags &= ~B_CALL;
		bp->b_iodone = NULL;
		free(b_save, M_SEGMENT);
	}
	bawrite(bp);
	if (i < len) {
		len -= i + 1;
		start_lbn += 1;
		goto redo;
	}
}