/*
 * Copyright (c) 1982, 1986, 1989 Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms are permitted
 * provided that the above copyright notice and this paragraph are
 * duplicated in all such forms and that any documentation,
 * advertising materials, and other materials related to such
 * distribution and use acknowledge that the software was developed
 * by the University of California, Berkeley.  The name of the
 * University may not be used to endorse or promote products derived
 * from this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 *
 *	@(#)vfs_cluster.c	7.29 (Berkeley) 05/30/90
 */

#include "param.h"
#include "user.h"
#include "buf.h"
#include "vnode.h"
#include "specdev.h"
#include "mount.h"
#include "trace.h"
#include "ucred.h"

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size);
	if (bp->b_flags&(B_DONE|B_DELWRI)) {
		trace(TR_BREADHIT, pack(vp, size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	if (bp->b_rcred == NOCRED && cred != NOCRED) {
		crhold(cred);
		bp->b_rcred = cred;
	}
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp, size), blkno);
	u.u_ru.ru_inblock++;		/* pay for read */
	return (biowait(bp));
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno; int rabsize;
	struct ucred *cred;
	struct buf **bpp;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	/*
	 * If the block isn't in core, then allocate
	 * a buffer and initiate i/o (getblk checks
	 * for a cache hit).
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size);
		if ((bp->b_flags&(B_DONE|B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breada");
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp, size), blkno);
			u.u_ru.ru_inblock++;	/* pay for read */
		} else
			trace(TR_BREADHIT, pack(vp, size), blkno);
	}

	/*
	 * If there's a read-ahead block, start i/o
	 * on it also (as above).
	 */
	if (!incore(vp, rablkno)) {
		rabp = getblk(vp, rablkno, rabsize);
		if (rabp->b_flags & (B_DONE|B_DELWRI)) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(vp, rabsize), rablkno);
		} else {
			rabp->b_flags |= B_READ|B_ASYNC;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA, pack(vp, rabsize), rablkno);
			u.u_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If block was in core, let bread get it.
	 * If block wasn't in core, then the read was started
	 * above, and just wait for it.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, cred, bpp));
	return (biowait(bp));
}
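/*
 * Illustrative sketch (not part of the original file): a file system
 * read path would typically call breada when it knows the next block,
 * and bread otherwise.  The identifiers "lbn", "rablock", and "bsize"
 * below are hypothetical; the data address b_un.b_addr is this era's
 * struct buf layout.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	if (rablock != 0)
 *		error = breada(vp, lbn, bsize, rablock, bsize, cred, &bp);
 *	else
 *		error = bread(vp, lbn, bsize, cred, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... copy out of bp->b_un.b_addr, then brelse(bp) ...
 */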
/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
	register struct buf *bp;
{
	register int flag;
	int s, error;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if ((flag&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
	else
		reassignbuf(bp, bp->b_vp);
	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	s = splbio();
	bp->b_vp->v_numoutput++;
	splx(s);
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await i/o completion.
	 * If the write was "delayed", then we put the buffer on
	 * the q of blocks awaiting i/o completion status.
	 */
	if ((flag&B_ASYNC) == 0) {
		error = biowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI) {
		bp->b_flags |= B_AGE;
		error = 0;
	} else
		error = 0;		/* async, not delayed: no status yet */
	return (error);
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
	register struct buf *bp;
{

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		u.u_ru.ru_oublock++;		/* no one paid yet */
	}
	/*
	 * If this is a tape drive, the write must be initiated.
	 */
	if (VOP_IOCTL(bp->b_vp, 0, B_TAPE, 0, NOCRED) == 0) {
		bawrite(bp);
	} else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release the buffer, with no I/O implied.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	register s;

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	/*
	 * If a process is waiting for the buffer, or
	 * is waiting for a free buffer, awaken it.
	 */
	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags&B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	/*
	 * Retry I/O for locked buffers rather than invalidating them.
	 */
	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
		bp->b_flags &= ~B_ERROR;

	/*
	 * Disassociate buffers that are no longer valid.
	 */
	if (bp->b_flags & (B_NOCACHE|B_ERROR))
		bp->b_flags |= B_INVAL;
	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR|B_INVAL))) {
		if (bp->b_vp)
			brelvp(bp);
		bp->b_flags &= ~B_DELWRI;
	}
	/*
	 * Stick the buffer back on a free list.
	 */
	s = splbio();
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bfreelist[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR|B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE|B_NOCACHE);
	splx(s);
}
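/*
 * Illustrative sketch (not part of the original file): callers pick
 * among the three write flavors above by urgency.  "bp" is assumed to
 * be a busy buffer the caller owns.
 *
 *	error = bwrite(bp);	-- start I/O and wait for completion
 *	bawrite(bp);		-- start I/O, return immediately
 *	bdwrite(bp);		-- just mark B_DELWRI; written when reclaimed
 */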
/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
 */
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(vp, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

/*
 * Return a block if it is in memory.
 */
baddr(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{

	if (incore(vp, blkno))
		return (bread(vp, blkno, size, cred, bpp));
	*bpp = 0;
	return (0);
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * We use splx here because this routine may be called
 * on the interrupt stack during a dump, and we don't
 * want to lower the ipl back to 0.
 */
struct buf *
getblk(vp, blkno, size)
	register struct vnode *vp;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * Search the cache for the block.  If we hit, but
	 * the buffer is in use for i/o, then we wait until
	 * the i/o has completed.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp ||
		    bp->b_flags&B_INVAL)
			continue;
		s = splbio();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			splx(s);
			goto loop;
		}
		bremfree(bp);
		bp->b_flags |= B_BUSY;
		splx(s);
		if (bp->b_bcount != size) {
			printf("getblk: stray size\n");
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	bp = getnewbuf();
	bfree(bp);
	bremhash(bp);
	bgetvp(vp, bp);
	bp->b_lblkno = blkno;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	brealloc(bp, size);
	return (bp);
}

/*
 * Get an empty block,
 * not assigned to any particular device.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bfree(bp);
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, flist);
	brealloc(bp, size);
	return (bp);
}

/*
 * Allocate space associated with a buffer.
 */
brealloc(bp, size)
	register struct buf *bp;
	int size;
{

	if (size == bp->b_bcount)
		return;
	allocbuf(bp, size);
}
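/*
 * Illustrative sketch (not part of the original file): geteblk yields
 * a scratch buffer with no identity in the cache, e.g. for a private
 * transfer or bounce buffer.  "bsize" is hypothetical.
 *
 *	struct buf *ebp;
 *
 *	ebp = geteblk(bsize);
 *	... use ebp->b_un.b_addr as scratch space ...
 *	brelse(ebp);	-- goes to the front of BQ_AGE, since B_INVAL is set
 */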
/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	register struct ucred *cred;
	int s;

loop:
	s = splbio();
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO+1);
		splx(s);
		goto loop;
	}
	bp = dp->av_forw;
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	splx(s);
	if (bp->b_flags & B_DELWRI) {
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	bp->b_flags = B_BUSY;
	return (bp);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	/*
	 * Pick up the device's error number and pass it to the user;
	 * if there is an error but the number is 0 set a generalized code.
	 */
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}

/*
 * Mark I/O complete on a buffer.
 * If someone should be called, e.g. the pageout
 * daemon, do so.  Otherwise, wake up anyone
 * waiting for it.
 */
biodone(bp)
	register struct buf *bp;
{
	register struct vnode *vp;

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if ((bp->b_flags & B_READ) == 0) {
		bp->b_dirtyoff = bp->b_dirtyend = 0;
		if (vp = bp->b_vp) {
			vp->v_numoutput--;
			if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) {
				if (vp->v_numoutput < 0)
					panic("biodone: neg numoutput");
				vp->v_flag &= ~VBWAIT;
				wakeup((caddr_t)&vp->v_numoutput);
			}
		}
	}
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

/*
 * Make sure all write-behind blocks associated
 * with mount point are flushed out (from sync).
 */
mntflushbuf(mountp, flags)
	struct mount *mountp;
	int flags;
{
	register struct vnode *vp;

	if ((mountp->mnt_flag & MNT_MPBUSY) == 0)
		panic("mntflushbuf: not busy");
loop:
	for (vp = mountp->mnt_mounth; vp; vp = vp->v_mountf) {
		if (vget(vp))
			goto loop;
		vflushbuf(vp, flags);
		vput(vp);
		if (vp->v_mount != mountp)
			goto loop;
	}
}
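/*
 * Illustrative sketch (not part of the original file): a driver's
 * interrupt routine completes a transfer by posting status and calling
 * biodone, which wakes biowait above or fires b_iodone.  "xxintr" and
 * "resid" are hypothetical.
 *
 *	xxintr()
 *	{
 *		...
 *		if (hard error) {
 *			bp->b_flags |= B_ERROR;
 *			bp->b_error = EIO;
 *		}
 *		bp->b_resid = resid;	-- bytes not transferred
 *		biodone(bp);
 *	}
 */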
/*
 * Flush all dirty buffers associated with a vnode.
 */
vflushbuf(vp, flags)
	register struct vnode *vp;
	int flags;
{
	register struct buf *bp;
	struct buf *nbp;
	int s;

loop:
	s = splbio();
	for (bp = vp->v_dirtyblkhd; bp; bp = nbp) {
		nbp = bp->b_blockf;
		if ((bp->b_flags & B_BUSY))
			continue;
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("vflushbuf: not dirty");
		bremfree(bp);
		bp->b_flags |= B_BUSY;
		splx(s);
		/*
		 * Wait for I/O associated with indirect blocks to complete,
		 * since there is no way to quickly wait for them below.
		 * NB - This is really specific to ufs, but is done here
		 * as it is easier and quicker.
		 */
		if (bp->b_vp == vp || (flags & B_SYNC) == 0) {
			(void) bawrite(bp);
			s = splbio();
		} else {
			(void) bwrite(bp);
			goto loop;
		}
	}
	splx(s);
	if ((flags & B_SYNC) == 0)
		return;
	s = splbio();
	while (vp->v_numoutput) {
		vp->v_flag |= VBWAIT;
		sleep((caddr_t)&vp->v_numoutput, PRIBIO+1);
	}
	splx(s);
	if (vp->v_dirtyblkhd) {
		vprint("vflushbuf: dirty", vp);
		goto loop;
	}
}

/*
 * Invalidate in core blocks belonging to closed or unmounted filesystem.
 *
 * Go through the list of vnodes associated with the file system;
 * for each vnode invalidate any buffers that it holds.  Normally
 * this routine is preceded by a bflush call, so that on a quiescent
 * filesystem there will be no dirty buffers when we are done.  Binval
 * returns the count of dirty buffers when it is finished.
 */
mntinvalbuf(mountp)
	struct mount *mountp;
{
	register struct vnode *vp;
	int dirty = 0;

	if ((mountp->mnt_flag & MNT_MPBUSY) == 0)
		panic("mntinvalbuf: not busy");
loop:
	for (vp = mountp->mnt_mounth; vp; vp = vp->v_mountf) {
		if (vget(vp))
			goto loop;
		dirty += vinvalbuf(vp, 1);
		vput(vp);
		if (vp->v_mount != mountp)
			goto loop;
	}
	return (dirty);
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
vinvalbuf(vp, save)
	register struct vnode *vp;
	int save;
{
	register struct buf *bp;
	struct buf *nbp, *blist;
	int s, dirty = 0;

	for (;;) {
		if (blist = vp->v_dirtyblkhd)
			/* void */;
		else if (blist = vp->v_cleanblkhd)
			/* void */;
		else
			break;
		for (bp = blist; bp; bp = nbp) {
			nbp = bp->b_blockf;
			s = splbio();
			if (bp->b_flags & B_BUSY) {
				bp->b_flags |= B_WANTED;
				sleep((caddr_t)bp, PRIBIO+1);
				splx(s);
				break;
			}
			bremfree(bp);
			bp->b_flags |= B_BUSY;
			splx(s);
			if (save && (bp->b_flags & B_DELWRI)) {
				dirty++;
				(void) bwrite(bp);
				break;
			}
			if (bp->b_vp != vp)
				reassignbuf(bp, bp->b_vp);
			else
				bp->b_flags |= B_INVAL;
			brelse(bp);
		}
	}
	if (vp->v_dirtyblkhd || vp->v_cleanblkhd)
		panic("vinvalbuf: flush failed");
	return (dirty);
}
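/*
 * Illustrative sketch (not part of the original file): an unmount or
 * close path would typically flush and then invalidate, checking that
 * nothing turned up dirty.  "mp" is a hypothetical busied mount point,
 * and the EBUSY return is an assumption about caller policy.
 *
 *	mntflushbuf(mp, 0);
 *	if (mntinvalbuf(mp) != 0)
 *		return (EBUSY);	-- dirty buffers appeared during the flush
 */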
/*
 * Associate a buffer with a vnode.
 */
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{

	if (bp->b_vp)
		panic("bgetvp: not free");
	VHOLD(vp);
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	/*
	 * Insert onto list for new vnode.
	 */
	if (vp->v_cleanblkhd) {
		bp->b_blockf = vp->v_cleanblkhd;
		bp->b_blockb = &vp->v_cleanblkhd;
		vp->v_cleanblkhd->b_blockb = &bp->b_blockf;
		vp->v_cleanblkhd = bp;
	} else {
		vp->v_cleanblkhd = bp;
		bp->b_blockb = &vp->v_cleanblkhd;
		bp->b_blockf = NULL;
	}
}

/*
 * Disassociate a buffer from a vnode.
 */
brelvp(bp)
	register struct buf *bp;
{
	struct buf *bq;
	struct vnode *vp;

	if (bp->b_vp == (struct vnode *) 0)
		panic("brelvp: NULL");
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_blockb) {
		if (bq = bp->b_blockf)
			bq->b_blockb = bp->b_blockb;
		*bp->b_blockb = bq;
		bp->b_blockf = NULL;
		bp->b_blockb = NULL;
	}
	vp = bp->b_vp;
	bp->b_vp = (struct vnode *) 0;
	HOLDRELE(vp);
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 */
reassignbuf(bp, newvp)
	register struct buf *bp;
	register struct vnode *newvp;
{
	register struct buf *bq, **listheadp;

	if (newvp == NULL)
		panic("reassignbuf: NULL");
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_blockb) {
		if (bq = bp->b_blockf)
			bq->b_blockb = bp->b_blockb;
		*bp->b_blockb = bq;
	}
	/*
	 * If dirty, put on list of dirty buffers;
	 * otherwise insert onto list of clean buffers.
	 */
	if (bp->b_flags & B_DELWRI)
		listheadp = &newvp->v_dirtyblkhd;
	else
		listheadp = &newvp->v_cleanblkhd;
	if (*listheadp) {
		bp->b_blockf = *listheadp;
		bp->b_blockb = listheadp;
		bp->b_blockf->b_blockb = &bp->b_blockf;
		*listheadp = bp;
	} else {
		*listheadp = bp;
		bp->b_blockb = listheadp;
		bp->b_blockf = NULL;
	}
}
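/*
 * Illustrative note (not part of the original file): besides moving a
 * buffer between vnodes, reassignbuf also migrates a buffer between
 * its own vnode's clean and dirty lists when B_DELWRI changes, which
 * is how bwrite and bdwrite above use it:
 *
 *	bp->b_flags |= B_DELWRI;	-- buffer is now dirty
 *	reassignbuf(bp, bp->b_vp);	-- moves it to v_dirtyblkhd
 */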