/*
 * Copyright (c) 1982, 1986, 1989 Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms are permitted
 * provided that the above copyright notice and this paragraph are
 * duplicated in all such forms and that any documentation,
 * advertising materials, and other materials related to such
 * distribution and use acknowledge that the software was developed
 * by the University of California, Berkeley.  The name of the
 * University may not be used to endorse or promote products derived
 * from this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 *
 *	@(#)vfs_cluster.c	7.5 (Berkeley) 05/09/89
 */

#include "param.h"
#include "user.h"
#include "buf.h"
#include "vnode.h"
#include "trace.h"

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
bread(vp, blkno, size, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct buf **bpp;
{
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size);
	if (bp->b_flags&(B_DONE|B_DELWRI)) {
		trace(TR_BREADHIT, pack(vp->v_mount->m_fsid[0], size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp->v_mount->m_fsid[0], size), blkno);
	u.u_ru.ru_inblock++;		/* pay for read */
	return (biowait(bp));
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
breada(vp, blkno, size, rablkno, rabsize, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno; int rabsize;
	struct buf **bpp;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	/*
	 * If the block isn't in core, then allocate
	 * a buffer and initiate i/o (getblk checks
	 * for a cache hit).
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size);
		if ((bp->b_flags&(B_DONE|B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breada");
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp->v_mount->m_fsid[0], size),
			    blkno);
			u.u_ru.ru_inblock++;	/* pay for read */
		} else
			trace(TR_BREADHIT, pack(vp->v_mount->m_fsid[0], size),
			    blkno);
	}

	/*
	 * If there's a read-ahead block, start i/o
	 * on it also (as above).
	 */
	if (rablkno && !incore(vp, rablkno)) {
		rabp = getblk(vp, rablkno, rabsize);
		if (rabp->b_flags & (B_DONE|B_DELWRI)) {
			brelse(rabp);
			trace(TR_BREADHITRA,
			    pack(vp->v_mount->m_fsid[0], rabsize), rablkno);
		} else {
			rabp->b_flags |= B_READ|B_ASYNC;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA,
			    pack(vp->v_mount->m_fsid[0], rabsize), rablkno);
			u.u_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If block was in core, let bread get it.
	 * If block wasn't in core, then the read was started
	 * above, so just wait for it.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, bpp));
	return (biowait(bp));
}

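/*
 * A minimal sketch of the usual calling pattern for bread (the
 * fs_bsize field and the b_un.b_addr data pointer are assumptions
 * about the caller's file system and this era's buf.h; lbn is a
 * hypothetical logical block number):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lbn, fs->fs_bsize, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... use the data at bp->b_un.b_addr ...
 *	brelse(bp);
 *
 * A cache hit returns immediately with B_DONE set; a miss sleeps
 * in biowait until the device's strategy routine calls biodone.
 */
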
/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
	register struct buf *bp;
{
	register int flag;
	int error = 0;		/* async, non-delayed writes return 0 */

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if ((flag&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;	/* no one paid yet */
	trace(TR_BWRITE,
	    pack(bp->b_vp->v_mount->m_fsid[0], bp->b_bcount), bp->b_blkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await i/o completion.
	 * If the write was "delayed", then we put the buffer on
	 * the q of blocks awaiting i/o completion status.
	 */
	if ((flag&B_ASYNC) == 0) {
		error = biowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI)
		bp->b_flags |= B_AGE;
	return (error);
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
	register struct buf *bp;
{

	if ((bp->b_flags&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;	/* no one paid yet */
#ifdef notdef
	/*
	 * This does not work for buffers associated with
	 * vnodes that are remote - they have no dev.
	 * Besides, we don't use bio with tapes, so rather
	 * than develop a fix, we just ifdef this out for now.
	 */
	if (bdevsw[major(bp->b_dev)].d_flags & B_TAPE)
		bawrite(bp);
	else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
#endif
	bp->b_flags |= B_DELWRI | B_DONE;
	brelse(bp);
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release the buffer, with no I/O implied.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	register int s;

	trace(TR_BRELSE,
	    pack(bp->b_vp->v_mount->m_fsid[0], bp->b_bufsize), bp->b_blkno);
	/*
	 * If someone's waiting for the buffer, or
	 * is waiting for a buffer, wake 'em up.
	 */
	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags&B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	if (bp->b_flags & B_NOCACHE)
		bp->b_flags |= B_INVAL;
	if (bp->b_flags&B_ERROR)
		if (bp->b_flags & B_LOCKED)
			bp->b_flags &= ~B_ERROR;	/* try again later */
		else
			brelvp(bp);			/* no assoc */

	/*
	 * Stick the buffer back on a free list.
	 */
	s = splbio();
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bfreelist[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR|B_INVAL)) {
		/* block has no info ... put at front of most-free list */
		flist = &bfreelist[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE|B_NOCACHE);
	splx(s);
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
 */
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(vp, blkno);	/* hash on the vnode, as in getblk */
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_blkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

/*
 * Read in the block if it is in core already,
 * otherwise return failure without doing any i/o.
 */
baddr(vp, blkno, size, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct buf **bpp;
{

	if (incore(vp, blkno))
		return (bread(vp, blkno, size, bpp));
	*bpp = 0;
	return (0);
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * If we find the buffer, but it is dirty (marked DELWRI) and
 * its size is changing, we must write it out first.  When the
 * buffer is shrinking, the write is done by brealloc to avoid
 * losing the unwritten data.  When the buffer is growing, the
 * write is done by getblk, so that bread will not read stale
 * disk data over the modified data in the buffer.
 *
 * We use splx here because this routine may be called
 * on the interrupt stack during a dump, and we don't
 * want to lower the ipl back to 0.
 */
struct buf *
getblk(vp, blkno, size)
	register struct vnode *vp;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * To prevent overflow of 32-bit ints when converting block
	 * numbers to byte offsets, blknos > 2^32 / DEV_BSIZE are set
	 * to the maximum number that can be converted to a byte offset
	 * without overflow.  This is historic code; what bug it fixed,
	 * or whether it is still a reasonable thing to do is open to
	 * dispute.  mkm 9/85
	 */
	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-DEV_BSHIFT))
		blkno = 1 << ((sizeof(int)*NBBY-DEV_BSHIFT) + 1);
	/*
	 * Search the cache for the block.  If we hit, but
	 * the buffer is in use for i/o, then we wait until
	 * the i/o has completed.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_vp != vp ||
		    bp->b_flags&B_INVAL)
			continue;
		s = splbio();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(bp);
		if (bp->b_bcount != size) {
			if (bp->b_bcount < size && (bp->b_flags&B_DELWRI)) {
				bp->b_flags &= ~B_ASYNC;
				(void) bwrite(bp);
				goto loop;
			}
			if (brealloc(bp, size) == 0)
				goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	bp = getnewbuf();
	bfree(bp);
	bremhash(bp);
	if (bp->b_vp)
		brelvp(bp);
	vp->v_count++;
	bp->b_vp = vp;
	bp->b_dev = vp->v_rdev;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	if (brealloc(bp, size) == 0)
		goto loop;
	return (bp);
}

/*
 * Get an empty block,
 * not assigned to any particular device.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
loop:
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bfree(bp);
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	brelvp(bp);
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, flist);
	if (brealloc(bp, size) == 0)
		goto loop;
	return (bp);
}

/*
 * Allocate space associated with a buffer.
 * If we can't get space, the buffer is released.
 */
brealloc(bp, size)
	register struct buf *bp;
	int size;
{
	daddr_t start, last;
	register struct buf *ep;
	struct buf *dp;
	int s;

	/*
	 * First need to make sure that all overlapping previous I/O
	 * is dispensed with.
	 */
	if (size == bp->b_bcount)
		return (1);
	if (size < bp->b_bcount) {
		if (bp->b_flags & B_DELWRI) {
			(void) bwrite(bp);
			return (0);
		}
		if (bp->b_flags & B_LOCKED)
			panic("brealloc");
		return (allocbuf(bp, size));
	}
	bp->b_flags &= ~B_DONE;
	if (bp->b_vp == (struct vnode *)0)
		return (allocbuf(bp, size));

	trace(TR_BREALLOC,
	    pack(bp->b_vp->v_mount->m_fsid[0], size), bp->b_blkno);
	/*
	 * Search cache for any buffers that overlap the one that we
	 * are trying to allocate.  Overlapping buffers must be marked
	 * invalid, after being written out if they are dirty (indicated
	 * by B_DELWRI).  A disk block must be mapped by at most one buffer
	 * at any point in time.  Care must be taken to avoid deadlocking
	 * when two buffers are trying to get the same set of disk blocks.
	 */
	start = bp->b_blkno;
	last = start + btodb(size) - 1;
	dp = BUFHASH(bp->b_vp, bp->b_blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep == bp || ep->b_vp != bp->b_vp ||
		    (ep->b_flags & B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + btodb(ep->b_bcount) <= start)
			continue;
		s = splbio();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(ep);
		if (ep->b_flags & B_DELWRI) {
			(void) bwrite(ep);
			goto loop;
		}
		ep->b_flags |= B_INVAL;
		brelse(ep);
	}
	return (allocbuf(bp, size));
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	int s;

loop:
	s = splbio();
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO+1);
		splx(s);
		goto loop;
	}
	splx(s);
	bp = dp->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		(void) bwrite(bp);
		goto loop;
	}
	trace(TR_BRELSE,
	    pack(bp->b_vp->v_mount->m_fsid[0], bp->b_bufsize), bp->b_blkno);
	brelvp(bp);
	bp->b_flags = B_BUSY;
	return (bp);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags&B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	/*
	 * Pick up the device's error number and pass it to the user;
	 * if there is an error but the number is 0 set a generalized code.
	 */
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}

/*
 * Mark I/O complete on a buffer.
 * If someone should be called, e.g. the pageout
 * daemon, do so.  Otherwise, wake up anyone
 * waiting for it.
 */
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

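/*
 * A minimal sketch of the B_CALL path handled by biodone above;
 * myiodone is a hypothetical completion handler, not defined in
 * this file.  A caller that cannot sleep in biowait can ask for
 * a callback instead:
 *
 *	extern int myiodone();
 *
 *	bp->b_flags |= B_READ | B_ASYNC | B_CALL;
 *	bp->b_iodone = myiodone;
 *	VOP_STRATEGY(bp);
 *
 * On completion biodone clears B_CALL and invokes (*b_iodone)(bp),
 * possibly at interrupt level, so the handler must not sleep and
 * is responsible for disposing of the buffer.
 */
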
/*
 * Ensure that no part of a specified block is in an incore buffer.
#ifdef SECSIZE
 * "size" is given in device blocks (the units of b_blkno).
#endif SECSIZE
 */
blkflush(vp, blkno, size)
	struct vnode *vp;
	daddr_t blkno;
	long size;
{
	register struct buf *ep;
	struct buf *dp;
	daddr_t start, last;
	int s, error, allerrors = 0;

	start = blkno;
	last = start + btodb(size) - 1;
	dp = BUFHASH(vp, blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep->b_vp != vp || (ep->b_flags & B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + btodb(ep->b_bcount) <= start)
			continue;
		s = splbio();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		if (ep->b_flags & B_DELWRI) {
			splx(s);
			notavail(ep);
			if (error = bwrite(ep))
				allerrors = error;
			goto loop;
		}
		splx(s);
	}
	return (allerrors);
}

/*
 * Make sure all write-behind blocks associated with dev (or with
 * all devices, if dev is NODEV) are flushed out (from sync).
 */
bflush(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct buf *flist;
	int s;

loop:
	s = splbio();
	for (flist = bfreelist; flist < &bfreelist[BQ_EMPTY]; flist++)
		for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
			if ((bp->b_flags & B_DELWRI) == 0)
				continue;
			if (dev == NODEV || dev == bp->b_dev) {
				bp->b_flags |= B_ASYNC;
				notavail(bp);
				(void) bwrite(bp);
				splx(s);
				goto loop;
			}
		}
	splx(s);
}

#ifdef unused
/*
 * Invalidate blocks associated with vp which are on the freelist.
 * Make sure all write-behind blocks associated with vp are flushed out.
 */
binvalfree(vp)
	struct vnode *vp;
{
	register struct buf *bp;
	register struct buf *flist;
	int s;

loop:
	s = splbio();
	for (flist = bfreelist; flist < &bfreelist[BQ_EMPTY]; flist++)
		for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
			if (vp == (struct vnode *)0 || vp == bp->b_vp) {
				if (bp->b_flags & B_DELWRI) {
					bp->b_flags |= B_ASYNC;
					notavail(bp);
					(void) splx(s);
					(void) bwrite(bp);
				} else {
					bp->b_flags |= B_INVAL;
					brelvp(bp);
					(void) splx(s);
				}
				goto loop;
			}
		}
	(void) splx(s);
}
#endif /* unused */

/*
 * Invalidate in core blocks belonging to closed or umounted filesystem.
 *
 * This is not nicely done at all - the buffer ought to be removed from the
 * hash chains & have its dev/blkno fields clobbered, but unfortunately we
 * can't do that here, as it is quite possible that the block is still
 * being used for i/o.  Eventually, all disc drivers should be forced to
 * have a close routine, which ought to ensure that the queue is empty,
 * then properly flush the queues.  Until that happy day, this suffices
 * for correctness.			... kre
 */
binval(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct bufhd *hp;
#define	dp	((struct buf *)hp)

loop:
	for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++)
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
			if (bp->b_dev == dev && (bp->b_flags & B_INVAL) == 0) {
				bp->b_flags |= B_INVAL;
				brelvp(bp);
				goto loop;
			}
}

/*
 * Disassociate the buffer from whatever vnode it had been using,
 * releasing the vnode reference taken in getblk.
 */
brelvp(bp)
	struct buf *bp;
{
	struct vnode *vp;

	if (bp->b_vp == (struct vnode *)0)
		return;
	vp = bp->b_vp;
	bp->b_vp = (struct vnode *)0;
	vrele(vp);
}

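/*
 * A minimal sketch contrasting the three write interfaces above;
 * lbn and size stand for a caller's block number and block size:
 *
 *	bp = getblk(vp, lbn, size);
 *	... modify the buffer contents ...
 *	error = bwrite(bp);		wait for i/o, then release
 *	bdwrite(bp);			mark B_DELWRI, write later
 *	bawrite(bp);			start i/o, do not wait
 *
 * Only one of the three is used for a given buffer.  A delayed
 * write is eventually pushed out by blkflush or bflush, or by
 * getnewbuf when the buffer is reclaimed for another block.
 */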