1 /* 2 * Copyright (c) 1982, 1986, 1989 Regents of the University of California. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms are permitted 6 * provided that the above copyright notice and this paragraph are 7 * duplicated in all such forms and that any documentation, 8 * advertising materials, and other materials related to such 9 * distribution and use acknowledge that the software was developed 10 * by the University of California, Berkeley. The name of the 11 * University may not be used to endorse or promote products derived 12 * from this software without specific prior written permission. 13 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR 14 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED 15 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 16 * 17 * @(#)vfs_bio.c 7.8 (Berkeley) 08/15/89 18 */ 19 20 #include "param.h" 21 #include "user.h" 22 #include "buf.h" 23 #include "vnode.h" 24 #include "trace.h" 25 26 /* 27 * Read in (if necessary) the block and return a buffer pointer. 
28 */ 29 bread(vp, blkno, size, bpp) 30 struct vnode *vp; 31 daddr_t blkno; 32 int size; 33 struct buf **bpp; 34 { 35 register struct buf *bp; 36 37 if (size == 0) 38 panic("bread: size 0"); 39 *bpp = bp = getblk(vp, blkno, size); 40 if (bp->b_flags&(B_DONE|B_DELWRI)) { 41 trace(TR_BREADHIT, pack(vp->v_mount->m_fsid[0], size), blkno); 42 return (0); 43 } 44 bp->b_flags |= B_READ; 45 if (bp->b_bcount > bp->b_bufsize) 46 panic("bread"); 47 VOP_STRATEGY(bp); 48 trace(TR_BREADMISS, pack(vp->v_mount->m_fsid[0], size), blkno); 49 u.u_ru.ru_inblock++; /* pay for read */ 50 return (biowait(bp)); 51 } 52 53 /* 54 * Read in the block, like bread, but also start I/O on the 55 * read-ahead block (which is not allocated to the caller) 56 */ 57 breada(vp, blkno, size, rablkno, rabsize, bpp) 58 struct vnode *vp; 59 daddr_t blkno; int size; 60 daddr_t rablkno; int rabsize; 61 struct buf **bpp; 62 { 63 register struct buf *bp, *rabp; 64 65 bp = NULL; 66 /* 67 * If the block isn't in core, then allocate 68 * a buffer and initiate i/o (getblk checks 69 * for a cache hit). 70 */ 71 if (!incore(vp, blkno)) { 72 *bpp = bp = getblk(vp, blkno, size); 73 if ((bp->b_flags&(B_DONE|B_DELWRI)) == 0) { 74 bp->b_flags |= B_READ; 75 if (bp->b_bcount > bp->b_bufsize) 76 panic("breada"); 77 VOP_STRATEGY(bp); 78 trace(TR_BREADMISS, pack(vp->v_mount->m_fsid[0], size), 79 blkno); 80 u.u_ru.ru_inblock++; /* pay for read */ 81 } else 82 trace(TR_BREADHIT, pack(vp->v_mount->m_fsid[0], size), 83 blkno); 84 } 85 86 /* 87 * If there's a read-ahead block, start i/o 88 * on it also (as above). 
89 */ 90 if (rablkno && !incore(vp, rablkno)) { 91 rabp = getblk(vp, rablkno, rabsize); 92 if (rabp->b_flags & (B_DONE|B_DELWRI)) { 93 brelse(rabp); 94 trace(TR_BREADHITRA, 95 pack(vp->v_mount->m_fsid[0], rabsize), blkno); 96 } else { 97 rabp->b_flags |= B_READ|B_ASYNC; 98 if (rabp->b_bcount > rabp->b_bufsize) 99 panic("breadrabp"); 100 VOP_STRATEGY(rabp); 101 trace(TR_BREADMISSRA, 102 pack(vp->v_mount->m_fsid[0], rabsize), rablock); 103 u.u_ru.ru_inblock++; /* pay in advance */ 104 } 105 } 106 107 /* 108 * If block was in core, let bread get it. 109 * If block wasn't in core, then the read was started 110 * above, and just wait for it. 111 */ 112 if (bp == NULL) 113 return (bread(vp, blkno, size, bpp)); 114 return (biowait(bp)); 115 } 116 117 /* 118 * Write the buffer, waiting for completion. 119 * Then release the buffer. 120 */ 121 bwrite(bp) 122 register struct buf *bp; 123 { 124 register int flag; 125 int error; 126 127 flag = bp->b_flags; 128 bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); 129 if ((flag&B_DELWRI) == 0) 130 u.u_ru.ru_oublock++; /* noone paid yet */ 131 trace(TR_BWRITE, 132 pack(bp->b_vp->v_mount->m_fsid[0], bp->b_bcount), bp->b_blkno); 133 if (bp->b_bcount > bp->b_bufsize) 134 panic("bwrite"); 135 VOP_STRATEGY(bp); 136 137 /* 138 * If the write was synchronous, then await i/o completion. 139 * If the write was "delayed", then we put the buffer on 140 * the q of blocks awaiting i/o completion status. 141 */ 142 if ((flag&B_ASYNC) == 0) { 143 error = biowait(bp); 144 brelse(bp); 145 } else if (flag & B_DELWRI) { 146 bp->b_flags |= B_AGE; 147 error = 0; 148 } 149 return (error); 150 } 151 152 /* 153 * Release the buffer, marking it so that if it is grabbed 154 * for another purpose it will be written out before being 155 * given up (e.g. when writing a partial block where it is 156 * assumed that another write for the same block will soon follow). 
157 * This can't be done for magtape, since writes must be done 158 * in the same order as requested. 159 */ 160 bdwrite(bp) 161 register struct buf *bp; 162 { 163 164 if ((bp->b_flags&B_DELWRI) == 0) 165 u.u_ru.ru_oublock++; /* noone paid yet */ 166 #ifdef notdef 167 /* 168 * This does not work for buffers associated with 169 * vnodes that are remote - they have no dev. 170 * Besides, we don't use bio with tapes, so rather 171 * than develop a fix, we just ifdef this out for now. 172 */ 173 if (bdevsw[major(bp->b_dev)].d_flags & B_TAPE) 174 bawrite(bp); 175 else { 176 bp->b_flags |= B_DELWRI | B_DONE; 177 brelse(bp); 178 } 179 #endif 180 bp->b_flags |= B_DELWRI | B_DONE; 181 brelse(bp); 182 } 183 184 /* 185 * Release the buffer, start I/O on it, but don't wait for completion. 186 */ 187 bawrite(bp) 188 register struct buf *bp; 189 { 190 191 bp->b_flags |= B_ASYNC; 192 (void) bwrite(bp); 193 } 194 195 /* 196 * Release the buffer, with no I/O implied. 197 */ 198 brelse(bp) 199 register struct buf *bp; 200 { 201 register struct buf *flist; 202 register s; 203 204 trace(TR_BRELSE, 205 pack(bp->b_vp->v_mount->m_fsid[0], bp->b_bufsize), bp->b_blkno); 206 /* 207 * If someone's waiting for the buffer, or 208 * is waiting for a buffer wake 'em up. 209 */ 210 if (bp->b_flags&B_WANTED) 211 wakeup((caddr_t)bp); 212 if (bfreelist[0].b_flags&B_WANTED) { 213 bfreelist[0].b_flags &= ~B_WANTED; 214 wakeup((caddr_t)bfreelist); 215 } 216 if (bp->b_flags & B_NOCACHE) { 217 bp->b_flags |= B_INVAL; 218 } 219 if (bp->b_flags&B_ERROR) 220 if (bp->b_flags & B_LOCKED) 221 bp->b_flags &= ~B_ERROR; /* try again later */ 222 else 223 brelvp(bp); /* no assoc */ 224 225 /* 226 * Stick the buffer back on a free list. 227 */ 228 s = splbio(); 229 if (bp->b_bufsize <= 0) { 230 /* block has no buffer ... put at front of unused buffer list */ 231 flist = &bfreelist[BQ_EMPTY]; 232 binsheadfree(bp, flist); 233 } else if (bp->b_flags & (B_ERROR|B_INVAL)) { 234 /* block has no info ... 
put at front of most free list */ 235 flist = &bfreelist[BQ_AGE]; 236 binsheadfree(bp, flist); 237 } else { 238 if (bp->b_flags & B_LOCKED) 239 flist = &bfreelist[BQ_LOCKED]; 240 else if (bp->b_flags & B_AGE) 241 flist = &bfreelist[BQ_AGE]; 242 else 243 flist = &bfreelist[BQ_LRU]; 244 binstailfree(bp, flist); 245 } 246 bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE|B_NOCACHE); 247 splx(s); 248 } 249 250 /* 251 * See if the block is associated with some buffer 252 * (mainly to avoid getting hung up on a wait in breada) 253 */ 254 incore(vp, blkno) 255 struct vnode *vp; 256 daddr_t blkno; 257 { 258 register struct buf *bp; 259 register struct buf *dp; 260 261 dp = BUFHASH(vp, blkno); 262 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) 263 if (bp->b_blkno == blkno && bp->b_vp == vp && 264 (bp->b_flags & B_INVAL) == 0) 265 return (1); 266 return (0); 267 } 268 269 baddr(vp, blkno, size, bpp) 270 struct vnode *vp; 271 daddr_t blkno; 272 int size; 273 struct buf **bpp; 274 { 275 276 if (incore(vp, blkno)) 277 return (bread(vp, blkno, size, bpp)); 278 *bpp = 0; 279 return (0); 280 } 281 282 /* 283 * Assign a buffer for the given block. If the appropriate 284 * block is already associated, return it; otherwise search 285 * for the oldest non-busy buffer and reassign it. 286 * 287 * If we find the buffer, but it is dirty (marked DELWRI) and 288 * its size is changing, we must write it out first. When the 289 * buffer is shrinking, the write is done by brealloc to avoid 290 * losing the unwritten data. When the buffer is growing, the 291 * write is done by getblk, so that bread will not read stale 292 * disk data over the modified data in the buffer. 293 * 294 * We use splx here because this routine may be called 295 * on the interrupt stack during a dump, and we don't 296 * want to lower the ipl back to 0. 
 *
 * Panics if size exceeds MAXBSIZE.  A buffer found on the hash
 * chain is returned with B_CACHE set.
 */
struct buf *
getblk(vp, blkno, size)
	register struct vnode *vp;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * To prevent overflow of 32-bit ints when converting block
	 * numbers to byte offsets, blknos > 2^32 / DEV_BSIZE are set
	 * to the maximum number that can be converted to a byte offset
	 * without overflow. This is historic code; what bug it fixed,
	 * or whether it is still a reasonable thing to do is open to
	 * dispute. mkm 9/85
	 */
	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-DEV_BSHIFT))
		blkno = 1 << ((sizeof(int)*NBBY-DEV_BSHIFT) + 1);
	/*
	 * Search the cache for the block.  If we hit, but
	 * the buffer is in use for i/o, then we wait until
	 * the i/o has completed.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_vp != vp ||
		    bp->b_flags&B_INVAL)
			continue;
		s = splbio();
		if (bp->b_flags&B_BUSY) {
			/* someone else owns it: sleep, then rescan the chain */
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(bp);
		if (bp->b_bcount != size) {
			/* growing a dirty buffer: write it out first, then retry */
			if (bp->b_bcount < size && (bp->b_flags&B_DELWRI)) {
				bp->b_flags &= ~B_ASYNC;
				(void) bwrite(bp);
				goto loop;
			}
			if (brealloc(bp, size) == 0)
				goto loop;
		}
		/*
		 * NOTE(review): this repeats the size test made just above.
		 * It looks redundant provided a successful brealloc() leaves
		 * b_bcount == size -- confirm against allocbuf().
		 */
		if (bp->b_bcount != size && brealloc(bp, size) == 0)
			goto loop;
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	/* Not cached: take a free buffer and bind it to (vp, blkno). */
	bp = getnewbuf();
	bfree(bp);
	bremhash(bp);
	if (bp->b_vp)
		brelvp(bp);
	VREF(vp);
	bp->b_vp = vp;
	bp->b_dev = vp->v_rdev;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	/* brealloc() returning 0 means start over from the cache search */
	if (brealloc(bp, size) == 0)
		goto loop;
	return (bp);
}

/*
 * get an empty block,
 * not assigned to any particular device
 *
 * Panics if size exceeds MAXBSIZE.  The buffer is marked B_INVAL
 * and hashed onto the bfreelist[BQ_AGE] header rather than a
 * BUFHASH chain.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
loop:
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bfree(bp);
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	brelvp(bp);
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, flist);
	/* brealloc() returned 0: get another buffer and retry */
	if (brealloc(bp, size) == 0)
		goto loop;
	return (bp);
}

/*
 * Allocate space associated with a buffer.
 * If can't get space, buffer is released
 *
 * Returns 1 at once when size already equals b_bcount, 0 after
 * writing out a dirty buffer that was asked to shrink, and
 * otherwise whatever allocbuf() returns.
 */
brealloc(bp, size)
	register struct buf *bp;
	int size;
{
	daddr_t start, last;
	register struct buf *ep;
	struct buf *dp;
	int s;

	/*
	 * First need to make sure that all overlapping previous I/O
	 * is dispatched with.
	 */
	if (size == bp->b_bcount)
		return (1);
	if (size < bp->b_bcount) {
		/* shrinking: dirty data must reach the disk before truncation */
		if (bp->b_flags & B_DELWRI) {
			(void) bwrite(bp);
			return (0);
		}
		if (bp->b_flags & B_LOCKED)
			panic("brealloc");
		return (allocbuf(bp, size));
	}
	/* growing: the buffer contents are no longer complete */
	bp->b_flags &= ~B_DONE;
	if (bp->b_vp == (struct vnode *)0)
		return (allocbuf(bp, size));

	trace(TR_BREALLOC,
	    pack(bp->b_vp->v_mount->m_fsid[0], size), bp->b_blkno);
	/*
	 * Search cache for any buffers that overlap the one that we
	 * are trying to allocate.  Overlapping buffers must be marked
	 * invalid, after being written out if they are dirty. (indicated
	 * by B_DELWRI) A disk block must be mapped by at most one buffer
	 * at any point in time.  Care must be taken to avoid deadlocking
	 * when two buffer are trying to get the same set of disk blocks.
	 */
	start = bp->b_blkno;
	last = start + btodb(size) - 1;
	/* only the hash chain selected by bp's own block number is walked */
	dp = BUFHASH(bp->b_vp, bp->b_blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep == bp || ep->b_vp != bp->b_vp ||
		    (ep->b_flags & B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + btodb(ep->b_bcount) <= start)
			continue;
		s = splbio();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(ep);
		if (ep->b_flags & B_DELWRI) {
			/* flush the dirty overlap, then rescan from the top */
			(void) bwrite(ep);
			goto loop;
		}
		ep->b_flags |= B_INVAL;
		brelse(ep);
	}
	return (allocbuf(bp, size));
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 *
 * Sleeps on bfreelist until a buffer is free.  A dirty buffer
 * is sent off with bawrite() and the search is restarted.
 * The buffer returned has its flags set to exactly B_BUSY.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	int s;

loop:
	s = splbio();
	/* walk from the AGE header down, stopping short of bfreelist[0] */
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO+1);
		splx(s);
		goto loop;
	}
	splx(s);
	bp = dp->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		(void) bawrite(bp);
		goto loop;
	}
	/*
	 * NOTE(review): this dereferences bp->b_vp unconditionally, while
	 * the brelvp() call just below accepts a buffer with no vnode --
	 * confirm that trace() is compiled out or b_vp is always set here.
	 */
	trace(TR_BRELSE,
	    pack(bp->b_vp->v_mount->m_fsid[0], bp->b_bufsize), bp->b_blkno);
	brelvp(bp);
	bp->b_flags = B_BUSY;
	return (bp);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 *
 * Returns 0 when B_ERROR is clear, else b_error, or EIO when
 * b_error is 0.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags&B_DONE)==0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	/*
	 * Pick up the device's error number and pass it to the user;
	 * if there is an error but the number is 0 set a generalized code.
	 */
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}

/*
 * Mark I/O complete on a buffer.
 * If someone should be called, e.g. the pageout
 * daemon, do so.  Otherwise, wake up anyone
 * waiting for it.
 *
 * Panics if the buffer is already marked B_DONE.
 */
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_CALL) {
		/* one-shot completion callback; it takes over from here */
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);		/* nobody is waiting: release it */
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

/*
 * Ensure that no part of a specified block is in an incore buffer.
 * The range examined runs from blkno through blkno + btodb(size) - 1.
 * (A SECSIZE variant of this comment described "size" as being given
 * in device blocks, the units of b_blkno; the code below applies
 * btodb() to it.)
 *
 * Dirty overlapping buffers are written with bwrite().  The value
 * returned is the most recent non-zero bwrite() error, or 0.
 * Only the hash chain selected by BUFHASH(vp, blkno) is searched.
 */
blkflush(vp, blkno, size)
	struct vnode *vp;
	daddr_t blkno;
	long size;
{
	register struct buf *ep;
	struct buf *dp;
	daddr_t start, last;
	int s, error, allerrors = 0;

	start = blkno;
	last = start + btodb(size) - 1;
	dp = BUFHASH(vp, blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep->b_vp != vp || (ep->b_flags & B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + btodb(ep->b_bcount) <= start)
			continue;
		s = splbio();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		if (ep->b_flags & B_DELWRI) {
			splx(s);
			notavail(ep);
			/* deliberate assignment: remember a failing write */
			if (error = bwrite(ep))
				allerrors = error;
			goto loop;
		}
		splx(s);
	}
	return (allerrors);
}

/*
 * Make sure all write-behind blocks associated
 * with dev are flushed out (from sync).
605 */ 606 bflush(dev) 607 dev_t dev; 608 { 609 register struct buf *bp; 610 register struct buf *flist; 611 int s; 612 613 loop: 614 s = splbio(); 615 for (flist = bfreelist; flist < &bfreelist[BQ_EMPTY]; flist++) 616 for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) { 617 if ((bp->b_flags & B_DELWRI) == 0) 618 continue; 619 if (dev == NODEV || dev == bp->b_dev) { 620 notavail(bp); 621 (void) bawrite(bp); 622 splx(s); 623 goto loop; 624 } 625 } 626 splx(s); 627 } 628 629 #ifdef unused 630 /* 631 * Invalidate blocks associated with vp which are on the freelist. 632 * Make sure all write-behind blocks associated with vp are flushed out. 633 */ 634 binvalfree(vp) 635 struct vnode *vp; 636 { 637 register struct buf *bp; 638 register struct buf *flist; 639 int s; 640 641 loop: 642 s = splbio(); 643 for (flist = bfreelist; flist < &bfreelist[BQ_EMPTY]; flist++) 644 for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) { 645 if (vp == (struct vnode *) 0 || vp == bp->b_vp) { 646 if (bp->b_flags & B_DELWRI) { 647 notavail(bp); 648 (void) splx(s); 649 (void) bawrite(bp); 650 } else { 651 bp->b_flags |= B_INVAL; 652 brelvp(bp); 653 (void) splx(s); 654 } 655 goto loop; 656 } 657 } 658 (void) splx(s); 659 } 660 #endif /* unused */ 661 662 /* 663 * Invalidate in core blocks belonging to closed or umounted filesystem 664 * 665 * We walk through the buffer pool and invalidate any buffers for the 666 * indicated device. Normally this routine is preceeded by a bflush 667 * call, so that on a quiescent filesystem there will be no dirty 668 * buffers when we are done. We return the count of dirty buffers when 669 * we are finished. 
670 */ 671 binval(dev) 672 dev_t dev; 673 { 674 register struct buf *bp; 675 register struct bufhd *hp; 676 int dirty = 0; 677 #define dp ((struct buf *)hp) 678 679 for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++) { 680 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) { 681 if (bp->b_dev != dev || (bp->b_flags & B_INVAL)) 682 continue; 683 notavail(bp); 684 if (bp->b_flags & B_DELWRI) { 685 (void) bawrite(bp); 686 dirty++; 687 continue; 688 } 689 bp->b_flags |= B_INVAL; 690 brelvp(bp); 691 brelse(bp); 692 } 693 } 694 return (dirty); 695 } 696 697 brelvp(bp) 698 struct buf *bp; 699 { 700 struct vnode *vp; 701 702 if (bp->b_vp == (struct vnode *) 0) 703 return; 704 vp = bp->b_vp; 705 bp->b_vp = (struct vnode *) 0; 706 vrele(vp); 707 } 708