/*	vfs_cluster.c	4.39	82/11/13	*/

#include "../h/param.h"
#include "../h/systm.h"
#include "../h/dir.h"
#include "../h/user.h"
#include "../h/buf.h"
#include "../h/conf.h"
#include "../h/proc.h"
#include "../h/seg.h"
#include "../h/pte.h"
#include "../h/vm.h"
#include "../h/trace.h"

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	bp = getblk(dev, blkno, size);
	if (bp->b_flags&B_DONE) {
		trace(TR_BREADHIT, dev, blkno);
		return(bp);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	(*bdevsw[major(dev)].d_strategy)(bp);
	trace(TR_BREADMISS, dev, blkno);
	u.u_ru.ru_inblock++;		/* pay for read */
	biowait(bp);
	return(bp);
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev, blkno, size, rablkno, rabsize)
	dev_t dev;
	daddr_t blkno; int size;
	daddr_t rablkno; int rabsize;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	/*
	 * If the block isn't in core, then allocate
	 * a buffer and initiate i/o (getblk checks
	 * for a cache hit).
	 */
	if (!incore(dev, blkno)) {
		bp = getblk(dev, blkno, size);
		if ((bp->b_flags&B_DONE) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breada");
			(*bdevsw[major(dev)].d_strategy)(bp);
			trace(TR_BREADMISS, dev, blkno);
			u.u_ru.ru_inblock++;		/* pay for read */
		} else
			trace(TR_BREADHIT, dev, blkno);
	}

	/*
	 * If there's a read-ahead block, start i/o
	 * on it also (as above).
	 */
	if (rablkno && !incore(dev, rablkno)) {
		rabp = getblk(dev, rablkno, rabsize);
		if (rabp->b_flags & B_DONE) {
			brelse(rabp);
			trace(TR_BREADHITRA, dev, blkno);
		} else {
			rabp->b_flags |= B_READ|B_ASYNC;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			(*bdevsw[major(dev)].d_strategy)(rabp);
			trace(TR_BREADMISSRA, dev, rablkno);
			u.u_ru.ru_inblock++;		/* pay in advance */
		}
	}

	/*
	 * If block was in core, let bread get it.
	 * If block wasn't in core, then the read was started
	 * above, and just wait for it.
	 */
	if (bp == NULL)
		return (bread(dev, blkno, size));
	biowait(bp);
	return (bp);
}
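/*
 * Illustrative only: a minimal sketch of how a file system read path
 * might use the two routines above.  The names lbn, bn, rabn and
 * bsize are hypothetical; mapping a logical block to a device block
 * (e.g. via bmap) happens outside this file.  Whichever path is
 * taken, the buffer must be released (brelse, bdwrite or bwrite)
 * when the caller is finished with it.
 *
 *	bn = device block for lbn;
 *	rabn = device block for lbn + 1, or 0 if there is none;
 *	if (rabn)
 *		bp = breada(dev, bn, bsize, rabn, bsize);
 *	else
 *		bp = bread(dev, bn, bsize);
 *	if (u.u_error == 0)
 *		... copy data out of bp->b_un.b_addr ...
 *	brelse(bp);
 */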
/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
	register struct buf *bp;
{
	register flag;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI | B_AGE);
	if ((flag&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
	trace(TR_BWRITE, bp->b_dev, bp->b_blkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);

	/*
	 * If the write was synchronous, then await i/o completion.
	 * If the write was "delayed", then we put the buffer on
	 * the q of blocks awaiting i/o completion status.
	 * Otherwise, the i/o must be finished and we check for
	 * an error.
	 */
	if ((flag&B_ASYNC) == 0) {
		biowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI)
		bp->b_flags |= B_AGE;
	else
		u.u_error = geterror(bp);
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	register int flags;

	if ((bp->b_flags&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
	flags = bdevsw[major(bp->b_dev)].d_flags;
	if (flags & B_TAPE)
		bawrite(bp);
	else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	bwrite(bp);
}
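/*
 * Illustrative only: the three write interfaces above differ in when
 * the caller learns about errors and how soon the data reaches the
 * device.  A hypothetical caller that has just modified a block might
 * choose among them roughly as follows ("synchronous" and "partial"
 * describe the caller's own policy, not flags defined here):
 *
 *	if (synchronous)
 *		bwrite(bp);	write now, sleep until done, check u.u_error
 *	else if (partial block, more of it expected soon)
 *		bdwrite(bp);	mark B_DELWRI; written when reassigned or flushed
 *	else
 *		bawrite(bp);	start the write now, but don't wait
 *
 * In every case the caller has given up the buffer and must not
 * reference it again without another getblk or bread.
 */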
/*
 * Release the buffer, with no I/O implied.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	register s;

	/*
	 * If someone's waiting for the buffer, or
	 * is waiting for a buffer, wake 'em up.
	 */
	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags&B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	if (bp->b_flags&B_ERROR)
		if (bp->b_flags & B_LOCKED)
			bp->b_flags &= ~B_ERROR;	/* try again later */
		else
			bp->b_dev = NODEV;		/* no assoc */

	/*
	 * Stick the buffer back on a free list.
	 */
	s = spl6();
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bfreelist[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR|B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
	splx(s);
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
 */
incore(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(dev, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_blkno == blkno && bp->b_dev == dev &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

/*
 * Read in the block, as in bread, but only if it is already
 * in core; otherwise return 0 without doing any I/O.
 */
struct buf *
baddr(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{

	if (incore(dev, blkno))
		return (bread(dev, blkno, size));
	return (0);
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * We use splx here because this routine may be called
 * on the interrupt stack during a dump, and we don't
 * want to lower the ipl back to 0.
 */
struct buf *
getblk(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-PGSHIFT))
		blkno = 1 << ((sizeof(int)*NBBY-PGSHIFT) + 1);
	/*
	 * Search the cache for the block.  If we hit, but
	 * the buffer is in use for i/o, then we wait until
	 * the i/o has completed.
	 */
	dp = BUFHASH(dev, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_dev != dev ||
		    bp->b_flags&B_INVAL)
			continue;
		s = spl6();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(bp);
		if (brealloc(bp, size) == 0)
			goto loop;
		bp->b_flags |= B_CACHE;
		return(bp);
	}
	if (major(dev) >= nblkdev)
		panic("blkdev");
	bp = getnewbuf();
	bfree(bp);
	bremhash(bp);
	binshash(bp, dp);
	bp->b_dev = dev;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	if (brealloc(bp, size) == 0)
		goto loop;
	return(bp);
}

/*
 * Get an empty block,
 * not assigned to any particular device.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

loop:
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bfree(bp);
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	binshash(bp, flist);
	bp->b_dev = (dev_t)NODEV;
	bp->b_error = 0;
	if (brealloc(bp, size) == 0)
		goto loop;
	return(bp);
}

/*
 * Allocate space associated with a buffer.
 */
brealloc(bp, size)
	register struct buf *bp;
	int size;
{
	daddr_t start, last;
	register struct buf *ep;
	struct buf *dp;
	int s;

	/*
	 * First we need to make sure that any overlapping previous I/O
	 * is disposed of.
	 */
	if (size == bp->b_bcount)
		return (1);
	if (size < bp->b_bcount) {
		if (bp->b_flags & B_DELWRI) {
			bwrite(bp);
			return (0);
		}
		if (bp->b_flags & B_LOCKED)
			panic("brealloc");
		allocbuf(bp, size);
		return (1);
	}
	bp->b_flags &= ~B_DONE;
	if (bp->b_dev == NODEV) {
		allocbuf(bp, size);
		return (1);
	}

	/*
	 * Search the cache for any buffers that overlap the one that we
	 * are trying to allocate.  Overlapping buffers must be marked
	 * invalid, after being written out if they are dirty (indicated
	 * by B_DELWRI).  A disk block must be mapped by at most one buffer
	 * at any point in time.  Care must be taken to avoid deadlocking
	 * when two buffers are trying to get the same set of disk blocks.
	 */
	start = bp->b_blkno;
	last = start + (size / DEV_BSIZE) - 1;
	dp = BUFHASH(bp->b_dev, bp->b_blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep == bp || ep->b_dev != bp->b_dev || (ep->b_flags&B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + (ep->b_bcount / DEV_BSIZE) <= start)
			continue;
		s = spl6();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(ep);
		if (ep->b_flags & B_DELWRI) {
			bwrite(ep);
			goto loop;
		}
		ep->b_flags |= B_INVAL;
		brelse(ep);
	}
	allocbuf(bp, size);
	return (1);
}
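/*
 * Illustrative only: a worked example of the overlap test above,
 * with made-up numbers.  If bp covers disk blocks 100..107 (start
 * = 100, size = 8*DEV_BSIZE, so last = 107), then a cached buffer
 * ep with ep->b_blkno = 104 and ep->b_bcount = 4*DEV_BSIZE overlaps:
 * 104 <= last and 104 + 4 > start.  A buffer starting at block 108
 * is skipped, since 108 > last; one covering blocks 96..99 is also
 * skipped, since its b_blkno + b_bcount/DEV_BSIZE (100) <= start.
 */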
/*
 * Expand or contract the actual memory allocated to a buffer.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take;

	sizealloc = roundup(size, CLBYTES);
	/*
	 * Buffer size does not change.
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		ep = bfreelist[BQ_EMPTY].av_forw;
		if (ep == &bfreelist[BQ_EMPTY])
			goto out;
		notavail(ep);
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
			(int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed.  Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		bp = getnewbuf();
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
			&tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		if (bp->b_bufsize <= 0) {
			bremhash(bp);
			binshash(bp, &bfreelist[BQ_EMPTY]);
			bp->b_dev = (dev_t)NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
}

/*
 * Release space associated with a buffer.
 */
bfree(bp)
	struct buf *bp;
{
	/*
	 * This stub is provided to allow the system to reclaim
	 * memory from the buffer pool.  Currently we do not migrate
	 * memory between the buffer memory pool and the user memory
	 * pool.
	 */
	bp->b_bcount = 0;
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	int s;

loop:
	s = spl6();
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO+1);
		goto loop;
	}
	splx(s);
	bp = dp->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, bp->b_dev, bp->b_blkno);
	bp->b_flags = B_BUSY;
	return (bp);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = spl6();
	while ((bp->b_flags&B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	u.u_error = geterror(bp);
}
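/*
 * Illustrative only: biowait pairs with biodone below.  A block
 * device driver's interrupt routine is expected to note any failure
 * in the buffer header and then call biodone, roughly:
 *
 *	if (the transfer failed)
 *		bp->b_flags |= B_ERROR;	(b_error is usually left 0;
 *					 see geterror below)
 *	biodone(bp);
 *
 * For a synchronous request the initiating process sleeps in biowait
 * until B_DONE is set; for an asynchronous one (B_ASYNC) biodone
 * releases the buffer itself and nobody waits.
 */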
/*
 * Mark I/O complete on a buffer.  If the header
 * indicates a dirty page push completion, the
 * header is inserted into the ``cleaned'' list
 * to be processed by the pageout daemon.  Otherwise
 * release it if I/O is asynchronous, and wake
 * up anyone waiting for it.
 */
biodone(bp)
	register struct buf *bp;
{
	register int s;

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_DIRTY) {
		if (bp->b_flags & B_ERROR)
			panic("IO err in push");
		s = spl6();
		bp->av_forw = bclnlist;
		bp->b_bcount = swsize[bp - swbuf];
		bp->b_pfcent = swpf[bp - swbuf];
		cnt.v_pgout++;
		cnt.v_pgpgout += bp->b_bcount / NBPG;
		bclnlist = bp;
		if (bswlist.b_flags & B_WANTED)
			wakeup((caddr_t)&proc[2]);
		splx(s);
		return;
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

/*
 * Ensure that no part of a specified block is in an incore buffer.
 */
blkflush(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	long size;
{
	register struct buf *ep;
	struct buf *dp;
	daddr_t start, last;
	int s;

	start = blkno;
	last = start + (size / DEV_BSIZE) - 1;
	dp = BUFHASH(dev, blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep->b_dev != dev || (ep->b_flags&B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + (ep->b_bcount / DEV_BSIZE) <= start)
			continue;
		s = spl6();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		if (ep->b_flags & B_DELWRI) {
			splx(s);
			notavail(ep);
			bwrite(ep);
			goto loop;
		}
		splx(s);
	}
}

/*
 * Make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (from umount and update)
 * (and temporarily pagein)
 */
bflush(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct buf *flist;
	int s;

loop:
	s = spl6();
	for (flist = bfreelist; flist < &bfreelist[BQ_EMPTY]; flist++)
		for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
			if ((bp->b_flags & B_DELWRI) == 0)
				continue;
			if (dev == NODEV || dev == bp->b_dev) {
				bp->b_flags |= B_ASYNC;
				notavail(bp);
				bwrite(bp);
				goto loop;
			}
		}
	splx(s);
}

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0, set a generalized
 * code.  Actually the latter is always true because devices
 * don't yet return specific errors.
 */
geterror(bp)
	register struct buf *bp;
{
	int error = 0;

	if (bp->b_flags&B_ERROR)
		if ((error = bp->b_error) == 0)
			return (EIO);
	return (error);
}

/*
 * Invalidate in-core blocks belonging to a closed or unmounted filesystem.
 *
 * This is not nicely done at all - the buffer ought to be removed from the
 * hash chains & have its dev/blkno fields clobbered, but unfortunately we
 * can't do that here, as it is quite possible that the block is still
 * being used for i/o.  Eventually, all disc drivers should be forced to
 * have a close routine, which ought to ensure that the queue is empty, then
 * properly flush the queues.  Until that happy day, this suffices for
 * correctness.						... kre
 */
binval(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct bufhd *hp;
#define dp ((struct buf *)hp)

	for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++)
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
			if (bp->b_dev == dev)
				bp->b_flags |= B_INVAL;
}
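/*
 * Illustrative only: a caller taking a file system off line would
 * typically use the last two routines together, first pushing out
 * any delayed writes for the device and then invalidating whatever
 * is left in the cache:
 *
 *	bflush(dev);
 *	binval(dev);
 *
 * The periodic update path can instead pass NODEV to bflush so that
 * delayed writes for every device are pushed, without invalidating
 * anything.
 */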