/*	vfs_bio.c	3.4	10/14/12	*/

#include "../h/param.h"
#include "../h/systm.h"
#include "../h/dir.h"
#include "../h/user.h"
#include "../h/buf.h"
#include "../h/conf.h"
#include "../h/proc.h"
#include "../h/seg.h"
#include "../h/pte.h"
#include "../h/vm.h"

/*
 * The following several routines allocate and free
 * buffers with various side effects.  In general the
 * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer to
 * the buffer header; the buffer is marked "busy"
 * so that no one else can touch it.  If the block was
 * already in core, no I/O need be done; if it is
 * already busy, the process waits until it becomes free.
 * The following routines allocate a buffer:
 *	getblk
 *	bread
 *	breada
 *	baddr	(if it is incore)
 * Eventually the buffer must be released, possibly with the
 * side effect of writing it out, by using one of
 *	bwrite
 *	bdwrite
 *	bawrite
 *	brelse
 */

/*
 * Device/block hash table.  bufhash[h] is the index into buf[] of the
 * first buffer on chain h; chains are linked through b_hlink and are
 * terminated by -1 (hence the "!= &buf[-1]" end-of-chain tests below).
 */
#define	BUFHSZ	63
#define	BUFHASH(blkno)	(blkno % BUFHSZ)
short	bufhash[BUFHSZ];

/*
 * Initialize hash links for buffers: every chain starts out empty.
 */
bhinit()
{
	register int i;

	for (i = 0; i < BUFHSZ; i++)
		bufhash[i] = -1;
}

/* #define	DISKMON	1 */

#ifdef	DISKMON
/* Optional buffer-cache instrumentation, enabled by defining DISKMON. */
struct {
	int	nbuf;
	long	nread;
	long	nreada;
	long	ncache;
	long	nwrite;
	long	bufcount[NBUF];
} io_info;
#endif

/*
 * Swap IO headers -
 * They contain the necessary information for the swap I/O.
 * At any given time, a swap header can be in three
 * different lists.  When free it is in the free list,
 * when allocated and the I/O queued, it is on the swap
 * device list, and finally, if the operation was a dirty
 * page push, when the I/O completes, it is inserted
 * in a list of cleaned pages to be processed by the pageout daemon.
 */
struct	buf swbuf[NSWBUF];
short	swsize[NSWBUF];		/* CAN WE JUST USE B_BCOUNT?
 */
int	swpf[NSWBUF];


#ifdef	FASTVAX
/*
 * Fast (macro) form of notavail(): unlink the buffer from its
 * available list and mark it busy, at spl6 so interrupt-level
 * code cannot see the list mid-update.  A function version for
 * the non-FASTVAX case appears later in this file.
 */
#define	notavail(bp) \
{ \
	int s = spl6(); \
	(bp)->av_back->av_forw = (bp)->av_forw; \
	(bp)->av_forw->av_back = (bp)->av_back; \
	(bp)->b_flags |= B_BUSY; \
	splx(s); \
}
#endif

/*
 * Read in (if necessary) the block and return a buffer pointer.
 * The buffer is returned busy; if B_DONE was already set by a
 * previous I/O the block is in core and no read is issued.
 */
struct buf *
bread(dev, blkno)
dev_t dev;
daddr_t blkno;
{
	register struct buf *bp;

	bp = getblk(dev, blkno);
	if (bp->b_flags&B_DONE) {
#ifdef	DISKMON
		io_info.ncache++;
#endif
		return(bp);
	}
	bp->b_flags |= B_READ;
	bp->b_bcount = BSIZE;
	(*bdevsw[major(dev)].d_strategy)(bp);
#ifdef	DISKMON
	io_info.nread++;
#endif
	u.u_vm.vm_inblk++;		/* pay for read */
	iowait(bp);
	return(bp);
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 * The read-ahead buffer is started B_ASYNC and released at
 * once if it is already in core.
 */
struct buf *
breada(dev, blkno, rablkno)
dev_t dev;
daddr_t blkno, rablkno;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	/* Start the main read only if the block is not already cached. */
	if (!incore(dev, blkno)) {
		bp = getblk(dev, blkno);
		if ((bp->b_flags&B_DONE) == 0) {
			bp->b_flags |= B_READ;
			bp->b_bcount = BSIZE;
			(*bdevsw[major(dev)].d_strategy)(bp);
#ifdef	DISKMON
			io_info.nread++;
#endif
			u.u_vm.vm_inblk++;		/* pay for read */
		}
	}
	/* Fire off the (asynchronous) read-ahead; rablkno == 0 disables it. */
	if (rablkno && !incore(dev, rablkno)) {
		rabp = getblk(dev, rablkno);
		if (rabp->b_flags & B_DONE)
			brelse(rabp);
		else {
			rabp->b_flags |= B_READ|B_ASYNC;
			rabp->b_bcount = BSIZE;
			(*bdevsw[major(dev)].d_strategy)(rabp);
#ifdef	DISKMON
			io_info.nreada++;
#endif
			u.u_vm.vm_inblk++;		/* pay in advance */
		}
	}
	/* If the main block was in core, fall back on bread to fetch it. */
	if(bp == NULL)
		return(bread(dev, blkno));
	iowait(bp);
	return(bp);
}

/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
register struct buf *bp;
{
	register flag;

	/* Snapshot the flags before clearing: the async/delayed-write
	 * decisions below must see the state at entry. */
	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI | B_AGE);
	bp->b_bcount = BSIZE;
#ifdef	DISKMON
	io_info.nwrite++;
#endif
	if ((flag&B_DELWRI) == 0)
		u.u_vm.vm_oublk++;		/* no one paid yet */
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);
	if ((flag&B_ASYNC) == 0) {
		/* Synchronous: wait for the I/O and free the buffer. */
		iowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI)
		/* Pushed-out delayed write: age it for quick reuse. */
		bp->b_flags |= B_AGE;
	else
		geterror(bp);
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
register struct buf *bp;
{
	register struct buf *dp;

	if ((bp->b_flags&B_DELWRI) == 0)
		u.u_vm.vm_oublk++;		/* no one paid yet */
	/* Tape devices cannot reorder writes; write through instead. */
	dp = bdevsw[major(bp->b_dev)].d_tab;
	if(dp->b_flags & B_TAPE)
		bawrite(bp);
	else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	bwrite(bp);
}

/*
 * Release the buffer, with no I/O implied.
 * Wakes anyone sleeping on this buffer or on the free list,
 * then requeues the buffer: aged or errored buffers go to the
 * head of the free list (reused first), others to the tail (LRU).
 */
brelse(bp)
register struct buf *bp;
{
	register struct buf **backp;
	register s;

	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist.b_flags&B_WANTED) {
		bfreelist.b_flags &= ~B_WANTED;
		wakeup((caddr_t)&bfreelist);
	}
	/* An errored buffer's contents are invalid: drop its
	 * device association so it cannot be found by incore/getblk. */
	if ((bp->b_flags&B_ERROR) && bp->b_dev != NODEV) {
		bunhash(bp);
		bp->b_dev = NODEV;  /* no assoc.
on error */
	}
	/* Free-list insertion must be atomic w.r.t. interrupt-level
	 * iodone()/brelse() activity, hence spl6. */
	s = spl6();
	if(bp->b_flags & (B_AGE|B_ERROR)) {
		/* Head of free list: this buffer will be reused first. */
		backp = &bfreelist.av_forw;
		(*backp)->av_back = bp;
		bp->av_forw = *backp;
		*backp = bp;
		bp->av_back = &bfreelist;
	} else {
		/* Tail of free list: normal LRU aging. */
		backp = &bfreelist.av_back;
		(*backp)->av_forw = bp;
		bp->av_back = *backp;
		*backp = bp;
		bp->av_forw = &bfreelist;
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
	splx(s);
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
 * Walks the hash chain for blkno; note that buffers store the
 * device block number (fsbtodb), so the comparison converts first.
 */
incore(dev, blkno)
dev_t dev;
daddr_t blkno;
{
	register struct buf *bp;
	register int dblkno = fsbtodb(blkno);

	for (bp = &buf[bufhash[BUFHASH(blkno)]]; bp != &buf[-1];
	    bp = &buf[bp->b_hlink])
		if (bp->b_blkno == dblkno && bp->b_dev == dev)
			return (1);
	return (0);
}

/*
 * Return the buffer for the given block if (and only if) it is
 * already in core; otherwise return 0 without doing any I/O.
 */
struct buf *
baddr(dev, blkno)
dev_t dev;
daddr_t blkno;
{

	if (incore(dev, blkno))
		return (bread(dev, blkno));
	return (0);
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk(dev, blkno)
dev_t dev;
daddr_t blkno;
{
	register struct buf *bp, *dp, *ep;
	register int i, x;
	register int dblkno = fsbtodb(blkno);

    loop:
	(void) spl0();
	/* First look for the block on its hash chain. */
	for (bp = &buf[bufhash[BUFHASH(blkno)]]; bp != &buf[-1];
	    bp = &buf[bp->b_hlink]) {
		if (bp->b_blkno != dblkno || bp->b_dev != dev)
			continue;
		/* Found it; the busy check must be made at spl6 so that
		 * an interrupt-time release cannot slip in between. */
		(void) spl6();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			goto loop;	/* state changed; start over */
		}
		(void) spl0();
#ifdef	DISKMON
		/* Record how deep this buffer sat in the free list. */
		i = 0;
		dp = bp->av_forw;
		while (dp != &bfreelist) {
			i++;
			dp = dp->av_forw;
		}
		if (i<NBUF)
			io_info.bufcount[i]++;
#endif
		notavail(bp);
		bp->b_flags |= B_CACHE;
		return(bp);
	}
	/* Not in core: steal the oldest free buffer and reassign it. */
	if (major(dev) >= nblkdev)
		panic("blkdev");
	dp = bdevsw[major(dev)].d_tab;
	if (dp == NULL)
		panic("devtab");
	(void) spl6();
	if (bfreelist.av_forw == &bfreelist) {
		/* No free buffers: wait for brelse to wake us. */
		bfreelist.b_flags |= B_WANTED;
		sleep((caddr_t)&bfreelist, PRIBIO+1);
		goto loop;
	}
	spl0();
	bp = bfreelist.av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		/* Delayed write pending: push it out and retry. */
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	if (bp->b_dev == NODEV)
		goto done;	/* never hashed; nothing to unlink */
	/* INLINE EXPANSION OF bunhash(bp) */
	i = BUFHASH(dbtofsb(bp->b_blkno));
	x = bp - buf;
	if (bufhash[i] == x) {
		bufhash[i] = bp->b_hlink;
	} else {
		for (ep = &buf[bufhash[i]]; ep != &buf[-1];
		    ep = &buf[ep->b_hlink])
			if (ep->b_hlink == x) {
				ep->b_hlink = bp->b_hlink;
				goto done;
			}
		panic("getblk");	/* buffer not on its chain */
	}
done:
	/* END INLINE EXPANSION */
	/* Move the buffer to the new device's b_forw/b_back queue
	 * and enter it on the hash chain for the new block. */
	bp->b_flags = B_BUSY;
	bp->b_back->b_forw = bp->b_forw;
	bp->b_forw->b_back = bp->b_back;
	bp->b_forw = dp->b_forw;
	bp->b_back = dp;
	dp->b_forw->b_back = bp;
	dp->b_forw = bp;
	bp->b_dev = dev;
	bp->b_blkno = dblkno;
	i = BUFHASH(blkno);
	bp->b_hlink = bufhash[i];
	bufhash[i] = bp - buf;
	return(bp);
}

/*
 * Get an empty block,
 * not assigned to any particular device.
 * The buffer comes back busy, with b_dev == NODEV and off
 * the hash chains, so getblk/incore can never find it.
 */
struct buf *
geteblk()
{
	register struct buf *bp, *dp, *ep;
	register int i, x;

loop:
	(void) spl6();
	while (bfreelist.av_forw == &bfreelist) {
		bfreelist.b_flags |= B_WANTED;
		sleep((caddr_t)&bfreelist, PRIBIO+1);
	}
	(void) spl0();
	dp = &bfreelist;
	bp = bfreelist.av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		/* Delayed write pending: push it out and retry. */
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	if (bp->b_dev != NODEV)
		bunhash(bp);
	bp->b_flags = B_BUSY;
	bp->b_back->b_forw = bp->b_forw;
	bp->b_forw->b_back = bp->b_back;
	bp->b_forw = dp->b_forw;
	bp->b_back = dp;
	dp->b_forw->b_back = bp;
	dp->b_forw = bp;
	bp->b_dev = (dev_t)NODEV;
	bp->b_hlink = -1;
	return(bp);
}

/*
 * Remove a buffer from its hash chain (the chain is keyed on the
 * filesystem block number, so b_blkno is converted back with
 * dbtofsb).  Panics if the buffer claims a device but is not
 * actually on its chain.
 */
bunhash(bp)
register struct buf *bp;
{
	register struct buf *ep;
	register int i, x;

	if (bp->b_dev == NODEV)
		return;
	i = BUFHASH(dbtofsb(bp->b_blkno));
	x = bp - buf;
	if (bufhash[i] == x) {
		bufhash[i] = bp->b_hlink;
		return;
	}
	for (ep = &buf[bufhash[i]]; ep != &buf[-1];
	    ep = &buf[ep->b_hlink])
		if (ep->b_hlink == x) {
			ep->b_hlink = bp->b_hlink;
			return;
		}
	panic("bunhash");
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
iowait(bp)
register struct buf *bp;
{

	(void) spl6();
	while ((bp->b_flags&B_DONE)==0)
		sleep((caddr_t)bp, PRIBIO);
	(void) spl0();
	geterror(bp);
}

#ifndef FASTVAX
/*
 * Unlink a buffer from the available list and mark it busy.
 * (internal interface; the FASTVAX build uses the macro version
 * defined earlier in this file instead)
 */
notavail(bp)
register struct buf *bp;
{
	register s;

	s = spl6();
	bp->av_back->av_forw = bp->av_forw;
	bp->av_forw->av_back = bp->av_back;
	bp->b_flags |= B_BUSY;
	splx(s);
}
#endif

/*
 * Mark I/O complete on a buffer.  If the header
 * indicates a dirty page push completion, the
 * header is inserted into the ``cleaned'' list
 * to be processed by the pageout daemon.  Otherwise
 * release it if I/O is asynchronous, and wake
 * up anyone waiting for it.
 */
iodone(bp)
register struct buf *bp;
{
	register int s;

	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_DIRTY) {
		if (bp->b_flags & B_ERROR)
			panic("IO err in push");
		s = spl6();
		cnt.v_pgout++;
		/* Restore the size/page-frame info squirreled away by
		 * swap() and chain the header onto the cleaned list. */
		bp->av_forw = bclnlist;
		bp->b_bcount = swsize[bp - swbuf];
		bp->b_pfcent = swpf[bp - swbuf];
		bclnlist = bp;
		/* NOTE(review): waiters for swap headers sleep on
		 * &bswlist, but this path wakes &proc[2]; the swap()
		 * epilogue wakes both channels — confirm intended. */
		if (bswlist.b_flags & B_WANTED)
			wakeup((caddr_t)&proc[2]);
		splx(s);
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

/*
 * Zero the core associated with a buffer:
 * BSIZE bytes cleared one int at a time.
 */
clrbuf(bp)
struct buf *bp;
{
	register *p;
	register c;

	p = bp->b_un.b_words;
	c = BSIZE/sizeof(int);
	do
		*p++ = 0;
	while (--c);
	bp->b_resid = 0;
}

/*
 * swap I/O -
 *
 * If the flag indicates a dirty page push initiated
 * by the pageout daemon, we map the page into the i th
 * virtual page of process 2 (the daemon itself) where i is
 * the index of the swap header that has been allocated.
 * We simply initialize the header and queue the I/O but
 * do not wait for completion.  When the I/O completes,
 * iodone() will link the header to a list of cleaned
 * pages to be processed by the pageout daemon.
 */
swap(p, dblkno, addr, nbytes, rdflg, flag, dev, pfcent)
	struct proc *p;
	swblk_t dblkno;
	caddr_t addr;
	int flag, nbytes;
	dev_t dev;
	unsigned pfcent;
{
	register struct buf *bp;
	register int c;
	int p2dp;
	register struct pte *dpte, *vpte;

	/* Allocate a swap I/O header; sleep on &bswlist if none free. */
	(void) spl6();
	while (bswlist.av_forw == NULL) {
		bswlist.b_flags |= B_WANTED;
		sleep((caddr_t)&bswlist, PSWP+1);
	}
	bp = bswlist.av_forw;
	bswlist.av_forw = bp->av_forw;
	(void) spl0();

	bp->b_flags = B_BUSY | B_PHYS | rdflg | flag;
	if ((bp->b_flags & (B_DIRTY|B_PGIN)) == 0)
		if (rdflg == B_READ)
			sum.v_pswpin += btoc(nbytes);
		else
			sum.v_pswpout += btoc(nbytes);
	bp->b_proc = p;
	if (flag & B_DIRTY) {
		/* Dirty page push: double-map the pages into process 2's
		 * address space, at a slot chosen by this header's index. */
		p2dp = ((bp - swbuf) * CLSIZE) * KLMAX;
		dpte = dptopte(&proc[2], p2dp);
		vpte = vtopte(p, btop(addr));
		for (c = 0; c < nbytes; c += NBPG) {
			if (vpte->pg_pfnum == 0 || vpte->pg_fod)
				panic("swap bad pte");
			*dpte++ = *vpte++;
		}
		bp->b_un.b_addr = (caddr_t)ctob(p2dp);
	} else
		bp->b_un.b_addr = addr;
	while (nbytes > 0) {
		/* Transfer in chunks of at most 120 clicks. */
		c = imin(ctob(120), nbytes);
		bp->b_bcount = c;
		bp->b_blkno = dblkno;
		bp->b_dev = dev;
		if (dev == swapdev)
			bp->b_blkno += swplo;
		(*bdevsw[major(dev)].d_strategy)(bp);
		if (flag & B_DIRTY) {
			/* Dirty push: don't wait.  Stash size and page-frame
			 * info for iodone(), which will queue the header on
			 * the cleaned list when the write finishes. */
			if (c < nbytes)
				panic("big push");
			swsize[bp - swbuf] = nbytes;
			swpf[bp - swbuf] = pfcent;
			return;
		}
		(void) spl6();
		while((bp->b_flags&B_DONE)==0)
			sleep((caddr_t)bp, PSWP);
		(void) spl0();
		bp->b_un.b_addr += c;
		bp->b_flags &= ~B_DONE;
		if (bp->b_flags & B_ERROR) {
			/* Unrecoverable for u-area/page-table or write errors;
			 * otherwise kill the process and press on. */
			if ((flag & (B_UAREA|B_PAGET)) || rdflg == B_WRITE)
				panic("hard IO err in swap");
			swkill(p, (char *)0);
		}
		nbytes -= c;
		dblkno += btoc(c);
	}
	/* Done: return the header to the free list and wake waiters. */
	(void) spl6();
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY);
	bp->av_forw = bswlist.av_forw;
	bswlist.av_forw = bp;
	if (bswlist.b_flags &
B_WANTED) {
		bswlist.b_flags &= ~B_WANTED;
		wakeup((caddr_t)&bswlist);
		wakeup((caddr_t)&proc[2]);
	}
	(void) spl0();
}

/*
 * If rout == 0 then killed on swap error, else
 * rout is the name of the routine where we ran out of
 * swap space.
 */
swkill(p, rout)
	struct proc *p;
	char *rout;
{

	printf("%d: ", p->p_pid);
	if (rout)
		printf("out of swap space in %s\n", rout);
	else
		printf("killed on swap error\n");
	/*
	 * To be sure no looping (e.g. in vmsched trying to
	 * swap out) mark process locked in core (as though
	 * done by user) after killing it so no one will try
	 * to swap it out.
	 */
	psignal(p, SIGKIL);
	p->p_flag |= SULOCK;
}

/*
 * Make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (from umount and update)
 * Each bwrite may sleep and change the free list, so the scan
 * restarts from the top after every write it issues.
 */
bflush(dev)
dev_t dev;
{
	register struct buf *bp;

loop:
	(void) spl6();
	for (bp = bfreelist.av_forw; bp != &bfreelist; bp = bp->av_forw) {
		if (bp->b_flags&B_DELWRI && (dev == NODEV||dev==bp->b_dev)) {
			bp->b_flags |= B_ASYNC;
			notavail(bp);
			bwrite(bp);
			goto loop;
		}
	}
	(void) spl0();
}

/*
 * Raw I/O.  The arguments are
 *	The strategy routine for the device
 *	A buffer, which will always be a special buffer
 *	  header owned exclusively by the device for this purpose
 *	The device number
 *	Read/write flag
 * Essentially all the work is computing physical addresses and
 * validating them.
 * If the user has the proper access privileges, the process is
 * marked 'delayed unlock' and the pages involved in the I/O are
 * faulted and locked.  After the completion of the I/O, the above pages
 * are unlocked.
 */
physio(strat, bp, dev, rw, mincnt)
int (*strat)();
register struct buf *bp;
unsigned (*mincnt)();
{
	register int c;
	char *a;

	/* Validate the whole user transfer area up front; note the
	 * access direction is inverted (a device read writes memory). */
	if (useracc(u.u_base,u.u_count,rw==B_READ?B_WRITE:B_READ) == NULL) {
		u.u_error = EFAULT;
		return;
	}
	/* Serialize on the (dedicated) raw buffer header. */
	(void) spl6();
	while (bp->b_flags&B_BUSY) {
		bp->b_flags |= B_WANTED;
		sleep((caddr_t)bp, PRIBIO+1);
	}
	bp->b_error = 0;
	bp->b_proc = u.u_procp;
	bp->b_un.b_addr = u.u_base;
	while (u.u_count != 0 && bp->b_error==0) {
		bp->b_flags = B_BUSY | B_PHYS | rw;
		bp->b_dev = dev;
		bp->b_blkno = u.u_offset >> PGSHIFT;
		bp->b_bcount = u.u_count;
		/* Let the device clamp the transfer size (e.g. minphys). */
		(*mincnt)(bp);
		c = bp->b_bcount;
		/* Lock the user pages in core for the duration of the I/O. */
		u.u_procp->p_flag |= SPHYSIO;
		vslock(a = bp->b_un.b_addr, c);
		(*strat)(bp);
		(void) spl6();
		while ((bp->b_flags&B_DONE) == 0)
			sleep((caddr_t)bp, PRIBIO);
		vsunlock(a, c, rw);
		u.u_procp->p_flag &= ~SPHYSIO;
		if (bp->b_flags&B_WANTED)
			wakeup((caddr_t)bp);
		(void) spl0();
		bp->b_un.b_addr += c;
		u.u_count -= c;
		u.u_offset += c;
	}
	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
	u.u_count = bp->b_resid;
	geterror(bp);
}

/*
 * Default transfer-size clamp for physio: limit a raw transfer
 * to 60 Kbytes.
 */
/*ARGSUSED*/
unsigned
minphys(bp)
struct buf *bp;
{

	if (bp->b_bcount > 60 * 1024)
		bp->b_bcount = 60 * 1024;
}

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized
 * code.  Actually the latter is always true because devices
 * don't yet return specific errors.
 */
geterror(bp)
register struct buf *bp;
{

	if (bp->b_flags&B_ERROR)
		if ((u.u_error = bp->b_error)==0)
			u.u_error = EIO;
}