/*
 * Copyright (c) 1982, 1986 Regents of the University of California.
 * All rights reserved.  The Berkeley software License Agreement
 * specifies the terms and conditions for redistribution.
 *
 *	@(#)vfs_bio.c	7.3 (Berkeley) 11/12/87
 */

#include "../machine/pte.h"

#include "param.h"
#include "systm.h"
#include "dir.h"
#include "user.h"
#include "buf.h"
#include "conf.h"
#include "proc.h"
#include "seg.h"
#include "vm.h"
#include "trace.h"

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	bp = getblk(dev, blkno, size);
	if (bp->b_flags&(B_DONE|B_DELWRI)) {
		trace(TR_BREADHIT, pack(dev, size), blkno);
		return (bp);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	(*bdevsw[major(dev)].d_strategy)(bp);
	trace(TR_BREADMISS, pack(dev, size), blkno);
	u.u_ru.ru_inblock++;		/* pay for read */
	biowait(bp);
	return (bp);
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev, blkno, size, rablkno, rabsize)
	dev_t dev;
	daddr_t blkno; int size;
	daddr_t rablkno; int rabsize;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	/*
	 * If the block isn't in core, then allocate
	 * a buffer and initiate i/o (getblk checks
	 * for a cache hit).
	 */
	if (!incore(dev, blkno)) {
		bp = getblk(dev, blkno, size);
		if ((bp->b_flags&(B_DONE|B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breada");
			(*bdevsw[major(dev)].d_strategy)(bp);
			trace(TR_BREADMISS, pack(dev, size), blkno);
			u.u_ru.ru_inblock++;	/* pay for read */
		} else
			trace(TR_BREADHIT, pack(dev, size), blkno);
	}

	/*
	 * If there's a read-ahead block, start i/o
	 * on it also (as above).
	 */
	if (rablkno && !incore(dev, rablkno)) {
		rabp = getblk(dev, rablkno, rabsize);
		if (rabp->b_flags & (B_DONE|B_DELWRI)) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(dev, rabsize), rablkno);
		} else {
			rabp->b_flags |= B_READ|B_ASYNC;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			(*bdevsw[major(dev)].d_strategy)(rabp);
			trace(TR_BREADMISSRA, pack(dev, rabsize), rablkno);
			u.u_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If block was in core, let bread get it.
	 * If block wasn't in core, then the read was started
	 * above, and just wait for it.
	 */
	if (bp == NULL)
		return (bread(dev, blkno, size));
	biowait(bp);
	return (bp);
}
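/*
 * Illustrative sketch (not part of the original source): a typical
 * file system caller issues breada with the next logical block as the
 * read-ahead hint, copies out of the buffer, and then releases it.
 * The name "ex_read" and the fixed 1024-byte block size are
 * hypothetical, chosen only for the example.
 */
#ifdef notdef
ex_read(dev, bn, dst)
	dev_t dev;
	daddr_t bn;
	caddr_t dst;
{
	register struct buf *bp;

	/* start i/o on bn, and read-ahead on the block after it */
	bp = breada(dev, bn, 1024, bn + btodb(1024), 1024);
	if ((bp->b_flags & B_ERROR) == 0)
		bcopy(bp->b_un.b_addr, dst, 1024);
	brelse(bp);		/* buffer stays cached for re-use */
}
#endif notdef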
/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
	register struct buf *bp;
{
	register flag;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if ((flag&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
	trace(TR_BWRITE, pack(bp->b_dev, bp->b_bcount), bp->b_blkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);

	/*
	 * If the write was synchronous, then await i/o completion.
	 * If the write was "delayed", then we put the buffer on
	 * the queue of blocks awaiting i/o completion status.
	 */
	if ((flag&B_ASYNC) == 0) {
		biowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI)
		bp->b_flags |= B_AGE;
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
	register struct buf *bp;
{

	if ((bp->b_flags&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
	if (bdevsw[major(bp->b_dev)].d_flags & B_TAPE)
		bawrite(bp);
	else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	bwrite(bp);
}

/*
 * Release the buffer, with no I/O implied.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	register s;

	trace(TR_BRELSE, pack(bp->b_dev, bp->b_bufsize), bp->b_blkno);
	/*
	 * If someone's waiting for the buffer, or
	 * is waiting for a buffer, wake 'em up.
	 */
	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags&B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	if (bp->b_flags&B_ERROR)
		if (bp->b_flags & B_LOCKED)
			bp->b_flags &= ~B_ERROR;	/* try again later */
		else
			bp->b_dev = NODEV;		/* no assoc */

	/*
	 * Stick the buffer back on a free list.
	 */
	s = splbio();
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bfreelist[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR|B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
	splx(s);
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
 */
incore(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(dev, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_blkno == blkno && bp->b_dev == dev &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

/*
 * Return the block if it is already in core,
 * but do not start a read if it is not (cf. bread).
 */
struct buf *
baddr(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{

	if (incore(dev, blkno))
		return (bread(dev, blkno, size));
	return (0);
}
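/*
 * Illustrative sketch (not part of the original source): how a caller
 * typically chooses among the three write flavors above.  A partial
 * block write uses bdwrite so a following write to the same block can
 * be absorbed in core; a full block not needed again soon goes out
 * with bawrite; a caller that must see the outcome uses bwrite.  The
 * name "ex_write" and its arguments are hypothetical.
 */
#ifdef notdef
ex_write(bp, partial, critical)
	register struct buf *bp;
	int partial, critical;
{

	if (critical)
		bwrite(bp);	/* synchronous; error posted via u.u_error */
	else if (partial)
		bdwrite(bp);	/* delayed; marks B_DELWRI, no i/o yet */
	else
		bawrite(bp);	/* asynchronous; i/o started, not awaited */
}
#endif notdef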
/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * If we find the buffer, but it is dirty (marked DELWRI) and
 * its size is changing, we must write it out first.  When the
 * buffer is shrinking, the write is done by brealloc to avoid
 * losing the unwritten data.  When the buffer is growing, the
 * write is done by getblk, so that bread will not read stale
 * disk data over the modified data in the buffer.
 *
 * We use splx here because this routine may be called
 * on the interrupt stack during a dump, and we don't
 * want to lower the ipl back to 0.
 */
struct buf *
getblk(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * To prevent overflow of 32-bit ints when converting block
	 * numbers to byte offsets, blknos > 2^32 / DEV_BSIZE are set
	 * to the maximum number that can be converted to a byte offset
	 * without overflow.  This is historic code; what bug it fixed,
	 * or whether it is still a reasonable thing to do is open to
	 * dispute.  mkm 9/85
	 */
	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-DEV_BSHIFT))
		blkno = 1 << ((sizeof(int)*NBBY-DEV_BSHIFT) + 1);
	/*
	 * Search the cache for the block.  If we hit, but
	 * the buffer is in use for i/o, then we wait until
	 * the i/o has completed.
	 */
	dp = BUFHASH(dev, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_dev != dev ||
		    bp->b_flags&B_INVAL)
			continue;
		s = splbio();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(bp);
		if (bp->b_bcount != size) {
			if (bp->b_bcount < size && (bp->b_flags&B_DELWRI)) {
				bp->b_flags &= ~B_ASYNC;
				bwrite(bp);
				goto loop;
			}
			if (brealloc(bp, size) == 0)
				goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	if (major(dev) >= nblkdev)
		panic("blkdev");
	bp = getnewbuf();
	bfree(bp);
	bremhash(bp);
	binshash(bp, dp);
	bp->b_dev = dev;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	if (brealloc(bp, size) == 0)
		goto loop;
	return (bp);
}

/*
 * Get an empty block,
 * not assigned to any particular device.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
loop:
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bfree(bp);
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	binshash(bp, flist);
	bp->b_dev = (dev_t)NODEV;
	bp->b_error = 0;
	if (brealloc(bp, size) == 0)
		goto loop;
	return (bp);
}
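/*
 * Illustrative sketch (not part of the original source): geteblk is
 * the way to borrow a cache buffer as device-independent scratch
 * memory.  B_INVAL keeps its contents from ever being treated as a
 * cached disk block, and brelse returns the space to the AGE list
 * for quick reclamation.  The name "ex_scratch" is hypothetical.
 */
#ifdef notdef
ex_scratch()
{
	register struct buf *bp;

	bp = geteblk(MAXBSIZE);		/* largest legal request */
	/* ... use bp->b_un.b_addr as MAXBSIZE bytes of scratch ... */
	brelse(bp);
}
#endif notdef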
/*
 * Allocate space associated with a buffer.
 * If we can't get space, the buffer is released.
 */
brealloc(bp, size)
	register struct buf *bp;
	int size;
{
	daddr_t start, last;
	register struct buf *ep;
	struct buf *dp;
	int s;

	/*
	 * First need to make sure that all overlapping previous I/O
	 * is dispensed with.
	 */
	if (size == bp->b_bcount)
		return (1);
	if (size < bp->b_bcount) {
		if (bp->b_flags & B_DELWRI) {
			bwrite(bp);
			return (0);
		}
		if (bp->b_flags & B_LOCKED)
			panic("brealloc");
		return (allocbuf(bp, size));
	}
	bp->b_flags &= ~B_DONE;
	if (bp->b_dev == NODEV)
		return (allocbuf(bp, size));

	trace(TR_BREALLOC, pack(bp->b_dev, size), bp->b_blkno);
	/*
	 * Search cache for any buffers that overlap the one that we
	 * are trying to allocate.  Overlapping buffers must be marked
	 * invalid, after being written out if they are dirty (indicated
	 * by B_DELWRI).  A disk block must be mapped by at most one buffer
	 * at any point in time.  Care must be taken to avoid deadlocking
	 * when two buffers are trying to get the same set of disk blocks.
	 */
	start = bp->b_blkno;
	last = start + btodb(size) - 1;
	dp = BUFHASH(bp->b_dev, bp->b_blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep == bp || ep->b_dev != bp->b_dev || (ep->b_flags&B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + btodb(ep->b_bcount) <= start)
			continue;
		s = splbio();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(ep);
		if (ep->b_flags & B_DELWRI) {
			bwrite(ep);
			goto loop;
		}
		ep->b_flags |= B_INVAL;
		brelse(ep);
	}
	return (allocbuf(bp, size));
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	int s;

loop:
	s = splbio();
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO+1);
		splx(s);
		goto loop;
	}
	splx(s);
	bp = dp->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_dev, bp->b_bufsize), bp->b_blkno);
	bp->b_flags = B_BUSY;
	return (bp);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags&B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if (u.u_error == 0)			/* XXX */
		u.u_error = geterror(bp);
}

/*
 * Mark I/O complete on a buffer.
 * If someone should be called, e.g. the pageout
 * daemon, do so.  Otherwise, wake up anyone
 * waiting for it.
 */
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}
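/*
 * Illustrative sketch (not part of the original source): the B_CALL
 * path in biodone lets a caller take a callback at interrupt time
 * instead of sleeping in biowait, which is how daemons such as the
 * pageout daemon consume completions.  "ex_start" and "ex_done" are
 * hypothetical names.
 */
#ifdef notdef
ex_done(bp)
	register struct buf *bp;
{

	/* called from biodone at interrupt level; must not sleep */
	brelse(bp);
}

ex_start(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_READ|B_ASYNC|B_CALL;
	bp->b_iodone = ex_done;
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);
	/* control returns at once; ex_done fires on completion */
}
#endif notdef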
/*
 * Ensure that no part of a specified block is in an incore buffer.
#ifdef SECSIZE
 * "size" is given in device blocks (the units of b_blkno).
#endif SECSIZE
 */
blkflush(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	long size;
{
	register struct buf *ep;
	struct buf *dp;
	daddr_t start, last;
	int s;

	start = blkno;
	last = start + btodb(size) - 1;
	dp = BUFHASH(dev, blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep->b_dev != dev || (ep->b_flags&B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + btodb(ep->b_bcount) <= start)
			continue;
		s = splbio();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		if (ep->b_flags & B_DELWRI) {
			splx(s);
			notavail(ep);
			bwrite(ep);
			goto loop;
		}
		splx(s);
	}
}

/*
 * Make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (from umount and update)
 */
bflush(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct buf *flist;
	int s;

loop:
	s = splbio();
	for (flist = bfreelist; flist < &bfreelist[BQ_EMPTY]; flist++)
		for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
			if ((bp->b_flags & B_DELWRI) == 0)
				continue;
			if (dev == NODEV || dev == bp->b_dev) {
				bp->b_flags |= B_ASYNC;
				notavail(bp);
				bwrite(bp);
				splx(s);
				goto loop;
			}
		}
	splx(s);
}

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized code.
 */
geterror(bp)
	register struct buf *bp;
{
	int error = 0;

	if (bp->b_flags&B_ERROR)
		if ((error = bp->b_error) == 0)
			return (EIO);
	return (error);
}

/*
 * Invalidate in-core blocks belonging to closed or unmounted filesystems.
 *
 * This is not nicely done at all - the buffer ought to be removed from the
 * hash chains & have its dev/blkno fields clobbered, but unfortunately we
 * can't do that here, as it is quite possible that the block is still
 * being used for i/o.  Eventually, all disc drivers should be forced to
 * have a close routine, which ought to ensure that the queue is empty, then
 * properly flush the queues.  Until that happy day, this suffices for
 * correctness. ... kre
 */
binval(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct bufhd *hp;
#define	dp	((struct buf *)hp)

	for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++)
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
			if (bp->b_dev == dev)
				bp->b_flags |= B_INVAL;
}
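/*
 * Illustrative sketch (not part of the original source): the order in
 * which a teardown path would use the routines above - push out dirty
 * write-behind blocks with bflush, then invalidate what remains with
 * binval so stale data cannot be found by a later getblk.  "ex_umount"
 * is hypothetical and elides the checks a real umount must make.
 */
#ifdef notdef
ex_umount(dev)
	dev_t dev;
{

	bflush(dev);		/* start i/o on dev's delayed writes */
	binval(dev);		/* then mark surviving buffers B_INVAL */
}
#endif notdef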