1 /* 2 * Copyright (c) 1982 Regents of the University of California. 3 * All rights reserved. The Berkeley software License Agreement 4 * specifies the terms and conditions for redistribution. 5 * 6 * @(#)vfs_cluster.c 6.9 (Berkeley) 02/20/86 7 */ 8 9 #include "../machine/pte.h" 10 11 #include "param.h" 12 #include "systm.h" 13 #include "dir.h" 14 #include "user.h" 15 #include "buf.h" 16 #include "conf.h" 17 #include "proc.h" 18 #include "seg.h" 19 #include "vm.h" 20 #include "trace.h" 21 22 /* 23 * Read in (if necessary) the block and return a buffer pointer. 24 */ 25 struct buf * 26 bread(dev, blkno, size) 27 dev_t dev; 28 daddr_t blkno; 29 int size; 30 { 31 register struct buf *bp; 32 33 if (size == 0) 34 panic("bread: size 0"); 35 bp = getblk(dev, blkno, size); 36 if (bp->b_flags&B_DONE) { 37 trace(TR_BREADHIT, pack(dev, size), blkno); 38 return (bp); 39 } 40 bp->b_flags |= B_READ; 41 if (bp->b_bcount > bp->b_bufsize) 42 panic("bread"); 43 (*bdevsw[major(dev)].d_strategy)(bp); 44 trace(TR_BREADMISS, pack(dev, size), blkno); 45 u.u_ru.ru_inblock++; /* pay for read */ 46 biowait(bp); 47 return (bp); 48 } 49 50 /* 51 * Read in the block, like bread, but also start I/O on the 52 * read-ahead block (which is not allocated to the caller) 53 */ 54 struct buf * 55 breada(dev, blkno, size, rablkno, rabsize) 56 dev_t dev; 57 daddr_t blkno; int size; 58 daddr_t rablkno; int rabsize; 59 { 60 register struct buf *bp, *rabp; 61 62 bp = NULL; 63 /* 64 * If the block isn't in core, then allocate 65 * a buffer and initiate i/o (getblk checks 66 * for a cache hit). 
67 */ 68 if (!incore(dev, blkno)) { 69 bp = getblk(dev, blkno, size); 70 if ((bp->b_flags&B_DONE) == 0) { 71 bp->b_flags |= B_READ; 72 if (bp->b_bcount > bp->b_bufsize) 73 panic("breada"); 74 (*bdevsw[major(dev)].d_strategy)(bp); 75 trace(TR_BREADMISS, pack(dev, size), blkno); 76 u.u_ru.ru_inblock++; /* pay for read */ 77 } else 78 trace(TR_BREADHIT, pack(dev, size), blkno); 79 } 80 81 /* 82 * If there's a read-ahead block, start i/o 83 * on it also (as above). 84 */ 85 if (rablkno && !incore(dev, rablkno)) { 86 rabp = getblk(dev, rablkno, rabsize); 87 if (rabp->b_flags & B_DONE) { 88 brelse(rabp); 89 trace(TR_BREADHITRA, pack(dev, rabsize), blkno); 90 } else { 91 rabp->b_flags |= B_READ|B_ASYNC; 92 if (rabp->b_bcount > rabp->b_bufsize) 93 panic("breadrabp"); 94 (*bdevsw[major(dev)].d_strategy)(rabp); 95 trace(TR_BREADMISSRA, pack(dev, rabsize), rablock); 96 u.u_ru.ru_inblock++; /* pay in advance */ 97 } 98 } 99 100 /* 101 * If block was in core, let bread get it. 102 * If block wasn't in core, then the read was started 103 * above, and just wait for it. 104 */ 105 if (bp == NULL) 106 return (bread(dev, blkno, size)); 107 biowait(bp); 108 return (bp); 109 } 110 111 /* 112 * Write the buffer, waiting for completion. 113 * Then release the buffer. 114 */ 115 bwrite(bp) 116 register struct buf *bp; 117 { 118 register flag; 119 120 flag = bp->b_flags; 121 bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); 122 if ((flag&B_DELWRI) == 0) 123 u.u_ru.ru_oublock++; /* noone paid yet */ 124 trace(TR_BWRITE, pack(bp->b_dev, bp->b_bcount), bp->b_blkno); 125 if (bp->b_bcount > bp->b_bufsize) 126 panic("bwrite"); 127 (*bdevsw[major(bp->b_dev)].d_strategy)(bp); 128 129 /* 130 * If the write was synchronous, then await i/o completion. 131 * If the write was "delayed", then we put the buffer on 132 * the q of blocks awaiting i/o completion status. 
133 */ 134 if ((flag&B_ASYNC) == 0) { 135 biowait(bp); 136 brelse(bp); 137 } else if (flag & B_DELWRI) 138 bp->b_flags |= B_AGE; 139 } 140 141 /* 142 * Release the buffer, marking it so that if it is grabbed 143 * for another purpose it will be written out before being 144 * given up (e.g. when writing a partial block where it is 145 * assumed that another write for the same block will soon follow). 146 * This can't be done for magtape, since writes must be done 147 * in the same order as requested. 148 */ 149 bdwrite(bp) 150 register struct buf *bp; 151 { 152 register int flags; 153 154 if ((bp->b_flags&B_DELWRI) == 0) 155 u.u_ru.ru_oublock++; /* noone paid yet */ 156 flags = bdevsw[major(bp->b_dev)].d_flags; 157 if(flags & B_TAPE) 158 bawrite(bp); 159 else { 160 bp->b_flags |= B_DELWRI | B_DONE; 161 brelse(bp); 162 } 163 } 164 165 /* 166 * Release the buffer, start I/O on it, but don't wait for completion. 167 */ 168 bawrite(bp) 169 register struct buf *bp; 170 { 171 172 bp->b_flags |= B_ASYNC; 173 bwrite(bp); 174 } 175 176 /* 177 * Release the buffer, with no I/O implied. 178 */ 179 brelse(bp) 180 register struct buf *bp; 181 { 182 register struct buf *flist; 183 register s; 184 185 trace(TR_BRELSE, pack(bp->b_dev, bp->b_bufsize), bp->b_blkno); 186 /* 187 * If someone's waiting for the buffer, or 188 * is waiting for a buffer wake 'em up. 189 */ 190 if (bp->b_flags&B_WANTED) 191 wakeup((caddr_t)bp); 192 if (bfreelist[0].b_flags&B_WANTED) { 193 bfreelist[0].b_flags &= ~B_WANTED; 194 wakeup((caddr_t)bfreelist); 195 } 196 if (bp->b_flags&B_ERROR) 197 if (bp->b_flags & B_LOCKED) 198 bp->b_flags &= ~B_ERROR; /* try again later */ 199 else 200 bp->b_dev = NODEV; /* no assoc */ 201 202 /* 203 * Stick the buffer back on a free list. 204 */ 205 s = splbio(); 206 if (bp->b_bufsize <= 0) { 207 /* block has no buffer ... 
put at front of unused buffer list */ 208 flist = &bfreelist[BQ_EMPTY]; 209 binsheadfree(bp, flist); 210 } else if (bp->b_flags & (B_ERROR|B_INVAL)) { 211 /* block has no info ... put at front of most free list */ 212 flist = &bfreelist[BQ_AGE]; 213 binsheadfree(bp, flist); 214 } else { 215 if (bp->b_flags & B_LOCKED) 216 flist = &bfreelist[BQ_LOCKED]; 217 else if (bp->b_flags & B_AGE) 218 flist = &bfreelist[BQ_AGE]; 219 else 220 flist = &bfreelist[BQ_LRU]; 221 binstailfree(bp, flist); 222 } 223 bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE); 224 splx(s); 225 } 226 227 /* 228 * See if the block is associated with some buffer 229 * (mainly to avoid getting hung up on a wait in breada) 230 */ 231 incore(dev, blkno) 232 dev_t dev; 233 daddr_t blkno; 234 { 235 register struct buf *bp; 236 register struct buf *dp; 237 238 dp = BUFHASH(dev, blkno); 239 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) 240 if (bp->b_blkno == blkno && bp->b_dev == dev && 241 (bp->b_flags & B_INVAL) == 0) 242 return (1); 243 return (0); 244 } 245 246 struct buf * 247 baddr(dev, blkno, size) 248 dev_t dev; 249 daddr_t blkno; 250 int size; 251 { 252 253 if (incore(dev, blkno)) 254 return (bread(dev, blkno, size)); 255 return (0); 256 } 257 258 /* 259 * Assign a buffer for the given block. If the appropriate 260 * block is already associated, return it; otherwise search 261 * for the oldest non-busy buffer and reassign it. 262 * 263 * We use splx here because this routine may be called 264 * on the interrupt stack during a dump, and we don't 265 * want to lower the ipl back to 0. 
 */
struct buf *
getblk(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * To prevent overflow of 32-bit ints when converting block
	 * numbers to byte offsets, blknos > 2^32 / DEV_BSIZE are set
	 * to the maximum number that can be converted to a byte offset
	 * without overflow. This is historic code; what bug it fixed,
	 * or whether it is still a reasonable thing to do is open to
	 * dispute. mkm 9/85
	 *
	 * NOTE(review): the replacement value (1 << (limit_bits + 1)) is
	 * larger than the limit just tested, not the stated maximum --
	 * historic behavior, deliberately left unchanged here.
	 */
	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-DEV_BSHIFT))
		blkno = 1 << ((sizeof(int)*NBBY-DEV_BSHIFT) + 1);
	/*
	 * Search the cache for the block.  If we hit, but
	 * the buffer is in use for i/o, then we wait until
	 * the i/o has completed.
	 */
	dp = BUFHASH(dev, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_dev != dev ||
		    bp->b_flags&B_INVAL)
			continue;
		/* raise ipl before testing B_BUSY so biodone can't race us */
		s = splbio();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			splx(s);
			/* buffer may have been reassigned while we slept; rescan */
			goto loop;
		}
		splx(s);
		notavail(bp);
		/* brealloc may sleep and lose the buffer; rescan on failure */
		if (bp->b_bcount != size && brealloc(bp, size) == 0)
			goto loop;
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	if (major(dev) >= nblkdev)
		panic("blkdev");
	/* cache miss: steal the oldest free buffer and reassign it */
	bp = getnewbuf();
	bfree(bp);
	bremhash(bp);
	binshash(bp, dp);
	bp->b_dev = dev;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	if (brealloc(bp, size) == 0)
		goto loop;
	return (bp);
}

/*
 * get an empty block,
 * not assigned to any particular device
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
loop:
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bfree(bp);
	bremhash(bp);
	/* hash onto the age free list head so it is found nowhere useful */
	flist = &bfreelist[BQ_AGE];
	binshash(bp, flist);
	bp->b_dev = (dev_t)NODEV;
	bp->b_error = 0;
	if (brealloc(bp, size) == 0)
		goto loop;
	return (bp);
}

/*
 * Allocate space associated with a buffer.
 * If can't get space, buffer is released
 * (returns 0; returns nonzero on success).
 */
brealloc(bp, size)
	register struct buf *bp;
	int size;
{
	daddr_t start, last;
	register struct buf *ep;
	struct buf *dp;
	int s;

	/*
	 * First need to make sure that all overlaping previous I/O
	 * is dispatched with.
	 */
	if (size == bp->b_bcount)
		return (1);
	if (size < bp->b_bcount) {
		/* shrinking: dirty data must reach disk before truncation */
		if (bp->b_flags & B_DELWRI) {
			bwrite(bp);
			return (0);
		}
		if (bp->b_flags & B_LOCKED)
			panic("brealloc");
		return (allocbuf(bp, size));
	}
	bp->b_flags &= ~B_DONE;
	if (bp->b_dev == NODEV)
		return (allocbuf(bp, size));

	trace(TR_BREALLOC, pack(bp->b_dev, size), bp->b_blkno);
	/*
	 * Search cache for any buffers that overlap the one that we
	 * are trying to allocate. Overlapping buffers must be marked
	 * invalid, after being written out if they are dirty. (indicated
	 * by B_DELWRI) A disk block must be mapped by at most one buffer
	 * at any point in time. Care must be taken to avoid deadlocking
	 * when two buffer are trying to get the same set of disk blocks.
	 */
	start = bp->b_blkno;
	last = start + btodb(size) - 1;
	dp = BUFHASH(bp->b_dev, bp->b_blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep == bp || ep->b_dev != bp->b_dev || (ep->b_flags&B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + btodb(ep->b_bcount) <= start)
			continue;
		s = splbio();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			/* hash chain may have changed while asleep; rescan */
			goto loop;
		}
		splx(s);
		notavail(ep);
		if (ep->b_flags & B_DELWRI) {
			bwrite(ep);
			goto loop;
		}
		ep->b_flags |= B_INVAL;
		brelse(ep);
	}
	return (allocbuf(bp, size));
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	int s;

loop:
	s = splbio();
	/* scan AGE then LRU (dp counts down); stop at first nonempty list */
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO+1);
		splx(s);
		goto loop;
	}
	splx(s);
	bp = dp->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		/* push delayed write out asynchronously, then pick again */
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_dev, bp->b_bufsize), bp->b_blkno);
	bp->b_flags = B_BUSY;
	return (bp);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags&B_DONE)==0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	/* only report the first error seen on this operation */
	if (u.u_error == 0)			/* XXX */
		u.u_error = geterror(bp);
}

/*
 * Mark I/O complete on a buffer.
 * If someone should be called, e.g. the pageout
 * daemon, do so.
Otherwise, wake up anyone
 * waiting for it.
 */
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_CALL) {
		/* completion callback (e.g. pageout daemon) owns the buffer */
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

/*
 * Insure that no part of a specified block is in an incore buffer.
 * Dirty overlapping buffers are written out; busy ones are waited for.
 */
blkflush(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	long size;
{
	register struct buf *ep;
	struct buf *dp;
	daddr_t start, last;
	int s;

	start = blkno;
	last = start + btodb(size) - 1;
	dp = BUFHASH(dev, blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep->b_dev != dev || (ep->b_flags&B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + btodb(ep->b_bcount) <= start)
			continue;
		/* raise ipl before testing B_BUSY so biodone can't race us */
		s = splbio();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			/* chain may have changed while asleep; rescan */
			goto loop;
		}
		if (ep->b_flags & B_DELWRI) {
			splx(s);
			notavail(ep);
			bwrite(ep);
			goto loop;
		}
		splx(s);
	}
}

/*
 * Make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (from umount and update)
 */
bflush(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct buf *flist;
	int s;

loop:
	s = splbio();
	for (flist = bfreelist; flist < &bfreelist[BQ_EMPTY]; flist++)
	for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
		if ((bp->b_flags & B_DELWRI) == 0)
			continue;
		if (dev == NODEV || dev == bp->b_dev) {
			bp->b_flags |= B_ASYNC;
			notavail(bp);
			bwrite(bp);
			/* lists changed under us; restart the whole scan */
			splx(s);
			goto loop;
		}
	}
	splx(s);
}

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized code.
 */
geterror(bp)
	register struct buf *bp;
{
	int error = 0;

	if (bp->b_flags&B_ERROR)
		if ((error = bp->b_error)==0)
			return (EIO);	/* error flagged but no specific code */
	return (error);
}

/*
 * Invalidate in core blocks belonging to closed or umounted filesystem
 *
 * This is not nicely done at all - the buffer ought to be removed from the
 * hash chains & have its dev/blkno fields clobbered, but unfortunately we
 * can't do that here, as it is quite possible that the block is still
 * being used for i/o. Eventually, all disc drivers should be forced to
 * have a close routine, which ought ensure that the queue is empty, then
 * properly flush the queues. Until that happy day, this suffices for
 * correctness. ... kre
 */
binval(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct bufhd *hp;
/* alias the hash-chain head as a struct buf so b_forw links line up */
#define dp ((struct buf *)hp)

	for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++)
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
			if (bp->b_dev == dev)
				bp->b_flags |= B_INVAL;
}