/*-
 * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This module is believed to contain source code proprietary to AT&T.
 * Use and redistribution is subject to the Berkeley Software License
 * Agreement and your Software Agreement with AT&T (Western Electric).
 *
 *	@(#)vfs_bio.c	7.54 (Berkeley) 10/02/92
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/resourcevar.h>
#include <sys/malloc.h>
#include <libkern/libkern.h>

/*
 * Definitions for the buffer hash lists.
 */
#define	BUFHASH(dvp, lbn)	\
	(&bufhashtbl[((int)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
struct	buf **bufhashtbl, *invalhash;
u_long	bufhash;

/*
 * Insq/Remq for the buffer hash lists.
 */
#define	bremhash(bp) { \
	struct buf *bq; \
	if (bq = (bp)->b_forw) \
		bq->b_back = (bp)->b_back; \
	*(bp)->b_back = bq; \
}
#define	binshash(bp, dp) { \
	struct buf *bq; \
	if (bq = *(dp)) \
		bq->b_back = &(bp)->b_forw; \
	(bp)->b_forw = bq; \
	(bp)->b_back = (dp); \
	*(dp) = (bp); \
}

/*
 * Definitions for the buffer free lists.
 */
#define	BQUEUES		4		/* number of free buffer queues */

#define	BQ_LOCKED	0		/* super-blocks &c */
#define	BQ_LRU		1		/* lru, useful buffers */
#define	BQ_AGE		2		/* rubbish */
#define	BQ_EMPTY	3		/* buffer headers with no memory */

struct bufqueue {
	struct buf *buffreehead;	/* head of available list */
	struct buf **buffreetail;	/* tail of available list */
} bufqueues[BQUEUES];
int needbuffer;

/*
 * Insq/Remq for the buffer free lists.
 */
void
bremfree(bp)
	struct buf *bp;
{
	struct buf *bq;
	struct bufqueue *dp;

	if (bq = bp->b_actf) {
		bq->b_actb = bp->b_actb;
	} else {
		for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
			if (dp->buffreetail == &bp->b_actf)
				break;
		if (dp == &bufqueues[BQUEUES])
			panic("bremfree: lost tail");
		dp->buffreetail = bp->b_actb;
	}
	*bp->b_actb = bq;
}

#define	binsheadfree(bp, dp) { \
	struct buf *bq; \
	if (bq = (dp)->buffreehead) \
		bq->b_actb = &(bp)->b_actf; \
	else \
		(dp)->buffreetail = &(bp)->b_actf; \
	(dp)->buffreehead = (bp); \
	(bp)->b_actf = bq; \
	(bp)->b_actb = &(dp)->buffreehead; \
}
#define	binstailfree(bp, dp) { \
	(bp)->b_actf = NULL; \
	(bp)->b_actb = (dp)->buffreetail; \
	*(dp)->buffreetail = (bp); \
	(dp)->buffreetail = &(bp)->b_actf; \
}

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit()
{
	register struct buf *bp;
	struct bufqueue *dp;
	register int i;
	int base, residual;

	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
		dp->buffreetail = &dp->buffreehead;
	bufhashtbl = (struct buf **)hashinit(nbuf, M_CACHE, &bufhash);
	base = bufpages / nbuf;
	residual = bufpages % nbuf;
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero((char *)bp, sizeof *bp);
		bp->b_dev = NODEV;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_un.b_addr = buffers + i * MAXBSIZE;
		if (i < residual)
			bp->b_bufsize = (base + 1) * CLBYTES;
		else
			bp->b_bufsize = base * CLBYTES;
		bp->b_flags = B_INVAL;
		dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY];
		binsheadfree(bp, dp);
		binshash(bp, &invalhash);
	}
}
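
/*
 * Worked example (illustrative only, not part of the original file):
 * with the hypothetical values nbuf = 100 and bufpages = 150, bufinit
 * computes base = 1 and residual = 50, so buffers 0-49 each start with
 * (1 + 1) * CLBYTES of memory and buffers 50-99 with 1 * CLBYTES.
 * Every header starts out invalid: those with memory go onto BQ_AGE,
 * and any header left with no memory at all goes onto BQ_EMPTY.
 */
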
/*
 * Find the block in the buffer pool.
 * If the buffer is not present, allocate a new buffer and load
 * its contents according to the filesystem fill routine.
 */
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size);
	if (bp->b_flags & (B_DONE | B_DELWRI)) {
		trace(TR_BREADHIT, pack(vp, size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	if (bp->b_rcred == NOCRED && cred != NOCRED) {
		crhold(cred);
		bp->b_rcred = cred;
	}
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp, size), blkno);
	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
	return (biowait(bp));
}

/*
 * Operates like bread, but also starts I/O on the N specified
 * read-ahead blocks.
 */
breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno[]; int rabsize[];
	int num;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp, *rabp;
	register int i;

	bp = NULL;
	/*
	 * If the block is not memory resident,
	 * allocate a buffer and start I/O.
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size);
		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breadn");
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp, size), blkno);
			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
		} else {
			trace(TR_BREADHIT, pack(vp, size), blkno);
		}
	}

	/*
	 * If there are read-ahead blocks, start I/O
	 * on them also (as above).
	 */
	for (i = 0; i < num; i++) {
		if (incore(vp, rablkno[i]))
			continue;
		rabp = getblk(vp, rablkno[i], rabsize[i]);
		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
		} else {
			rabp->b_flags |= B_ASYNC | B_READ;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If the block was memory resident, let bread get it.
	 * If the block was not memory resident, the read was
	 * started above, so just wait for the read to complete.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, cred, bpp));
	return (biowait(bp));
}
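
/*
 * Example (hypothetical caller, not part of the original file): a
 * filesystem read path would typically fetch a logical block and ask
 * for one block of read-ahead roughly as follows; "vp", "lbn", and
 * "bsize" are assumed to be supplied by the caller:
 *
 *	struct buf *bp;
 *	daddr_t ra_lbn = lbn + 1;
 *	int ra_size = bsize, error;
 *
 *	error = breadn(vp, lbn, bsize, &ra_lbn, &ra_size, 1, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... copy data out of bp->b_un.b_addr ...
 *	brelse(bp);
 */
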
/*
 * Synchronous write.
 * Release buffer on completion.
 */
bwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */
	register int flag;
	int s, error = 0;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if (flag & B_ASYNC) {
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
	}
	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	s = splbio();
	bp->b_vp->v_numoutput++;
	splx(s);
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await I/O completion.
	 * If the write was "delayed", then we put the buffer on
	 * the queue of blocks awaiting I/O completion status.
	 */
	if ((flag & B_ASYNC) == 0) {
		error = biowait(bp);
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
		brelse(bp);
	} else if (flag & B_DELWRI) {
		s = splbio();
		bp->b_flags |= B_AGE;
		splx(s);
	}
	return (error);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}


/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
	}
	/*
	 * If this is a tape drive, the write must be initiated.
	 */
	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
		bawrite(bp);
	} else {
		bp->b_flags |= (B_DONE | B_DELWRI);
		brelse(bp);
	}
}

/*
 * Asynchronous write.
 * Start I/O on a buffer, but do not wait for it to complete.
 * The buffer is released when the I/O completes.
 */
bawrite(bp)
	register struct buf *bp;
{

	/*
	 * Setting the ASYNC flag causes bwrite to return
	 * after starting the I/O.
	 */
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release a buffer.
 * Even if the buffer is dirty, no I/O is started.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct bufqueue *flist;
	int s;

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	/*
	 * If a process is waiting for the buffer, or
	 * is waiting for a free buffer, awaken it.
	 */
	if (bp->b_flags & B_WANTED)
		wakeup((caddr_t)bp);
	if (needbuffer) {
		needbuffer = 0;
		wakeup((caddr_t)&needbuffer);
	}
	/*
	 * Retry I/O for locked buffers rather than invalidating them.
	 */
	s = splbio();
	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
		bp->b_flags &= ~B_ERROR;
	/*
	 * Disassociate buffers that are no longer valid.
	 */
	if (bp->b_flags & (B_NOCACHE | B_ERROR))
		bp->b_flags |= B_INVAL;
	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
		if (bp->b_vp)
			brelvp(bp);
		bp->b_flags &= ~B_DELWRI;
	}
	/*
	 * Stick the buffer back on a free list.
	 */
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bufqueues[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bufqueues[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bufqueues[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bufqueues[BQ_AGE];
		else
			flist = &bufqueues[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
	splx(s);
}
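
/*
 * Example (hypothetical caller, not part of the original file): after
 * dirtying a buffer obtained from bread or getblk, a caller chooses
 * among the three write flavors above according to urgency:
 *
 *	if (must_reach_disk_now)
 *		error = bwrite(bp);	(waits for I/O, returns error)
 *	else if (likely_to_be_modified_again_soon)
 *		bdwrite(bp);		(mark dirty, start no I/O)
 *	else
 *		bawrite(bp);		(start I/O, do not wait)
 *
 * All three paths give up the buffer; only bwrite can report an
 * error to its caller.
 */
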
/*
 * Check to see if a block is currently memory resident.
 */
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	register struct buf *bp;

	for (bp = *BUFHASH(vp, blkno); bp; bp = bp->b_forw)
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

/*
 * Check to see if a block is currently memory resident.
 * If it is resident, return it. If it is not resident,
 * allocate a new buffer and assign it to the block.
 */
struct buf *
getblk(vp, blkno, size)
	register struct vnode *vp;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, **dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * Search the cache for the block. If the buffer is found,
	 * but it is currently locked, then we must wait for it to
	 * become available.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = *dp; bp; bp = bp->b_forw) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp ||
		    (bp->b_flags & B_INVAL))
			continue;
		s = splbio();
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO + 1);
			splx(s);
			goto loop;
		}
		bremfree(bp);
		bp->b_flags |= B_BUSY;
		splx(s);
		if (bp->b_bcount != size) {
			printf("getblk: stray size");
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	bp = getnewbuf();
	bremhash(bp);
	bgetvp(vp, bp);
	bp->b_bcount = 0;
	bp->b_lblkno = blkno;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Allocate a buffer.
 * The caller will assign it to a block.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bremhash(bp);
	binshash(bp, &invalhash);
	bp->b_bcount = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	allocbuf(bp, size);
	return (bp);
}
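
/*
 * Example (hypothetical caller, not part of the original file): a
 * caller about to overwrite an entire block has no use for its old
 * contents, so it can call getblk instead of bread and avoid the
 * read I/O altogether; "vp", "lbn", and "bsize" are assumed inputs:
 *
 *	struct buf *bp;
 *
 *	bp = getblk(vp, lbn, bsize);
 *	bzero(bp->b_un.b_addr, bsize);
 *	... fill in the new contents ...
 *	bwrite(bp);
 */
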
/*
 * Expand or contract the actual memory allocated to a buffer.
 * If no memory is available, release buffer and take error exit.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take, s;

	sizealloc = roundup(size, CLBYTES);
	/*
	 * Buffer size does not change.
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		if ((ep = bufqueues[BQ_EMPTY].buffreehead) == NULL)
			goto out;
		s = splbio();
		bremfree(ep);
		ep->b_flags |= B_BUSY;
		splx(s);
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
		    (int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed. Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		bp = getnewbuf();
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
		    &tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		if (bp->b_bufsize <= 0) {
			bremhash(bp);
			binshash(bp, &invalhash);
			bp->b_dev = NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
	return (1);
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp;
	register struct bufqueue *dp;
	register struct ucred *cred;
	int s;

loop:
	s = splbio();
	for (dp = &bufqueues[BQ_AGE]; dp > bufqueues; dp--)
		if (dp->buffreehead)
			break;
	if (dp == bufqueues) {		/* no free blocks */
		needbuffer = 1;
		sleep((caddr_t)&needbuffer, PRIBIO + 1);
		splx(s);
		goto loop;
	}
	bp = dp->buffreehead;
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	splx(s);
	if (bp->b_flags & B_DELWRI) {
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	bp->b_flags = B_BUSY;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Wait for I/O to complete.
 *
 * Extract and return any errors associated with the I/O.
 * If the error flag is set, but no specific error is
 * given, return EIO.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}
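
/*
 * Example (hypothetical caller, not part of the original file): code
 * that drives the strategy routine directly follows the same pattern
 * bread uses above; mark the buffer, queue the I/O, then wait:
 *
 *	bp->b_flags |= B_READ;
 *	VOP_STRATEGY(bp);
 *	error = biowait(bp);	(sleeps until biodone sets B_DONE)
 *
 * An asynchronous caller would instead set B_ASYNC (or B_CALL with
 * b_iodone pointing at a completion routine) before VOP_STRATEGY and
 * skip biowait; biodone then releases the buffer or makes the
 * callback, as the flags direct.
 */
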
/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so. Otherwise, awaken waiting processes.
 */
void
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

#ifdef DIAGNOSTIC
/*
 * Print out statistics on the current allocation of the buffer pool.
 * Can be enabled to print out on every ``sync'' by setting "syncprt"
 * above.
 */
void
vfs_bufstats()
{
	int s, i, j, count;
	register struct buf *bp;
	register struct bufqueue *dp;
	int counts[MAXBSIZE/CLBYTES+1];
	static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" };

	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
		count = 0;
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			counts[j] = 0;
		s = splbio();
		for (bp = dp->buffreehead; bp; bp = bp->b_actf) {
			counts[bp->b_bufsize/CLBYTES]++;
			count++;
		}
		splx(s);
		printf("%s: total-%d", bname[i], count);
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			if (counts[j] != 0)
				printf(", %d-%d", j * CLBYTES, counts[j]);
		printf("\n");
	}
}
#endif /* DIAGNOSTIC */
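
/*
 * Example output (hypothetical numbers, assuming CLBYTES = 4096):
 *
 *	LOCKED: total-0
 *	LRU: total-72, 4096-16, 8192-56
 *	AGE: total-12, 4096-12
 *	EMPTY: total-16, 0-16
 *
 * i.e. for each free list, the buffer count followed by
 * "bufsize-count" pairs for each buffer size present on that list.
 */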