/*-
 * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This module is believed to contain source code proprietary to AT&T.
 * Use and redistribution is subject to the Berkeley Software License
 * Agreement and your Software Agreement with AT&T (Western Electric).
 *
 *	@(#)vfs_bio.c	7.51 (Berkeley) 07/12/92
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/resourcevar.h>

/*
 * Initialize buffers and hash links for buffers.
 *
 * Called once at system startup: empties every hash chain, makes each
 * free-list header a self-referencing ring, and parcels the bufpages of
 * buffer memory out across the nbuf headers as evenly as possible.
 */
void
bufinit()
{
	register int i;
	register struct buf *bp, *dp;
	register struct bufhd *hp;
	int base, residual;

	/* Each hash chain starts empty: the head points at itself. */
	for (hp = bufhash, i = 0; i < BUFHSZ; i++, hp++)
		hp->b_forw = hp->b_back = (struct buf *)hp;

	/* Free-list headers are likewise self-referencing rings. */
	for (dp = bfreelist; dp < &bfreelist[BQUEUES]; dp++) {
		dp->b_forw = dp->b_back = dp->av_forw = dp->av_back = dp;
		dp->b_flags = B_HEAD;
	}
	/*
	 * Distribute bufpages over the nbuf headers; the first "residual"
	 * headers each receive one extra page so no pages are left over.
	 */
	base = bufpages / nbuf;
	residual = bufpages % nbuf;
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bp->b_dev = NODEV;
		bp->b_bcount = 0;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = 0;
		bp->b_validoff = 0;
		bp->b_validend = 0;
		/* Each header owns a fixed MAXBSIZE slice of buffer space. */
		bp->b_un.b_addr = buffers + i * MAXBSIZE;
		if (i < residual)
			bp->b_bufsize = (base + 1) * CLBYTES;
		else
			bp->b_bufsize = base * CLBYTES;
		binshash(bp, &bfreelist[BQ_AGE]);
		bp->b_flags = B_INVAL;
		/* Headers that got no memory go on the EMPTY queue instead. */
		dp = bp->b_bufsize ? &bfreelist[BQ_AGE] : &bfreelist[BQ_EMPTY];
		binsheadfree(bp, dp);
	}
}

/*
 * Find the block in the buffer pool.
 * If the buffer is not present, allocate a new buffer and load
 * its contents according to the filesystem fill routine.
 */
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size);
	/* Cached data (completed or delayed-write) needs no I/O. */
	if (bp->b_flags & (B_DONE | B_DELWRI)) {
		trace(TR_BREADHIT, pack(vp, size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	/* Record the credentials the read is performed under. */
	if (bp->b_rcred == NOCRED && cred != NOCRED) {
		crhold(cred);
		bp->b_rcred = cred;
	}
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp, size), blkno);
	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
	/* Synchronous: wait for the I/O and return its error status. */
	return (biowait(bp));
}

/*
 * Operates like bread, but also starts I/O on the N specified
 * read-ahead blocks.
 */
breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno[]; int rabsize[];
	int num;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp, *rabp;
	register int i;

	bp = NULL;
	/*
	 * If the block is not memory resident,
	 * allocate a buffer and start I/O.
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size);
		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breadn");
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp, size), blkno);
			p->p_stats->p_ru.ru_inblock++; /* pay for read */
		} else {
			trace(TR_BREADHIT, pack(vp, size), blkno);
		}
	}

	/*
	 * If there's read-ahead block(s), start I/O
	 * on them also (as above).  Read-ahead I/O is asynchronous;
	 * nobody waits for it here.
	 */
	for (i = 0; i < num; i++) {
		if (incore(vp, rablkno[i]))
			continue;
		rabp = getblk(vp, rablkno[i], rabsize[i]);
		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
			/* Already valid; release it rather than re-read. */
			brelse(rabp);
			trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
		} else {
			rabp->b_flags |= B_ASYNC | B_READ;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
			p->p_stats->p_ru.ru_inblock++; /* pay in advance */
		}
	}

	/*
	 * If block was memory resident, let bread get it.
	 * If block was not memory resident, the read was
	 * started above, so just wait for the read to complete.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, cred, bpp));
	return (biowait(bp));
}

/*
 * Synchronous write.
 * Release buffer on completion.
 */
bwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */
	register int flag;
	int s, error = 0;

	/*
	 * Snapshot the flags; B_DELWRI and friends are cleared before
	 * the I/O is started, so decisions below use the saved copy.
	 */
	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if (flag & B_ASYNC) {
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
	}
	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	/* Account for a pending output on the vnode at splbio. */
	s = splbio();
	bp->b_vp->v_numoutput++;
	splx(s);
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await I/O completion.
	 * If the write was "delayed", then we put the buffer on
	 * the queue of blocks awaiting I/O completion status.
	 */
	if ((flag & B_ASYNC) == 0) {
		error = biowait(bp);
		if ((flag&B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
		brelse(bp);
	} else if (flag & B_DELWRI) {
		/* Age delayed-write buffers so they are reused sooner. */
		s = splbio();
		bp->b_flags |= B_AGE;
		splx(s);
	}
	return (error);
}

/*
 * VOP bwrite entry point: unwrap the argument structure and
 * do an ordinary synchronous write.
 */
int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return bwrite (ap->a_bp);
}


/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */

	/* First delay of this buffer: mark dirty and charge the process. */
	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
	}
	/*
	 * If this is a tape drive, the write must be initiated.
	 * (The B_TAPE ioctl probe returns 0 for tape devices.)
	 */
	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
		bawrite(bp);
	} else {
		bp->b_flags |= (B_DONE | B_DELWRI);
		brelse(bp);
	}
}

/*
 * Asynchronous write.
 * Start I/O on a buffer, but do not wait for it to complete.
 * The buffer is released when the I/O completes.
 */
bawrite(bp)
	register struct buf *bp;
{

	/*
	 * Setting the ASYNC flag causes bwrite to return
	 * after starting the I/O.
	 */
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release a buffer.
 * Even if the buffer is dirty, no I/O is started.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	int s;

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	/*
	 * If a process is waiting for the buffer, or
	 * is waiting for a free buffer, awaken it.
	 */
	if (bp->b_flags & B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags & B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	/*
	 * Retry I/O for locked buffers rather than invalidating them.
	 */
	s = splbio();
	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
		bp->b_flags &= ~B_ERROR;
	/*
	 * Disassociate buffers that are no longer valid.
	 */
	if (bp->b_flags & (B_NOCACHE | B_ERROR))
		bp->b_flags |= B_INVAL;
	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
		if (bp->b_vp)
			brelvp(bp);
		bp->b_flags &= ~B_DELWRI;
	}
	/*
	 * Stick the buffer back on a free list.
	 */
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bfreelist[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		/* Valid data: tail-insert on LOCKED, AGE, or LRU queue. */
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	/* Buffer is no longer busy; clear the transient state bits. */
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
	splx(s);
}

/*
 * Check to see if a block is currently memory resident.
 */
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	/* Walk the hash chain for (vp, blkno); ignore invalidated buffers. */
	dp = BUFHASH(vp, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

/*
 * Check to see if a block is currently memory resident.
 * If it is resident, return it. If it is not resident,
 * allocate a new buffer and assign it to the block.
 */
struct buf *
getblk(vp, blkno, size)
	register struct vnode *vp;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * Search the cache for the block. If the buffer is found,
	 * but it is currently locked, the we must wait for it to
	 * become available.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp ||
		    (bp->b_flags & B_INVAL))
			continue;
		s = splbio();
		if (bp->b_flags & B_BUSY) {
			/* In use: sleep until released, then rescan. */
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO + 1);
			splx(s);
			goto loop;
		}
		bremfree(bp);
		bp->b_flags |= B_BUSY;
		splx(s);
		if (bp->b_bcount != size) {
			/* Cached at a different size: flush and retry. */
			printf("getblk: stray size");
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	/* Not cached: claim a fresh buffer and bind it to (vp, blkno). */
	bp = getnewbuf();
	bremhash(bp);
	bgetvp(vp, bp);
	bp->b_bcount = 0;
	bp->b_lblkno = blkno;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Allocate a buffer.
 * The caller will assign it to a block.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
	/* Take a free buffer and mark it invalid (no block identity). */
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	bp->b_bcount = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, flist);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Expand or contract the actual memory allocated to a buffer.
 * If no memory is available, release buffer and take error exit.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take, s;

	/* Buffer memory is managed in CLBYTES-sized units. */
	sizealloc = roundup(size, CLBYTES);
	/*
	 * Buffer size does not change
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		ep = bfreelist[BQ_EMPTY].av_forw;
		if (ep == &bfreelist[BQ_EMPTY])
			goto out;
		s = splbio();
		bremfree(ep);
		ep->b_flags |= B_BUSY;
		splx(s);
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
		    (int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed. Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		bp = getnewbuf();
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
		/* Move "take" bytes from the tail of bp into tp. */
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
		    &tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		if (bp->b_bufsize <= 0) {
			/* Drained completely: retire header to BQ_EMPTY. */
			bremhash(bp);
			binshash(bp, &bfreelist[BQ_EMPTY]);
			bp->b_dev = NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
	return (1);
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	register struct ucred *cred;
	int s;

#ifdef LFS
	lfs_flush();
#endif
loop:
	s = splbio();
	/* Scan queues from BQ_AGE downward for a non-empty one. */
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO + 1);
		splx(s);
		goto loop;
	}
	bp = dp->av_forw;
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	splx(s);
	if (bp->b_flags & B_DELWRI) {
		/* Dirty: push it out asynchronously and look again. */
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	/* Drop any credentials held from previous I/O. */
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	bp->b_flags = B_BUSY;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Wait for I/O to complete.
 *
 * Extract and return any errors associated with the I/O.
 * If the error flag is set, but no specific error is
 * given, return EIO.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	/* Sleep at splbio until biodone marks the buffer B_DONE. */
	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so. Otherwise, awaken waiting processes.
 */
void
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	/* Completed write: credit the vnode's output count. */
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
	if (bp->b_flags & B_CALL) {
		/* Caller asked for a completion callback instead of wakeup. */
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}