1 /*- 2 * Copyright (c) 1982, 1986, 1989 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * This module is believed to contain source code proprietary to AT&T. 6 * Use and redistribution is subject to the Berkeley Software License 7 * Agreement and your Software Agreement with AT&T (Western Electric). 8 * 9 * @(#)vfs_bio.c 7.44 (Berkeley) 12/31/91 10 */ 11 12 #include <sys/param.h> 13 #include <sys/proc.h> 14 #include <sys/buf.h> 15 #include <sys/vnode.h> 16 #include <sys/specdev.h> 17 #include <sys/mount.h> 18 #include <sys/trace.h> 19 #include <sys/resourcevar.h> 20 21 /* 22 * Initialize buffers and hash links for buffers. 23 */ 24 void 25 bufinit() 26 { 27 register int i; 28 register struct buf *bp, *dp; 29 register struct bufhd *hp; 30 int base, residual; 31 32 for (hp = bufhash, i = 0; i < BUFHSZ; i++, hp++) 33 hp->b_forw = hp->b_back = (struct buf *)hp; 34 35 for (dp = bfreelist; dp < &bfreelist[BQUEUES]; dp++) { 36 dp->b_forw = dp->b_back = dp->av_forw = dp->av_back = dp; 37 dp->b_flags = B_HEAD; 38 } 39 base = bufpages / nbuf; 40 residual = bufpages % nbuf; 41 for (i = 0; i < nbuf; i++) { 42 bp = &buf[i]; 43 bp->b_dev = NODEV; 44 bp->b_bcount = 0; 45 bp->b_rcred = NOCRED; 46 bp->b_wcred = NOCRED; 47 bp->b_dirtyoff = 0; 48 bp->b_dirtyend = 0; 49 bp->b_un.b_addr = buffers + i * MAXBSIZE; 50 if (i < residual) 51 bp->b_bufsize = (base + 1) * CLBYTES; 52 else 53 bp->b_bufsize = base * CLBYTES; 54 binshash(bp, &bfreelist[BQ_AGE]); 55 bp->b_flags = B_BUSY|B_INVAL; 56 brelse(bp); 57 } 58 } 59 60 /* 61 * Find the block in the buffer pool. 62 * If the buffer is not present, allocate a new buffer and load 63 * its contents according to the filesystem fill routine. 64 */ 65 bread(vp, blkno, size, cred, bpp) 66 struct vnode *vp; 67 daddr_t blkno; 68 int size; 69 struct ucred *cred; 70 struct buf **bpp; 71 { 72 struct proc *p = curproc; /* XXX */ 73 register struct buf *bp; 74 75 if (size == 0) 76 panic("bread: size 0"); 77 *bpp = bp = getblk(vp, blkno, size); 78 if (bp->b_flags & (B_DONE | B_DELWRI)) { 79 trace(TR_BREADHIT, pack(vp, size), blkno); 80 return (0); 81 } 82 bp->b_flags |= B_READ; 83 if (bp->b_bcount > bp->b_bufsize) 84 panic("bread"); 85 if (bp->b_rcred == NOCRED && cred != NOCRED) { 86 crhold(cred); 87 bp->b_rcred = cred; 88 } 89 VOP_STRATEGY(bp); 90 trace(TR_BREADMISS, pack(vp, size), blkno); 91 p->p_stats->p_ru.ru_inblock++; /* pay for read */ 92 return (biowait(bp)); 93 } 94 95 /* 96 * Operates like bread, but also starts I/O on the specified 97 * read-ahead block. 98 */ 99 breada(vp, blkno, size, rablkno, rabsize, cred, bpp) 100 struct vnode *vp; 101 daddr_t blkno; int size; 102 daddr_t rablkno; int rabsize; 103 struct ucred *cred; 104 struct buf **bpp; 105 { 106 struct proc *p = curproc; /* XXX */ 107 register struct buf *bp, *rabp; 108 109 bp = NULL; 110 /* 111 * If the block is not memory resident, 112 * allocate a buffer and start I/O. 113 */ 114 if (!incore(vp, blkno)) { 115 *bpp = bp = getblk(vp, blkno, size); 116 if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) { 117 bp->b_flags |= B_READ; 118 if (bp->b_bcount > bp->b_bufsize) 119 panic("breada"); 120 if (bp->b_rcred == NOCRED && cred != NOCRED) { 121 crhold(cred); 122 bp->b_rcred = cred; 123 } 124 VOP_STRATEGY(bp); 125 trace(TR_BREADMISS, pack(vp, size), blkno); 126 p->p_stats->p_ru.ru_inblock++; /* pay for read */ 127 } else 128 trace(TR_BREADHIT, pack(vp, size), blkno); 129 } 130 131 /* 132 * If there is a read-ahead block, start I/O on it too. 133 */ 134 if (!incore(vp, rablkno)) { 135 rabp = getblk(vp, rablkno, rabsize); 136 if (rabp->b_flags & (B_DONE | B_DELWRI)) { 137 brelse(rabp); 138 trace(TR_BREADHITRA, pack(vp, rabsize), rablkno); 139 } else { 140 rabp->b_flags |= B_ASYNC | B_READ; 141 if (rabp->b_bcount > rabp->b_bufsize) 142 panic("breadrabp"); 143 if (rabp->b_rcred == NOCRED && cred != NOCRED) { 144 crhold(cred); 145 rabp->b_rcred = cred; 146 } 147 VOP_STRATEGY(rabp); 148 trace(TR_BREADMISSRA, pack(vp, rabsize), rablkno); 149 p->p_stats->p_ru.ru_inblock++; /* pay in advance */ 150 } 151 } 152 153 /* 154 * If block was memory resident, let bread get it. 155 * If block was not memory resident, the read was 156 * started above, so just wait for the read to complete. 157 */ 158 if (bp == NULL) 159 return (bread(vp, blkno, size, cred, bpp)); 160 return (biowait(bp)); 161 } 162 163 /* 164 * Synchronous write. 165 * Release buffer on completion. 166 */ 167 bwrite(bp) 168 register struct buf *bp; 169 { 170 struct proc *p = curproc; /* XXX */ 171 register int flag; 172 int s, error; 173 174 flag = bp->b_flags; 175 bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); 176 if (flag & B_ASYNC) { 177 if ((flag & B_DELWRI) == 0) 178 p->p_stats->p_ru.ru_oublock++; /* no one paid yet */ 179 else 180 reassignbuf(bp, bp->b_vp); 181 } 182 trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno); 183 if (bp->b_bcount > bp->b_bufsize) 184 panic("bwrite"); 185 s = splbio(); 186 bp->b_vp->v_numoutput++; 187 splx(s); 188 VOP_STRATEGY(bp); 189 190 /* 191 * If the write was synchronous, then await I/O completion. 192 * If the write was "delayed", then we put the buffer on 193 * the queue of blocks awaiting I/O completion status. 194 */ 195 if ((flag & B_ASYNC) == 0) { 196 error = biowait(bp); 197 if ((flag&B_DELWRI) == 0) 198 p->p_stats->p_ru.ru_oublock++; /* no one paid yet */ 199 else 200 reassignbuf(bp, bp->b_vp); 201 brelse(bp); 202 } else if (flag & B_DELWRI) { 203 bp->b_flags |= B_AGE; 204 error = 0; 205 } 206 return (error); 207 } 208 209 /* 210 * Delayed write. 211 * 212 * The buffer is marked dirty, but is not queued for I/O. 213 * This routine should be used when the buffer is expected 214 * to be modified again soon, typically a small write that 215 * partially fills a buffer. 216 * 217 * NB: magnetic tapes cannot be delayed; they must be 218 * written in the order that the writes are requested. 219 */ 220 bdwrite(bp) 221 register struct buf *bp; 222 { 223 struct proc *p = curproc; /* XXX */ 224 225 if ((bp->b_flags & B_DELWRI) == 0) { 226 bp->b_flags |= B_DELWRI; 227 reassignbuf(bp, bp->b_vp); 228 p->p_stats->p_ru.ru_oublock++; /* no one paid yet */ 229 } 230 /* 231 * If this is a tape drive, the write must be initiated. 232 */ 233 if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) { 234 bawrite(bp); 235 } else { 236 bp->b_flags |= (B_DONE | B_DELWRI); 237 brelse(bp); 238 } 239 } 240 241 /* 242 * Asynchronous write. 243 * Start I/O on a buffer, but do not wait for it to complete. 244 * The buffer is released when the I/O completes. 245 */ 246 bawrite(bp) 247 register struct buf *bp; 248 { 249 250 /* 251 * Setting the ASYNC flag causes bwrite to return 252 * after starting the I/O. 253 */ 254 bp->b_flags |= B_ASYNC; 255 (void) bwrite(bp); 256 } 257 258 /* 259 * Release a buffer. 260 * Even if the buffer is dirty, no I/O is started. 261 */ 262 brelse(bp) 263 register struct buf *bp; 264 { 265 register struct buf *flist; 266 int s; 267 268 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno); 269 /* 270 * If a process is waiting for the buffer, or 271 * is waiting for a free buffer, awaken it. 272 */ 273 if (bp->b_flags & B_WANTED) 274 wakeup((caddr_t)bp); 275 if (bfreelist[0].b_flags & B_WANTED) { 276 bfreelist[0].b_flags &= ~B_WANTED; 277 wakeup((caddr_t)bfreelist); 278 } 279 /* 280 * Retry I/O for locked buffers rather than invalidating them. 281 */ 282 if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED)) 283 bp->b_flags &= ~B_ERROR; 284 /* 285 * Disassociate buffers that are no longer valid. 286 */ 287 if (bp->b_flags & (B_NOCACHE | B_ERROR)) 288 bp->b_flags |= B_INVAL; 289 if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) { 290 if (bp->b_vp) 291 brelvp(bp); 292 bp->b_flags &= ~B_DELWRI; 293 } 294 /* 295 * Stick the buffer back on a free list. 296 */ 297 s = splbio(); 298 if (bp->b_bufsize <= 0) { 299 /* block has no buffer ... put at front of unused buffer list */ 300 flist = &bfreelist[BQ_EMPTY]; 301 binsheadfree(bp, flist); 302 } else if (bp->b_flags & (B_ERROR | B_INVAL)) { 303 /* block has no info ... put at front of most free list */ 304 flist = &bfreelist[BQ_AGE]; 305 binsheadfree(bp, flist); 306 } else { 307 if (bp->b_flags & B_LOCKED) 308 flist = &bfreelist[BQ_LOCKED]; 309 else if (bp->b_flags & B_AGE) 310 flist = &bfreelist[BQ_AGE]; 311 else 312 flist = &bfreelist[BQ_LRU]; 313 binstailfree(bp, flist); 314 } 315 bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE); 316 splx(s); 317 } 318 319 /* 320 * Check to see if a block is currently memory resident. 321 */ 322 incore(vp, blkno) 323 struct vnode *vp; 324 daddr_t blkno; 325 { 326 register struct buf *bp; 327 register struct buf *dp; 328 329 dp = BUFHASH(vp, blkno); 330 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) 331 if (bp->b_lblkno == blkno && bp->b_vp == vp && 332 (bp->b_flags & B_INVAL) == 0) 333 return (1); 334 return (0); 335 } 336 337 /* 338 * Check to see if a block is currently memory resident. 339 * If it is resident, return it. If it is not resident, 340 * allocate a new buffer and assign it to the block. 341 */ 342 struct buf * 343 getblk(vp, blkno, size) 344 register struct vnode *vp; 345 daddr_t blkno; 346 int size; 347 { 348 register struct buf *bp, *dp; 349 int s; 350 351 if (size > MAXBSIZE) 352 panic("getblk: size too big"); 353 /* 354 * Search the cache for the block. If the buffer is found, 355 * but it is currently locked, the we must wait for it to 356 * become available. 357 */ 358 dp = BUFHASH(vp, blkno); 359 loop: 360 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) { 361 if (bp->b_lblkno != blkno || bp->b_vp != vp || 362 (bp->b_flags & B_INVAL)) 363 continue; 364 s = splbio(); 365 if (bp->b_flags & B_BUSY) { 366 bp->b_flags |= B_WANTED; 367 sleep((caddr_t)bp, PRIBIO + 1); 368 splx(s); 369 goto loop; 370 } 371 bremfree(bp); 372 bp->b_flags |= B_BUSY; 373 splx(s); 374 if (bp->b_bcount != size) { 375 printf("getblk: stray size"); 376 bp->b_flags |= B_INVAL; 377 bwrite(bp); 378 goto loop; 379 } 380 bp->b_flags |= B_CACHE; 381 return (bp); 382 } 383 bp = getnewbuf(); 384 bremhash(bp); 385 bgetvp(vp, bp); 386 bp->b_bcount = 0; 387 bp->b_lblkno = blkno; 388 bp->b_blkno = blkno; 389 bp->b_error = 0; 390 bp->b_resid = 0; 391 binshash(bp, dp); 392 allocbuf(bp, size); 393 return (bp); 394 } 395 396 /* 397 * Allocate a buffer. 398 * The caller will assign it to a block. 399 */ 400 struct buf * 401 geteblk(size) 402 int size; 403 { 404 register struct buf *bp, *flist; 405 406 if (size > MAXBSIZE) 407 panic("geteblk: size too big"); 408 bp = getnewbuf(); 409 bp->b_flags |= B_INVAL; 410 bremhash(bp); 411 flist = &bfreelist[BQ_AGE]; 412 bp->b_bcount = 0; 413 bp->b_error = 0; 414 bp->b_resid = 0; 415 binshash(bp, flist); 416 allocbuf(bp, size); 417 return (bp); 418 } 419 420 /* 421 * Expand or contract the actual memory allocated to a buffer. 422 * If no memory is available, release buffer and take error exit. 423 */ 424 allocbuf(tp, size) 425 register struct buf *tp; 426 int size; 427 { 428 register struct buf *bp, *ep; 429 int sizealloc, take, s; 430 431 sizealloc = roundup(size, CLBYTES); 432 /* 433 * Buffer size does not change 434 */ 435 if (sizealloc == tp->b_bufsize) 436 goto out; 437 /* 438 * Buffer size is shrinking. 439 * Place excess space in a buffer header taken from the 440 * BQ_EMPTY buffer list and placed on the "most free" list. 441 * If no extra buffer headers are available, leave the 442 * extra space in the present buffer. 443 */ 444 if (sizealloc < tp->b_bufsize) { 445 ep = bfreelist[BQ_EMPTY].av_forw; 446 if (ep == &bfreelist[BQ_EMPTY]) 447 goto out; 448 s = splbio(); 449 bremfree(ep); 450 ep->b_flags |= B_BUSY; 451 splx(s); 452 pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr, 453 (int)tp->b_bufsize - sizealloc); 454 ep->b_bufsize = tp->b_bufsize - sizealloc; 455 tp->b_bufsize = sizealloc; 456 ep->b_flags |= B_INVAL; 457 ep->b_bcount = 0; 458 brelse(ep); 459 goto out; 460 } 461 /* 462 * More buffer space is needed. Get it out of buffers on 463 * the "most free" list, placing the empty headers on the 464 * BQ_EMPTY buffer header list. 465 */ 466 while (tp->b_bufsize < sizealloc) { 467 take = sizealloc - tp->b_bufsize; 468 bp = getnewbuf(); 469 if (take >= bp->b_bufsize) 470 take = bp->b_bufsize; 471 pagemove(&bp->b_un.b_addr[bp->b_bufsize - take], 472 &tp->b_un.b_addr[tp->b_bufsize], take); 473 tp->b_bufsize += take; 474 bp->b_bufsize = bp->b_bufsize - take; 475 if (bp->b_bcount > bp->b_bufsize) 476 bp->b_bcount = bp->b_bufsize; 477 if (bp->b_bufsize <= 0) { 478 bremhash(bp); 479 binshash(bp, &bfreelist[BQ_EMPTY]); 480 bp->b_dev = NODEV; 481 bp->b_error = 0; 482 bp->b_flags |= B_INVAL; 483 } 484 brelse(bp); 485 } 486 out: 487 tp->b_bcount = size; 488 return (1); 489 } 490 491 /* 492 * Find a buffer which is available for use. 493 * Select something from a free list. 494 * Preference is to AGE list, then LRU list. 495 */ 496 struct buf * 497 getnewbuf() 498 { 499 register struct buf *bp, *dp; 500 register struct ucred *cred; 501 int s; 502 503 #ifdef LFS 504 lfs_flush(); 505 #endif 506 loop: 507 s = splbio(); 508 for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--) 509 if (dp->av_forw != dp) 510 break; 511 if (dp == bfreelist) { /* no free blocks */ 512 dp->b_flags |= B_WANTED; 513 sleep((caddr_t)dp, PRIBIO + 1); 514 splx(s); 515 goto loop; 516 } 517 bp = dp->av_forw; 518 bremfree(bp); 519 bp->b_flags |= B_BUSY; 520 splx(s); 521 if (bp->b_flags & B_DELWRI) { 522 (void) bawrite(bp); 523 goto loop; 524 } 525 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno); 526 if (bp->b_vp) 527 brelvp(bp); 528 if (bp->b_rcred != NOCRED) { 529 cred = bp->b_rcred; 530 bp->b_rcred = NOCRED; 531 crfree(cred); 532 } 533 if (bp->b_wcred != NOCRED) { 534 cred = bp->b_wcred; 535 bp->b_wcred = NOCRED; 536 crfree(cred); 537 } 538 bp->b_flags = B_BUSY; 539 bp->b_dirtyoff = bp->b_dirtyend = 0; 540 return (bp); 541 } 542 543 /* 544 * Wait for I/O to complete. 545 * 546 * Extract and return any errors associated with the I/O. 547 * If the error flag is set, but no specific error is 548 * given, return EIO. 549 */ 550 biowait(bp) 551 register struct buf *bp; 552 { 553 int s; 554 555 s = splbio(); 556 while ((bp->b_flags & B_DONE) == 0) 557 sleep((caddr_t)bp, PRIBIO); 558 splx(s); 559 if ((bp->b_flags & B_ERROR) == 0) 560 return (0); 561 if (bp->b_error) 562 return (bp->b_error); 563 return (EIO); 564 } 565 566 /* 567 * Mark I/O complete on a buffer. 568 * 569 * If a callback has been requested, e.g. the pageout 570 * daemon, do so. Otherwise, awaken waiting processes. 571 */ 572 void 573 biodone(bp) 574 register struct buf *bp; 575 { 576 577 if (bp->b_flags & B_DONE) 578 panic("dup biodone"); 579 bp->b_flags |= B_DONE; 580 if ((bp->b_flags & B_READ) == 0) 581 vwakeup(bp); 582 if (bp->b_flags & B_CALL) { 583 bp->b_flags &= ~B_CALL; 584 (*bp->b_iodone)(bp); 585 return; 586 } 587 if (bp->b_flags & B_ASYNC) 588 brelse(bp); 589 else { 590 bp->b_flags &= ~B_WANTED; 591 wakeup((caddr_t)bp); 592 } 593 } 594