1 /*- 2 * Copyright (c) 1982, 1986, 1989 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * This module is believed to contain source code proprietary to AT&T. 6 * Use and redistribution is subject to the Berkeley Software License 7 * Agreement and your Software Agreement with AT&T (Western Electric). 8 * 9 * @(#)vfs_cluster.c 7.49 (Berkeley) 06/23/92 10 */ 11 12 #include <sys/param.h> 13 #include <sys/proc.h> 14 #include <sys/buf.h> 15 #include <sys/vnode.h> 16 #include <sys/specdev.h> 17 #include <sys/mount.h> 18 #include <sys/trace.h> 19 #include <sys/resourcevar.h> 20 21 /* 22 * Initialize buffers and hash links for buffers. 23 */ 24 void 25 bufinit() 26 { 27 register int i; 28 register struct buf *bp, *dp; 29 register struct bufhd *hp; 30 int base, residual; 31 32 for (hp = bufhash, i = 0; i < BUFHSZ; i++, hp++) 33 hp->b_forw = hp->b_back = (struct buf *)hp; 34 35 for (dp = bfreelist; dp < &bfreelist[BQUEUES]; dp++) { 36 dp->b_forw = dp->b_back = dp->av_forw = dp->av_back = dp; 37 dp->b_flags = B_HEAD; 38 } 39 base = bufpages / nbuf; 40 residual = bufpages % nbuf; 41 for (i = 0; i < nbuf; i++) { 42 bp = &buf[i]; 43 bp->b_dev = NODEV; 44 bp->b_bcount = 0; 45 bp->b_rcred = NOCRED; 46 bp->b_wcred = NOCRED; 47 bp->b_dirtyoff = 0; 48 bp->b_dirtyend = 0; 49 bp->b_validoff = 0; 50 bp->b_validend = 0; 51 bp->b_un.b_addr = buffers + i * MAXBSIZE; 52 if (i < residual) 53 bp->b_bufsize = (base + 1) * CLBYTES; 54 else 55 bp->b_bufsize = base * CLBYTES; 56 binshash(bp, &bfreelist[BQ_AGE]); 57 bp->b_flags = B_INVAL; 58 dp = bp->b_bufsize ? &bfreelist[BQ_AGE] : &bfreelist[BQ_EMPTY]; 59 binsheadfree(bp, dp); 60 } 61 } 62 63 /* 64 * Find the block in the buffer pool. 65 * If the buffer is not present, allocate a new buffer and load 66 * its contents according to the filesystem fill routine. 67 */ 68 bread(vp, blkno, size, cred, bpp) 69 struct vnode *vp; 70 daddr_t blkno; 71 int size; 72 struct ucred *cred; 73 struct buf **bpp; 74 { 75 USES_VOP_STRATEGY; 76 struct proc *p = curproc; /* XXX */ 77 register struct buf *bp; 78 79 if (size == 0) 80 panic("bread: size 0"); 81 *bpp = bp = getblk(vp, blkno, size); 82 if (bp->b_flags & (B_DONE | B_DELWRI)) { 83 trace(TR_BREADHIT, pack(vp, size), blkno); 84 return (0); 85 } 86 bp->b_flags |= B_READ; 87 if (bp->b_bcount > bp->b_bufsize) 88 panic("bread"); 89 if (bp->b_rcred == NOCRED && cred != NOCRED) { 90 crhold(cred); 91 bp->b_rcred = cred; 92 } 93 VOP_STRATEGY(bp); 94 trace(TR_BREADMISS, pack(vp, size), blkno); 95 p->p_stats->p_ru.ru_inblock++; /* pay for read */ 96 return (biowait(bp)); 97 } 98 99 /* 100 * Operates like bread, but also starts I/O on the N specified 101 * read-ahead blocks. 102 */ 103 breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp) 104 struct vnode *vp; 105 daddr_t blkno; int size; 106 daddr_t rablkno[]; int rabsize[]; 107 int num; 108 struct ucred *cred; 109 struct buf **bpp; 110 { 111 USES_VOP_STRATEGY; 112 struct proc *p = curproc; /* XXX */ 113 register struct buf *bp, *rabp; 114 register int i; 115 116 bp = NULL; 117 /* 118 * If the block is not memory resident, 119 * allocate a buffer and start I/O. 120 */ 121 if (!incore(vp, blkno)) { 122 *bpp = bp = getblk(vp, blkno, size); 123 if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) { 124 bp->b_flags |= B_READ; 125 if (bp->b_bcount > bp->b_bufsize) 126 panic("breadn"); 127 if (bp->b_rcred == NOCRED && cred != NOCRED) { 128 crhold(cred); 129 bp->b_rcred = cred; 130 } 131 VOP_STRATEGY(bp); 132 trace(TR_BREADMISS, pack(vp, size), blkno); 133 p->p_stats->p_ru.ru_inblock++; /* pay for read */ 134 } else { 135 trace(TR_BREADHIT, pack(vp, size), blkno); 136 } 137 } 138 139 /* 140 * If there's read-ahead block(s), start I/O 141 * on them also (as above). 142 */ 143 for (i = 0; i < num; i++) { 144 if (incore(vp, rablkno[i])) 145 continue; 146 rabp = getblk(vp, rablkno[i], rabsize[i]); 147 if (rabp->b_flags & (B_DONE | B_DELWRI)) { 148 brelse(rabp); 149 trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]); 150 } else { 151 rabp->b_flags |= B_ASYNC | B_READ; 152 if (rabp->b_bcount > rabp->b_bufsize) 153 panic("breadrabp"); 154 if (rabp->b_rcred == NOCRED && cred != NOCRED) { 155 crhold(cred); 156 rabp->b_rcred = cred; 157 } 158 VOP_STRATEGY(rabp); 159 trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]); 160 p->p_stats->p_ru.ru_inblock++; /* pay in advance */ 161 } 162 } 163 164 /* 165 * If block was memory resident, let bread get it. 166 * If block was not memory resident, the read was 167 * started above, so just wait for the read to complete. 168 */ 169 if (bp == NULL) 170 return (bread(vp, blkno, size, cred, bpp)); 171 return (biowait(bp)); 172 } 173 174 /* 175 * Synchronous write. 176 * Release buffer on completion. 177 */ 178 bwrite(bp) 179 register struct buf *bp; 180 { 181 USES_VOP_STRATEGY; 182 struct proc *p = curproc; /* XXX */ 183 register int flag; 184 int s, error = 0; 185 186 flag = bp->b_flags; 187 bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); 188 if (flag & B_ASYNC) { 189 if ((flag & B_DELWRI) == 0) 190 p->p_stats->p_ru.ru_oublock++; /* no one paid yet */ 191 else 192 reassignbuf(bp, bp->b_vp); 193 } 194 trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno); 195 if (bp->b_bcount > bp->b_bufsize) 196 panic("bwrite"); 197 s = splbio(); 198 bp->b_vp->v_numoutput++; 199 splx(s); 200 VOP_STRATEGY(bp); 201 202 /* 203 * If the write was synchronous, then await I/O completion. 204 * If the write was "delayed", then we put the buffer on 205 * the queue of blocks awaiting I/O completion status. 206 */ 207 if ((flag & B_ASYNC) == 0) { 208 error = biowait(bp); 209 if ((flag&B_DELWRI) == 0) 210 p->p_stats->p_ru.ru_oublock++; /* no one paid yet */ 211 else 212 reassignbuf(bp, bp->b_vp); 213 brelse(bp); 214 } else if (flag & B_DELWRI) { 215 s = splbio(); 216 bp->b_flags |= B_AGE; 217 splx(s); 218 } 219 return (error); 220 } 221 222 int 223 vn_bwrite(ap) 224 struct vop_bwrite_args *ap; 225 { 226 return bwrite (ap->a_bp); 227 } 228 229 230 /* 231 * Delayed write. 232 * 233 * The buffer is marked dirty, but is not queued for I/O. 234 * This routine should be used when the buffer is expected 235 * to be modified again soon, typically a small write that 236 * partially fills a buffer. 237 * 238 * NB: magnetic tapes cannot be delayed; they must be 239 * written in the order that the writes are requested. 240 */ 241 bdwrite(bp) 242 register struct buf *bp; 243 { 244 USES_VOP_IOCTL; 245 struct proc *p = curproc; /* XXX */ 246 247 if ((bp->b_flags & B_DELWRI) == 0) { 248 bp->b_flags |= B_DELWRI; 249 reassignbuf(bp, bp->b_vp); 250 p->p_stats->p_ru.ru_oublock++; /* no one paid yet */ 251 } 252 /* 253 * If this is a tape drive, the write must be initiated. 254 */ 255 if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) { 256 bawrite(bp); 257 } else { 258 bp->b_flags |= (B_DONE | B_DELWRI); 259 brelse(bp); 260 } 261 } 262 263 /* 264 * Asynchronous write. 265 * Start I/O on a buffer, but do not wait for it to complete. 266 * The buffer is released when the I/O completes. 267 */ 268 bawrite(bp) 269 register struct buf *bp; 270 { 271 272 /* 273 * Setting the ASYNC flag causes bwrite to return 274 * after starting the I/O. 275 */ 276 bp->b_flags |= B_ASYNC; 277 (void) bwrite(bp); 278 } 279 280 /* 281 * Release a buffer. 282 * Even if the buffer is dirty, no I/O is started. 283 */ 284 brelse(bp) 285 register struct buf *bp; 286 { 287 register struct buf *flist; 288 int s; 289 290 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno); 291 /* 292 * If a process is waiting for the buffer, or 293 * is waiting for a free buffer, awaken it. 294 */ 295 if (bp->b_flags & B_WANTED) 296 wakeup((caddr_t)bp); 297 if (bfreelist[0].b_flags & B_WANTED) { 298 bfreelist[0].b_flags &= ~B_WANTED; 299 wakeup((caddr_t)bfreelist); 300 } 301 /* 302 * Retry I/O for locked buffers rather than invalidating them. 303 */ 304 s = splbio(); 305 if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED)) 306 bp->b_flags &= ~B_ERROR; 307 /* 308 * Disassociate buffers that are no longer valid. 309 */ 310 if (bp->b_flags & (B_NOCACHE | B_ERROR)) 311 bp->b_flags |= B_INVAL; 312 if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) { 313 if (bp->b_vp) 314 brelvp(bp); 315 bp->b_flags &= ~B_DELWRI; 316 } 317 /* 318 * Stick the buffer back on a free list. 319 */ 320 if (bp->b_bufsize <= 0) { 321 /* block has no buffer ... put at front of unused buffer list */ 322 flist = &bfreelist[BQ_EMPTY]; 323 binsheadfree(bp, flist); 324 } else if (bp->b_flags & (B_ERROR | B_INVAL)) { 325 /* block has no info ... put at front of most free list */ 326 flist = &bfreelist[BQ_AGE]; 327 binsheadfree(bp, flist); 328 } else { 329 if (bp->b_flags & B_LOCKED) 330 flist = &bfreelist[BQ_LOCKED]; 331 else if (bp->b_flags & B_AGE) 332 flist = &bfreelist[BQ_AGE]; 333 else 334 flist = &bfreelist[BQ_LRU]; 335 binstailfree(bp, flist); 336 } 337 bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE); 338 splx(s); 339 } 340 341 /* 342 * Check to see if a block is currently memory resident. 343 */ 344 incore(vp, blkno) 345 struct vnode *vp; 346 daddr_t blkno; 347 { 348 register struct buf *bp; 349 register struct buf *dp; 350 351 dp = BUFHASH(vp, blkno); 352 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) 353 if (bp->b_lblkno == blkno && bp->b_vp == vp && 354 (bp->b_flags & B_INVAL) == 0) 355 return (1); 356 return (0); 357 } 358 359 /* 360 * Check to see if a block is currently memory resident. 361 * If it is resident, return it. If it is not resident, 362 * allocate a new buffer and assign it to the block. 363 */ 364 struct buf * 365 getblk(vp, blkno, size) 366 register struct vnode *vp; 367 daddr_t blkno; 368 int size; 369 { 370 register struct buf *bp, *dp; 371 int s; 372 373 if (size > MAXBSIZE) 374 panic("getblk: size too big"); 375 /* 376 * Search the cache for the block. If the buffer is found, 377 * but it is currently locked, the we must wait for it to 378 * become available. 379 */ 380 dp = BUFHASH(vp, blkno); 381 loop: 382 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) { 383 if (bp->b_lblkno != blkno || bp->b_vp != vp || 384 (bp->b_flags & B_INVAL)) 385 continue; 386 s = splbio(); 387 if (bp->b_flags & B_BUSY) { 388 bp->b_flags |= B_WANTED; 389 sleep((caddr_t)bp, PRIBIO + 1); 390 splx(s); 391 goto loop; 392 } 393 bremfree(bp); 394 bp->b_flags |= B_BUSY; 395 splx(s); 396 if (bp->b_bcount != size) { 397 printf("getblk: stray size"); 398 bp->b_flags |= B_INVAL; 399 bwrite(bp); 400 goto loop; 401 } 402 bp->b_flags |= B_CACHE; 403 return (bp); 404 } 405 bp = getnewbuf(); 406 bremhash(bp); 407 bgetvp(vp, bp); 408 bp->b_bcount = 0; 409 bp->b_lblkno = blkno; 410 bp->b_blkno = blkno; 411 bp->b_error = 0; 412 bp->b_resid = 0; 413 binshash(bp, dp); 414 allocbuf(bp, size); 415 return (bp); 416 } 417 418 /* 419 * Allocate a buffer. 420 * The caller will assign it to a block. 421 */ 422 struct buf * 423 geteblk(size) 424 int size; 425 { 426 register struct buf *bp, *flist; 427 428 if (size > MAXBSIZE) 429 panic("geteblk: size too big"); 430 bp = getnewbuf(); 431 bp->b_flags |= B_INVAL; 432 bremhash(bp); 433 flist = &bfreelist[BQ_AGE]; 434 bp->b_bcount = 0; 435 bp->b_error = 0; 436 bp->b_resid = 0; 437 binshash(bp, flist); 438 allocbuf(bp, size); 439 return (bp); 440 } 441 442 /* 443 * Expand or contract the actual memory allocated to a buffer. 444 * If no memory is available, release buffer and take error exit. 445 */ 446 allocbuf(tp, size) 447 register struct buf *tp; 448 int size; 449 { 450 register struct buf *bp, *ep; 451 int sizealloc, take, s; 452 453 sizealloc = roundup(size, CLBYTES); 454 /* 455 * Buffer size does not change 456 */ 457 if (sizealloc == tp->b_bufsize) 458 goto out; 459 /* 460 * Buffer size is shrinking. 461 * Place excess space in a buffer header taken from the 462 * BQ_EMPTY buffer list and placed on the "most free" list. 463 * If no extra buffer headers are available, leave the 464 * extra space in the present buffer. 465 */ 466 if (sizealloc < tp->b_bufsize) { 467 ep = bfreelist[BQ_EMPTY].av_forw; 468 if (ep == &bfreelist[BQ_EMPTY]) 469 goto out; 470 s = splbio(); 471 bremfree(ep); 472 ep->b_flags |= B_BUSY; 473 splx(s); 474 pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr, 475 (int)tp->b_bufsize - sizealloc); 476 ep->b_bufsize = tp->b_bufsize - sizealloc; 477 tp->b_bufsize = sizealloc; 478 ep->b_flags |= B_INVAL; 479 ep->b_bcount = 0; 480 brelse(ep); 481 goto out; 482 } 483 /* 484 * More buffer space is needed. Get it out of buffers on 485 * the "most free" list, placing the empty headers on the 486 * BQ_EMPTY buffer header list. 487 */ 488 while (tp->b_bufsize < sizealloc) { 489 take = sizealloc - tp->b_bufsize; 490 bp = getnewbuf(); 491 if (take >= bp->b_bufsize) 492 take = bp->b_bufsize; 493 pagemove(&bp->b_un.b_addr[bp->b_bufsize - take], 494 &tp->b_un.b_addr[tp->b_bufsize], take); 495 tp->b_bufsize += take; 496 bp->b_bufsize = bp->b_bufsize - take; 497 if (bp->b_bcount > bp->b_bufsize) 498 bp->b_bcount = bp->b_bufsize; 499 if (bp->b_bufsize <= 0) { 500 bremhash(bp); 501 binshash(bp, &bfreelist[BQ_EMPTY]); 502 bp->b_dev = NODEV; 503 bp->b_error = 0; 504 bp->b_flags |= B_INVAL; 505 } 506 brelse(bp); 507 } 508 out: 509 tp->b_bcount = size; 510 return (1); 511 } 512 513 /* 514 * Find a buffer which is available for use. 515 * Select something from a free list. 516 * Preference is to AGE list, then LRU list. 517 */ 518 struct buf * 519 getnewbuf() 520 { 521 register struct buf *bp, *dp; 522 register struct ucred *cred; 523 int s; 524 525 #ifdef LFS 526 lfs_flush(); 527 #endif 528 loop: 529 s = splbio(); 530 for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--) 531 if (dp->av_forw != dp) 532 break; 533 if (dp == bfreelist) { /* no free blocks */ 534 dp->b_flags |= B_WANTED; 535 sleep((caddr_t)dp, PRIBIO + 1); 536 splx(s); 537 goto loop; 538 } 539 bp = dp->av_forw; 540 bremfree(bp); 541 bp->b_flags |= B_BUSY; 542 splx(s); 543 if (bp->b_flags & B_DELWRI) { 544 (void) bawrite(bp); 545 goto loop; 546 } 547 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno); 548 if (bp->b_vp) 549 brelvp(bp); 550 if (bp->b_rcred != NOCRED) { 551 cred = bp->b_rcred; 552 bp->b_rcred = NOCRED; 553 crfree(cred); 554 } 555 if (bp->b_wcred != NOCRED) { 556 cred = bp->b_wcred; 557 bp->b_wcred = NOCRED; 558 crfree(cred); 559 } 560 bp->b_flags = B_BUSY; 561 bp->b_dirtyoff = bp->b_dirtyend = 0; 562 bp->b_validoff = bp->b_validend = 0; 563 return (bp); 564 } 565 566 /* 567 * Wait for I/O to complete. 568 * 569 * Extract and return any errors associated with the I/O. 570 * If the error flag is set, but no specific error is 571 * given, return EIO. 572 */ 573 biowait(bp) 574 register struct buf *bp; 575 { 576 int s; 577 578 s = splbio(); 579 while ((bp->b_flags & B_DONE) == 0) 580 sleep((caddr_t)bp, PRIBIO); 581 splx(s); 582 if ((bp->b_flags & B_ERROR) == 0) 583 return (0); 584 if (bp->b_error) 585 return (bp->b_error); 586 return (EIO); 587 } 588 589 /* 590 * Mark I/O complete on a buffer. 591 * 592 * If a callback has been requested, e.g. the pageout 593 * daemon, do so. Otherwise, awaken waiting processes. 594 */ 595 void 596 biodone(bp) 597 register struct buf *bp; 598 { 599 600 if (bp->b_flags & B_DONE) 601 panic("dup biodone"); 602 bp->b_flags |= B_DONE; 603 if ((bp->b_flags & B_READ) == 0) 604 vwakeup(bp); 605 if (bp->b_flags & B_CALL) { 606 bp->b_flags &= ~B_CALL; 607 (*bp->b_iodone)(bp); 608 return; 609 } 610 if (bp->b_flags & B_ASYNC) 611 brelse(bp); 612 else { 613 bp->b_flags &= ~B_WANTED; 614 wakeup((caddr_t)bp); 615 } 616 } 617