/*-
 * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This module is believed to contain source code proprietary to AT&T.
 * Use and redistribution is subject to the Berkeley Software License
 * Agreement and your Software Agreement with AT&T (Western Electric).
 *
 *	@(#)vfs_cluster.c	7.48 (Berkeley) 05/15/92
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/specdev.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/resourcevar.h>

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit()
{
	register int i;
	register struct buf *bp, *dp;
	register struct bufhd *hp;
	int base, residual;

	for (hp = bufhash, i = 0; i < BUFHSZ; i++, hp++)
		hp->b_forw = hp->b_back = (struct buf *)hp;

	for (dp = bfreelist; dp < &bfreelist[BQUEUES]; dp++) {
		dp->b_forw = dp->b_back = dp->av_forw = dp->av_back = dp;
		dp->b_flags = B_HEAD;
	}
	base = bufpages / nbuf;
	residual = bufpages % nbuf;
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bp->b_dev = NODEV;
		bp->b_bcount = 0;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = 0;
		bp->b_validoff = 0;
		bp->b_validend = 0;
		bp->b_un.b_addr = buffers + i * MAXBSIZE;
		if (i < residual)
			bp->b_bufsize = (base + 1) * CLBYTES;
		else
			bp->b_bufsize = base * CLBYTES;
		binshash(bp, &bfreelist[BQ_AGE]);
		bp->b_flags = B_INVAL;
		dp = bp->b_bufsize ? &bfreelist[BQ_AGE] : &bfreelist[BQ_EMPTY];
		binsheadfree(bp, dp);
	}
}

/*
 * Find the block in the buffer pool.
 * If the buffer is not present, allocate a new buffer and load
 * its contents according to the filesystem fill routine.
 */
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	USES_VOP_STRATEGY;
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size);
	if (bp->b_flags & (B_DONE | B_DELWRI)) {
		trace(TR_BREADHIT, pack(vp, size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	if (bp->b_rcred == NOCRED && cred != NOCRED) {
		crhold(cred);
		bp->b_rcred = cred;
	}
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp, size), blkno);
	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
	return (biowait(bp));
}
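
/*
 * Illustrative sketch (not part of this file's interface): a typical
 * filesystem read path calls bread() and releases the buffer with
 * brelse() once it is done with the data. The name example_read_block
 * is hypothetical.
 */
#ifdef notdef
static int
example_read_block(vp, lbn, size, cred)
	struct vnode *vp;
	daddr_t lbn;
	int size;
	struct ucred *cred;
{
	struct buf *bp;
	int error;

	/* getblk, VOP_STRATEGY and biowait all happen inside bread */
	if (error = bread(vp, lbn, size, cred, &bp)) {
		brelse(bp);		/* buffer is still held on error */
		return (error);
	}
	/* ... consume bp->b_un.b_addr for bp->b_bcount bytes ... */
	brelse(bp);			/* return the buffer to the cache */
	return (0);
}
#endif /* notdef */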

/*
 * Operates like bread, but also starts I/O on the N specified
 * read-ahead blocks.
 */
breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno[]; int rabsize[];
	int num;
	struct ucred *cred;
	struct buf **bpp;
{
	USES_VOP_STRATEGY;
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp, *rabp;
	register int i;

	bp = NULL;
	/*
	 * If the block is not memory resident,
	 * allocate a buffer and start I/O.
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size);
		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breadn");
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp, size), blkno);
			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
		} else
			trace(TR_BREADHIT, pack(vp, size), blkno);
	}

	/*
	 * If there are read-ahead blocks, start I/O
	 * on them also (as above).
	 */
	for (i = 0; i < num; i++) {
		if (incore(vp, rablkno[i]))
			continue;
		rabp = getblk(vp, rablkno[i], rabsize[i]);
		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
		} else {
			rabp->b_flags |= B_ASYNC | B_READ;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If the block was memory resident, let bread get it.
	 * If it was not memory resident, the read was started
	 * above, so just wait for the read to complete.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, cred, bpp));
	return (biowait(bp));
}

/*
 * Synchronous write.
 * Release buffer on completion.
 */
bwrite(bp)
	register struct buf *bp;
{
	USES_VOP_STRATEGY;
	struct proc *p = curproc;		/* XXX */
	register int flag;
	int s, error = 0;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if (flag & B_ASYNC) {
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
	}
	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	s = splbio();
	bp->b_vp->v_numoutput++;
	splx(s);
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await I/O completion.
	 * If the write was "delayed", then we put the buffer on
	 * the queue of blocks awaiting I/O completion status.
	 */
	if ((flag & B_ASYNC) == 0) {
		error = biowait(bp);
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
		brelse(bp);
	} else if (flag & B_DELWRI) {
		s = splbio();
		bp->b_flags |= B_AGE;
		splx(s);
	}
	return (error);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}
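
/*
 * Illustrative sketch (not part of this file's interface): a caller
 * doing sequential reads can use breadn() to fetch the current block
 * synchronously while scheduling the next block asynchronously; the
 * follow-up read then hits in the cache. The name example_seq_read
 * and the lbn+1 read-ahead policy are hypothetical.
 */
#ifdef notdef
static int
example_seq_read(vp, lbn, size, cred)
	struct vnode *vp;
	daddr_t lbn;
	int size;
	struct ucred *cred;
{
	struct buf *bp;
	daddr_t ralbn;
	int rasize, error;

	ralbn = lbn + 1;		/* assume next logical block */
	rasize = size;
	/* one read-ahead block; breadn waits only for the first block */
	error = breadn(vp, lbn, size, &ralbn, &rasize, 1, cred, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	/* ... use the data ... */
	brelse(bp);
	return (0);
}
#endif /* notdef */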

/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	USES_VOP_IOCTL;
	struct proc *p = curproc;		/* XXX */

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
	}
	/*
	 * If this is a tape drive, the write must be initiated.
	 */
	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
		bawrite(bp);
	} else {
		bp->b_flags |= (B_DONE | B_DELWRI);
		brelse(bp);
	}
}

/*
 * Asynchronous write.
 * Start I/O on a buffer, but do not wait for it to complete.
 * The buffer is released when the I/O completes.
 */
bawrite(bp)
	register struct buf *bp;
{

	/*
	 * Setting the ASYNC flag causes bwrite to return
	 * after starting the I/O.
	 */
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release a buffer.
 * Even if the buffer is dirty, no I/O is started.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	int s;

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	/*
	 * If a process is waiting for the buffer, or
	 * is waiting for a free buffer, awaken it.
	 */
	if (bp->b_flags & B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags & B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	/*
	 * Retry I/O for locked buffers rather than invalidating them.
	 */
	s = splbio();
	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
		bp->b_flags &= ~B_ERROR;
	/*
	 * Disassociate buffers that are no longer valid.
	 */
	if (bp->b_flags & (B_NOCACHE | B_ERROR))
		bp->b_flags |= B_INVAL;
	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
		if (bp->b_vp)
			brelvp(bp);
		bp->b_flags &= ~B_DELWRI;
	}
	/*
	 * Stick the buffer back on a free list.
	 */
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bfreelist[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
	splx(s);
}

/*
 * Check to see if a block is currently memory resident.
 */
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(vp, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}
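
/*
 * Illustrative sketch (not part of this file's interface): the three
 * write interfaces above trade durability against latency. bwrite()
 * waits for the I/O and reports its error, bawrite() starts the I/O
 * and returns immediately, and bdwrite() merely marks the buffer
 * dirty for a later write. The function name and the "how" argument
 * are hypothetical.
 */
#ifdef notdef
static int
example_write_block(bp, how)
	struct buf *bp;
	int how;		/* 0 = sync, 1 = async, 2 = delayed */
{

	switch (how) {
	case 0:
		return (bwrite(bp));	/* wait; returns I/O error */
	case 1:
		bawrite(bp);		/* started; released on completion */
		return (0);
	default:
		bdwrite(bp);		/* dirty; written when reclaimed */
		return (0);
	}
}
#endif /* notdef */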

/*
 * Check to see if a block is currently memory resident.
 * If it is resident, return it. If it is not resident,
 * allocate a new buffer and assign it to the block.
 */
struct buf *
getblk(vp, blkno, size)
	register struct vnode *vp;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * Search the cache for the block. If the buffer is found,
	 * but it is currently locked, then we must wait for it to
	 * become available.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp ||
		    (bp->b_flags & B_INVAL))
			continue;
		s = splbio();
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO + 1);
			splx(s);
			goto loop;
		}
		bremfree(bp);
		bp->b_flags |= B_BUSY;
		splx(s);
		if (bp->b_bcount != size) {
			printf("getblk: stray size");
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	bp = getnewbuf();
	bremhash(bp);
	bgetvp(vp, bp);
	bp->b_bcount = 0;
	bp->b_lblkno = blkno;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Allocate a buffer.
 * The caller will assign it to a block.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	bp->b_bcount = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, flist);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Expand or contract the actual memory allocated to a buffer.
 * If no memory is available, release buffer and take error exit.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take, s;

	sizealloc = roundup(size, CLBYTES);
	/*
	 * Buffer size does not change
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		ep = bfreelist[BQ_EMPTY].av_forw;
		if (ep == &bfreelist[BQ_EMPTY])
			goto out;
		s = splbio();
		bremfree(ep);
		ep->b_flags |= B_BUSY;
		splx(s);
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
		    (int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed. Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		bp = getnewbuf();
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
		    &tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		if (bp->b_bufsize <= 0) {
			bremhash(bp);
			binshash(bp, &bfreelist[BQ_EMPTY]);
			bp->b_dev = NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
	return (1);
}
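
/*
 * Illustrative sketch (not part of this file's interface): to
 * overwrite a full block there is no need to read its old contents;
 * getblk() binds a buffer to the block and the caller fills and
 * writes it. The name example_overwrite_block is hypothetical.
 */
#ifdef notdef
static void
example_overwrite_block(vp, lbn, data, size)
	struct vnode *vp;
	daddr_t lbn;
	caddr_t data;
	int size;
{
	struct buf *bp;

	bp = getblk(vp, lbn, size);	/* cached or newly allocated */
	bcopy(data, bp->b_un.b_addr, (unsigned)size);
	bdwrite(bp);			/* expect more writes soon */
}
#endif /* notdef */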

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	register struct ucred *cred;
	int s;

#ifdef LFS
	lfs_flush();
#endif
loop:
	s = splbio();
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO + 1);
		splx(s);
		goto loop;
	}
	bp = dp->av_forw;
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	splx(s);
	if (bp->b_flags & B_DELWRI) {
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	bp->b_flags = B_BUSY;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Wait for I/O to complete.
 *
 * Extract and return any errors associated with the I/O.
 * If the error flag is set, but no specific error is
 * given, return EIO.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so. Otherwise, awaken waiting processes.
 */
void
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}
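
/*
 * Illustrative sketch (not part of this file's interface): a caller
 * that cannot sleep (e.g. the pageout daemon) sets B_CALL and
 * b_iodone so that biodone() invokes its callback at interrupt level
 * instead of waking a sleeping process. Since the B_CALL path in
 * biodone() returns before the brelse(), the callback must release
 * the buffer itself. The names example_start_write and
 * example_iodone are hypothetical.
 */
#ifdef notdef
static void
example_iodone(bp)
	struct buf *bp;
{

	/* runs from biodone(); must not sleep */
	if (bp->b_flags & B_ERROR)
		printf("example_iodone: error %d\n", bp->b_error);
	brelse(bp);		/* B_CALL path skips the brelse in biodone */
}

static void
example_start_write(bp)
	struct buf *bp;
{

	bp->b_flags |= B_ASYNC | B_CALL;
	bp->b_iodone = example_iodone;
	(void) bwrite(bp);	/* returns once the I/O is started */
}
#endif /* notdef */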