1 /*- 2 * Copyright (c) 1982, 1986, 1989 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * This module is believed to contain source code proprietary to AT&T. 6 * Use and redistribution is subject to the Berkeley Software License 7 * Agreement and your Software Agreement with AT&T (Western Electric). 8 * 9 * @(#)vfs_cluster.c 7.46 (Berkeley) 02/05/92 10 */ 11 12 #include <sys/param.h> 13 #include <sys/proc.h> 14 #include <sys/buf.h> 15 #include <sys/vnode.h> 16 #include <sys/specdev.h> 17 #include <sys/mount.h> 18 #include <sys/trace.h> 19 #include <sys/resourcevar.h> 20 21 /* 22 * Initialize buffers and hash links for buffers. 23 */ 24 void 25 bufinit() 26 { 27 register int i; 28 register struct buf *bp, *dp; 29 register struct bufhd *hp; 30 int base, residual; 31 32 for (hp = bufhash, i = 0; i < BUFHSZ; i++, hp++) 33 hp->b_forw = hp->b_back = (struct buf *)hp; 34 35 for (dp = bfreelist; dp < &bfreelist[BQUEUES]; dp++) { 36 dp->b_forw = dp->b_back = dp->av_forw = dp->av_back = dp; 37 dp->b_flags = B_HEAD; 38 } 39 base = bufpages / nbuf; 40 residual = bufpages % nbuf; 41 for (i = 0; i < nbuf; i++) { 42 bp = &buf[i]; 43 bp->b_dev = NODEV; 44 bp->b_bcount = 0; 45 bp->b_rcred = NOCRED; 46 bp->b_wcred = NOCRED; 47 bp->b_dirtyoff = 0; 48 bp->b_dirtyend = 0; 49 bp->b_validoff = 0; 50 bp->b_validend = 0; 51 bp->b_un.b_addr = buffers + i * MAXBSIZE; 52 if (i < residual) 53 bp->b_bufsize = (base + 1) * CLBYTES; 54 else 55 bp->b_bufsize = base * CLBYTES; 56 binshash(bp, &bfreelist[BQ_AGE]); 57 bp->b_flags = B_INVAL; 58 dp = bp->b_bufsize ? &bfreelist[BQ_AGE] : &bfreelist[BQ_EMPTY]; 59 binsheadfree(bp, dp); 60 } 61 } 62 63 /* 64 * Find the block in the buffer pool. 65 * If the buffer is not present, allocate a new buffer and load 66 * its contents according to the filesystem fill routine. 
 */
/*
 * Synchronous read of (vp, blkno).
 * Returns 0 on a cache hit or the error from biowait() after the
 * read completes; *bpp is set to the (busy) buffer in either case.
 */
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size);
	if (bp->b_flags & (B_DONE | B_DELWRI)) {
		/* Cache hit: contents already valid, no I/O needed. */
		trace(TR_BREADHIT, pack(vp, size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	/* Record the credentials under which the read is issued. */
	if (bp->b_rcred == NOCRED && cred != NOCRED) {
		crhold(cred);
		bp->b_rcred = cred;
	}
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp, size), blkno);
	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
	return (biowait(bp));
}

/*
 * Operates like bread, but also starts asynchronous I/O on the
 * num read-ahead blocks listed in rablkno[]/rabsize[].
 * Returns 0 or the error from the read of the primary block.
 */
breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno[]; int rabsize[];
	int num;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp, *rabp;
	register int i;

	bp = NULL;
	/*
	 * If the block is not memory resident,
	 * allocate a buffer and start I/O.
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size);
		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breadn");
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp, size), blkno);
			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
		} else
			trace(TR_BREADHIT, pack(vp, size), blkno);
	}

	/*
	 * If there's read-ahead block(s), start I/O
	 * on them also (as above).  Read-ahead buffers are released
	 * immediately when already valid, or marked B_ASYNC so that
	 * biodone() releases them when the I/O finishes.
	 */
	for (i = 0; i < num; i++) {
		if (incore(vp, rablkno[i]))
			continue;
		rabp = getblk(vp, rablkno[i], rabsize[i]);
		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
		} else {
			rabp->b_flags |= B_ASYNC | B_READ;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If block was memory resident, let bread get it.
	 * If block was not memory resident, the read was
	 * started above, so just wait for the read to complete.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, cred, bpp));
	return (biowait(bp));
}

/*
 * Synchronous write (unless B_ASYNC is already set on the buffer).
 * Release buffer on completion.  Returns 0 for asynchronous writes,
 * otherwise the error from biowait().
 */
bwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */
	register int flag;
	int s, error = 0;

	/* Snapshot the flags before clearing the I/O-state bits. */
	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if (flag & B_ASYNC) {
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
	}
	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	s = splbio();
	bp->b_vp->v_numoutput++;
	splx(s);
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await I/O completion.
	 * If the write was "delayed", then we put the buffer on
	 * the queue of blocks awaiting I/O completion status.
	 */
	if ((flag & B_ASYNC) == 0) {
		error = biowait(bp);
		if ((flag&B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
		brelse(bp);
	} else if (flag & B_DELWRI) {
		s = splbio();
		bp->b_flags |= B_AGE;
		splx(s);
	}
	return (error);
}

/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
	}
	/*
	 * If this is a tape drive, the write must be initiated.
	 * (ioctl returning 0 here identifies a tape device —
	 * NOTE(review): depends on the B_TAPE ioctl convention;
	 * confirm against the driver side.)
	 */
	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
		bawrite(bp);
	} else {
		bp->b_flags |= (B_DONE | B_DELWRI);
		brelse(bp);
	}
}

/*
 * Asynchronous write.
 * Start I/O on a buffer, but do not wait for it to complete.
 * The buffer is released when the I/O completes.
 */
bawrite(bp)
	register struct buf *bp;
{

	/*
	 * Setting the ASYNC flag causes bwrite to return
	 * after starting the I/O.
	 */
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release a buffer back onto the appropriate free list.
 * Even if the buffer is dirty, no I/O is started.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	int s;

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	/*
	 * If a process is waiting for the buffer, or
	 * is waiting for a free buffer, awaken it.
	 */
	if (bp->b_flags & B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags & B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	/*
	 * Retry I/O for locked buffers rather than invalidating them.
	 */
	s = splbio();
	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
		bp->b_flags &= ~B_ERROR;
	/*
	 * Disassociate buffers that are no longer valid.
	 */
	if (bp->b_flags & (B_NOCACHE | B_ERROR))
		bp->b_flags |= B_INVAL;
	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
		if (bp->b_vp)
			brelvp(bp);
		bp->b_flags &= ~B_DELWRI;
	}
	/*
	 * Stick the buffer back on a free list.
	 */
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bfreelist[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
	splx(s);
}

/*
 * Check to see if a block is currently memory resident.
 * Returns 1 if a valid (non-B_INVAL) buffer for (vp, blkno) is
 * on its hash chain, 0 otherwise.
 */
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(vp, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

/*
 * Check to see if a block is currently memory resident.
 * If it is resident, return it.  If it is not resident,
 * allocate a new buffer and assign it to the block.
 */
/*
 * Returns the buffer busy (B_BUSY set); the caller must release it
 * with brelse(), bwrite(), or similar.
 */
struct buf *
getblk(vp, blkno, size)
	register struct vnode *vp;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * Search the cache for the block.  If the buffer is found,
	 * but it is currently locked, then we must wait for it to
	 * become available.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp ||
		    (bp->b_flags & B_INVAL))
			continue;
		s = splbio();
		if (bp->b_flags & B_BUSY) {
			/* Someone else holds it; sleep and rescan. */
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO + 1);
			splx(s);
			goto loop;
		}
		bremfree(bp);
		bp->b_flags |= B_BUSY;
		splx(s);
		if (bp->b_bcount != size) {
			/* Cached buffer has the wrong size: write it
			 * out invalidated and retry the lookup. */
			printf("getblk: stray size");
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	/* Not in the cache: take a fresh buffer and assign it. */
	bp = getnewbuf();
	bremhash(bp);
	bgetvp(vp, bp);
	bp->b_bcount = 0;
	bp->b_lblkno = blkno;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Allocate an anonymous buffer (not associated with any block).
 * The caller will assign it to a block.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	bp->b_bcount = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, flist);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Expand or contract the actual memory allocated to a buffer.
 * If no memory is available, release buffer and take error exit.
 */
/*
 * Resize tp's memory to hold "size" bytes (rounded up to whole
 * clusters).  Always sets b_bcount = size and returns 1.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take, s;

	sizealloc = roundup(size, CLBYTES);
	/*
	 * Buffer size does not change
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		ep = bfreelist[BQ_EMPTY].av_forw;
		if (ep == &bfreelist[BQ_EMPTY])
			goto out;
		s = splbio();
		bremfree(ep);
		ep->b_flags |= B_BUSY;
		splx(s);
		/* Shift the surplus memory into the spare header. */
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
		    (int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed.  Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		bp = getnewbuf();
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
		    &tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		if (bp->b_bufsize <= 0) {
			/* Donor is fully drained: park its header
			 * on the EMPTY list. */
			bremhash(bp);
			binshash(bp, &bfreelist[BQ_EMPTY]);
			bp->b_dev = NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
	return (1);
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
/*
 * Returns a buffer marked B_BUSY with its credentials released and
 * all other flags cleared; may sleep until one becomes free, and
 * pushes out (and skips) any delayed-write buffer it encounters.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	register struct ucred *cred;
	int s;

#ifdef LFS
	lfs_flush();
#endif
loop:
	s = splbio();
	/* Scan AGE first, then LRU (dp walks downward to bfreelist). */
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO + 1);
		splx(s);
		goto loop;
	}
	bp = dp->av_forw;
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	splx(s);
	if (bp->b_flags & B_DELWRI) {
		/* Dirty: start its write (biodone releases it)
		 * and look for another buffer. */
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	/* Drop any read/write credentials held by the old identity. */
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	bp->b_flags = B_BUSY;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Wait for I/O to complete.
 *
 * Extract and return any errors associated with the I/O.
 * If the error flag is set, but no specific error is
 * given, return EIO.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so. Otherwise, awaken waiting processes.
581 */ 582 void 583 biodone(bp) 584 register struct buf *bp; 585 { 586 587 if (bp->b_flags & B_DONE) 588 panic("dup biodone"); 589 bp->b_flags |= B_DONE; 590 if ((bp->b_flags & B_READ) == 0) 591 vwakeup(bp); 592 if (bp->b_flags & B_CALL) { 593 bp->b_flags &= ~B_CALL; 594 (*bp->b_iodone)(bp); 595 return; 596 } 597 if (bp->b_flags & B_ASYNC) 598 brelse(bp); 599 else { 600 bp->b_flags &= ~B_WANTED; 601 wakeup((caddr_t)bp); 602 } 603 } 604