/*-
 * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This module is believed to contain source code proprietary to AT&T.
 * Use and redistribution is subject to the Berkeley Software License
 * Agreement and your Software Agreement with AT&T (Western Electric).
 *
 *	@(#)vfs_cluster.c	7.42 (Berkeley) 05/10/91
 */

#include "param.h"
#include "proc.h"
#include "buf.h"
#include "vnode.h"
#include "specdev.h"
#include "mount.h"
#include "trace.h"
#include "resourcevar.h"

/*
 * Initialize buffers and hash links for buffers.
 *
 * Called once at boot.  Empties every hash chain and free-list header,
 * then hands each of the nbuf buffer headers an even share of the
 * bufpages pages of buffer memory and files it on the AGE free list
 * via brelse().
 */
bufinit()
{
	register int i;
	register struct buf *bp, *dp;
	register struct bufhd *hp;
	int base, residual;

	/* Make each hash chain header an empty, self-linked list. */
	for (hp = bufhash, i = 0; i < BUFHSZ; i++, hp++)
		hp->b_forw = hp->b_back = (struct buf *)hp;

	/* Likewise make each free-queue header an empty circular queue. */
	for (dp = bfreelist; dp < &bfreelist[BQUEUES]; dp++) {
		dp->b_forw = dp->b_back = dp->av_forw = dp->av_back = dp;
		dp->b_flags = B_HEAD;
	}
	/*
	 * Split bufpages pages among the nbuf headers as evenly as
	 * possible; the first "residual" buffers each get one extra page.
	 */
	base = bufpages / nbuf;
	residual = bufpages % nbuf;
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bp->b_dev = NODEV;
		bp->b_bcount = 0;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = 0;
		/* Each header owns a fixed MAXBSIZE-sized virtual window. */
		bp->b_un.b_addr = buffers + i * MAXBSIZE;
		if (i < residual)
			bp->b_bufsize = (base + 1) * CLBYTES;
		else
			bp->b_bufsize = base * CLBYTES;
		binshash(bp, &bfreelist[BQ_AGE]);
		/* BUSY|INVAL so brelse() files it at the head of BQ_AGE. */
		bp->b_flags = B_BUSY|B_INVAL;
		brelse(bp);
	}
}

/*
 * Find the block in the buffer pool.
 * If the buffer is not present, allocate a new buffer and load
 * its contents according to the filesystem fill routine.
63 */ 64 bread(vp, blkno, size, cred, bpp) 65 struct vnode *vp; 66 daddr_t blkno; 67 int size; 68 struct ucred *cred; 69 struct buf **bpp; 70 { 71 struct proc *p = curproc; /* XXX */ 72 register struct buf *bp; 73 74 if (size == 0) 75 panic("bread: size 0"); 76 *bpp = bp = getblk(vp, blkno, size); 77 if (bp->b_flags & (B_DONE | B_DELWRI)) { 78 trace(TR_BREADHIT, pack(vp, size), blkno); 79 return (0); 80 } 81 bp->b_flags |= B_READ; 82 if (bp->b_bcount > bp->b_bufsize) 83 panic("bread"); 84 if (bp->b_rcred == NOCRED && cred != NOCRED) { 85 crhold(cred); 86 bp->b_rcred = cred; 87 } 88 VOP_STRATEGY(bp); 89 trace(TR_BREADMISS, pack(vp, size), blkno); 90 p->p_stats->p_ru.ru_inblock++; /* pay for read */ 91 return (biowait(bp)); 92 } 93 94 /* 95 * Operates like bread, but also starts I/O on the specified 96 * read-ahead block. 97 */ 98 breada(vp, blkno, size, rablkno, rabsize, cred, bpp) 99 struct vnode *vp; 100 daddr_t blkno; int size; 101 daddr_t rablkno; int rabsize; 102 struct ucred *cred; 103 struct buf **bpp; 104 { 105 struct proc *p = curproc; /* XXX */ 106 register struct buf *bp, *rabp; 107 108 bp = NULL; 109 /* 110 * If the block is not memory resident, 111 * allocate a buffer and start I/O. 112 */ 113 if (!incore(vp, blkno)) { 114 *bpp = bp = getblk(vp, blkno, size); 115 if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) { 116 bp->b_flags |= B_READ; 117 if (bp->b_bcount > bp->b_bufsize) 118 panic("breada"); 119 if (bp->b_rcred == NOCRED && cred != NOCRED) { 120 crhold(cred); 121 bp->b_rcred = cred; 122 } 123 VOP_STRATEGY(bp); 124 trace(TR_BREADMISS, pack(vp, size), blkno); 125 p->p_stats->p_ru.ru_inblock++; /* pay for read */ 126 } else 127 trace(TR_BREADHIT, pack(vp, size), blkno); 128 } 129 130 /* 131 * If there is a read-ahead block, start I/O on it too. 
132 */ 133 if (!incore(vp, rablkno)) { 134 rabp = getblk(vp, rablkno, rabsize); 135 if (rabp->b_flags & (B_DONE | B_DELWRI)) { 136 brelse(rabp); 137 trace(TR_BREADHITRA, pack(vp, rabsize), rablkno); 138 } else { 139 rabp->b_flags |= B_ASYNC | B_READ; 140 if (rabp->b_bcount > rabp->b_bufsize) 141 panic("breadrabp"); 142 if (rabp->b_rcred == NOCRED && cred != NOCRED) { 143 crhold(cred); 144 rabp->b_rcred = cred; 145 } 146 VOP_STRATEGY(rabp); 147 trace(TR_BREADMISSRA, pack(vp, rabsize), rablkno); 148 p->p_stats->p_ru.ru_inblock++; /* pay in advance */ 149 } 150 } 151 152 /* 153 * If block was memory resident, let bread get it. 154 * If block was not memory resident, the read was 155 * started above, so just wait for the read to complete. 156 */ 157 if (bp == NULL) 158 return (bread(vp, blkno, size, cred, bpp)); 159 return (biowait(bp)); 160 } 161 162 /* 163 * Synchronous write. 164 * Release buffer on completion. 165 */ 166 bwrite(bp) 167 register struct buf *bp; 168 { 169 struct proc *p = curproc; /* XXX */ 170 register int flag; 171 int s, error; 172 173 flag = bp->b_flags; 174 bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); 175 if (flag & B_ASYNC) { 176 if ((flag & B_DELWRI) == 0) 177 p->p_stats->p_ru.ru_oublock++; /* no one paid yet */ 178 else 179 reassignbuf(bp, bp->b_vp); 180 } 181 trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno); 182 if (bp->b_bcount > bp->b_bufsize) 183 panic("bwrite"); 184 s = splbio(); 185 bp->b_vp->v_numoutput++; 186 splx(s); 187 VOP_STRATEGY(bp); 188 189 /* 190 * If the write was synchronous, then await I/O completion. 191 * If the write was "delayed", then we put the buffer on 192 * the queue of blocks awaiting I/O completion status. 
193 */ 194 if ((flag & B_ASYNC) == 0) { 195 error = biowait(bp); 196 if ((flag&B_DELWRI) == 0) 197 p->p_stats->p_ru.ru_oublock++; /* no one paid yet */ 198 else 199 reassignbuf(bp, bp->b_vp); 200 brelse(bp); 201 } else if (flag & B_DELWRI) { 202 bp->b_flags |= B_AGE; 203 error = 0; 204 } 205 return (error); 206 } 207 208 /* 209 * Delayed write. 210 * 211 * The buffer is marked dirty, but is not queued for I/O. 212 * This routine should be used when the buffer is expected 213 * to be modified again soon, typically a small write that 214 * partially fills a buffer. 215 * 216 * NB: magnetic tapes cannot be delayed; they must be 217 * written in the order that the writes are requested. 218 */ 219 bdwrite(bp) 220 register struct buf *bp; 221 { 222 struct proc *p = curproc; /* XXX */ 223 224 if ((bp->b_flags & B_DELWRI) == 0) { 225 bp->b_flags |= B_DELWRI; 226 reassignbuf(bp, bp->b_vp); 227 p->p_stats->p_ru.ru_oublock++; /* no one paid yet */ 228 } 229 /* 230 * If this is a tape drive, the write must be initiated. 231 */ 232 if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) { 233 bawrite(bp); 234 } else { 235 bp->b_flags |= (B_DONE | B_DELWRI); 236 brelse(bp); 237 } 238 } 239 240 /* 241 * Asynchronous write. 242 * Start I/O on a buffer, but do not wait for it to complete. 243 * The buffer is released when the I/O completes. 244 */ 245 bawrite(bp) 246 register struct buf *bp; 247 { 248 249 /* 250 * Setting the ASYNC flag causes bwrite to return 251 * after starting the I/O. 252 */ 253 bp->b_flags |= B_ASYNC; 254 (void) bwrite(bp); 255 } 256 257 /* 258 * Release a buffer. 259 * Even if the buffer is dirty, no I/O is started. 260 */ 261 brelse(bp) 262 register struct buf *bp; 263 { 264 register struct buf *flist; 265 int s; 266 267 trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno); 268 /* 269 * If a process is waiting for the buffer, or 270 * is waiting for a free buffer, awaken it. 
271 */ 272 if (bp->b_flags & B_WANTED) 273 wakeup((caddr_t)bp); 274 if (bfreelist[0].b_flags & B_WANTED) { 275 bfreelist[0].b_flags &= ~B_WANTED; 276 wakeup((caddr_t)bfreelist); 277 } 278 /* 279 * Retry I/O for locked buffers rather than invalidating them. 280 */ 281 if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED)) 282 bp->b_flags &= ~B_ERROR; 283 /* 284 * Disassociate buffers that are no longer valid. 285 */ 286 if (bp->b_flags & (B_NOCACHE | B_ERROR)) 287 bp->b_flags |= B_INVAL; 288 if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) { 289 if (bp->b_vp) 290 brelvp(bp); 291 bp->b_flags &= ~B_DELWRI; 292 } 293 /* 294 * Stick the buffer back on a free list. 295 */ 296 s = splbio(); 297 if (bp->b_bufsize <= 0) { 298 /* block has no buffer ... put at front of unused buffer list */ 299 flist = &bfreelist[BQ_EMPTY]; 300 binsheadfree(bp, flist); 301 } else if (bp->b_flags & (B_ERROR | B_INVAL)) { 302 /* block has no info ... put at front of most free list */ 303 flist = &bfreelist[BQ_AGE]; 304 binsheadfree(bp, flist); 305 } else { 306 if (bp->b_flags & B_LOCKED) 307 flist = &bfreelist[BQ_LOCKED]; 308 else if (bp->b_flags & B_AGE) 309 flist = &bfreelist[BQ_AGE]; 310 else 311 flist = &bfreelist[BQ_LRU]; 312 binstailfree(bp, flist); 313 } 314 bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE); 315 splx(s); 316 } 317 318 /* 319 * Check to see if a block is currently memory resident. 320 */ 321 incore(vp, blkno) 322 struct vnode *vp; 323 daddr_t blkno; 324 { 325 register struct buf *bp; 326 register struct buf *dp; 327 328 dp = BUFHASH(vp, blkno); 329 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) 330 if (bp->b_lblkno == blkno && bp->b_vp == vp && 331 (bp->b_flags & B_INVAL) == 0) 332 return (1); 333 return (0); 334 } 335 336 /* 337 * Check to see if a block is currently memory resident. 338 * If it is resident, return it. If it is not resident, 339 * allocate a new buffer and assign it to the block. 
 */
struct buf *
getblk(vp, blkno, size)
	register struct vnode *vp;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * Search the cache for the block. If the buffer is found,
	 * but it is currently locked, then we must wait for it to
	 * become available.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp ||
		    (bp->b_flags & B_INVAL))
			continue;
		s = splbio();
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO + 1);
			splx(s);
			/* The buffer may have changed identity while we
			 * slept, so restart the hash-chain search. */
			goto loop;
		}
		bremfree(bp);
		bp->b_flags |= B_BUSY;
		splx(s);
		if (bp->b_bcount != size) {
			/* Cached copy has the wrong size: push it out
			 * invalidated and look the block up again. */
			printf("getblk: stray size");
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	/*
	 * Not in the cache: take a free buffer and re-key it to
	 * (vp, blkno), then size its storage with allocbuf().
	 */
	bp = getnewbuf();
	bremhash(bp);
	bgetvp(vp, bp);
	bp->b_bcount = 0;
	bp->b_lblkno = blkno;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Allocate a buffer.
 * The caller will assign it to a block.
 *
 * The buffer is marked B_INVAL and hung on the BQ_AGE hash header,
 * so it is never found by incore()/getblk() lookups.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	bp->b_bcount = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, flist);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Expand or contract the actual memory allocated to a buffer.
 * If no memory is available, release buffer and take error exit.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take, s;

	/* Storage is managed in whole CLBYTES pages. */
	sizealloc = roundup(size, CLBYTES);
	/*
	 * Buffer size does not change
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		ep = bfreelist[BQ_EMPTY].av_forw;
		if (ep == &bfreelist[BQ_EMPTY])
			goto out;
		s = splbio();
		bremfree(ep);
		ep->b_flags |= B_BUSY;
		splx(s);
		/* Move the surplus pages from tp's window into ep's. */
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
		    (int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed. Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		bp = getnewbuf();
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
		/* Steal "take" bytes from the tail of bp's storage and
		 * append them to tp's. */
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
		    &tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		if (bp->b_bufsize <= 0) {
			/* Fully drained: park the header on BQ_EMPTY. */
			bremhash(bp);
			binshash(bp, &bfreelist[BQ_EMPTY]);
			bp->b_dev = NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
	return (1);
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	register struct ucred *cred;
	int s;

loop:
	s = splbio();
	/* Scan BQ_AGE down toward bfreelist[0] for a non-empty queue. */
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO + 1);
		splx(s);
		goto loop;
	}
	bp = dp->av_forw;
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	splx(s);
	if (bp->b_flags & B_DELWRI) {
		/* Dirty: start its write (releases bp on completion)
		 * and go find another candidate. */
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	/* Drop any credentials still referenced by the old identity. */
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	/* Reset to a clean, busy header for the caller to re-key. */
	bp->b_flags = B_BUSY;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	return (bp);
}

/*
 * Wait for I/O to complete.
 *
 * Extract and return any errors associated with the I/O.
 * If the error flag is set, but no specific error is
 * given, return EIO.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	/* biodone() sets B_DONE and wakes us. */
	while ((bp->b_flags & B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so. Otherwise, awaken waiting processes.
 */
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	/* A completed write decrements the vnode's output count. */
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
	/* B_CALL takes precedence: hand the buffer to the callback
	 * instead of releasing or waking anyone. */
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}