/*-
 * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This module is believed to contain source code proprietary to AT&T.
 * Use and redistribution is subject to the Berkeley Software License
 * Agreement and your Software Agreement with AT&T (Western Electric).
 *
 *	@(#)vfs_bio.c	7.59.1.1 (Berkeley) 05/10/93
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/resourcevar.h>
#include <sys/malloc.h>
#include <libkern/libkern.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>

/*
 * Definitions for the buffer hash lists.
 */
#define	BUFHASH(dvp, lbn)	\
	(&bufhashtbl[((int)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
struct	list_entry *bufhashtbl, invalhash;
u_long	bufhash;

/*
 * Insq/Remq for the buffer hash lists.
 */
#define	binshash(bp, dp)	list_enter_head(dp, bp, struct buf *, b_hash)
#define	bremhash(bp)		list_remove(bp, struct buf *, b_hash)

/*
 * Definitions for the buffer free lists.
 */
#define	BQUEUES		4		/* number of free buffer queues */

#define	BQ_LOCKED	0		/* super-blocks &c */
#define	BQ_LRU		1		/* lru, useful buffers */
#define	BQ_AGE		2		/* rubbish */
#define	BQ_EMPTY	3		/* buffer headers with no memory */

struct queue_entry bufqueues[BQUEUES];
int needbuffer;

/*
 * Insq/Remq for the buffer free lists.
 */
#define	binsheadfree(bp, dp) \
	queue_enter_head(dp, bp, struct buf *, b_freelist)
#define	binstailfree(bp, dp) \
	queue_enter_tail(dp, bp, struct buf *, b_freelist)

/*
 * Local declarations
 */
struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
	    daddr_t, long, int));
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
	    daddr_t, daddr_t, long, int, long));
void	    cluster_wbuild __P((struct vnode *, struct buf *, long size,
	    daddr_t start_lbn, int len, daddr_t lbn));

void
bremfree(bp)
	struct buf *bp;
{
	struct queue_entry *dp;

	/*
	 * We only calculate the head of the freelist when removing
	 * the last element of the list as that is the only time that
	 * it is needed (e.g. to reset the tail pointer).
	 */
	if (bp->b_freelist.qe_next == NULL) {
		for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
			if (dp->qe_prev == &bp->b_freelist.qe_next)
				break;
		if (dp == &bufqueues[BQUEUES])
			panic("bremfree: lost tail");
	}
	queue_remove(dp, bp, struct buf *, b_freelist);
}

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit()
{
	register struct buf *bp;
	struct queue_entry *dp;
	register int i;
	int base, residual;

	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
		queue_init(dp);
	bufhashtbl = (struct list_entry *)hashinit(nbuf, M_CACHE, &bufhash);
	base = bufpages / nbuf;
	residual = bufpages % nbuf;
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero((char *)bp, sizeof *bp);
		bp->b_dev = NODEV;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_un.b_addr = buffers + i * MAXBSIZE;
		if (i < residual)
			bp->b_bufsize = (base + 1) * CLBYTES;
		else
			bp->b_bufsize = base * CLBYTES;
		bp->b_flags = B_INVAL;
		dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY];
		binsheadfree(bp, dp);
		binshash(bp, &invalhash);
	}
}

/*
 * Find the block in the buffer pool.
 * If the buffer is not present, allocate a new buffer and load
 * its contents according to the filesystem fill routine.
 */
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size, 0, 0);
	if (bp->b_flags & (B_DONE | B_DELWRI)) {
		trace(TR_BREADHIT, pack(vp, size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	if (bp->b_rcred == NOCRED && cred != NOCRED) {
		crhold(cred);
		bp->b_rcred = cred;
	}
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp, size), blkno);
	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
	return (biowait(bp));
}

/*
 * Operates like bread, but also starts I/O on the N specified
 * read-ahead blocks.
 */
breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno[]; int rabsize[];
	int num;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp, *rabp;
	register int i;

	bp = NULL;
	/*
	 * If the block is not memory resident,
	 * allocate a buffer and start I/O.
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size, 0, 0);
		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breadn");
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp, size), blkno);
			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
		} else {
			trace(TR_BREADHIT, pack(vp, size), blkno);
		}
	}

	/*
	 * If there's read-ahead block(s), start I/O
	 * on them also (as above).
	 */
	for (i = 0; i < num; i++) {
		if (incore(vp, rablkno[i]))
			continue;
		rabp = getblk(vp, rablkno[i], rabsize[i], 0, 0);
		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
		} else {
			rabp->b_flags |= B_ASYNC | B_READ;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If block was memory resident, let bread get it.
	 * If block was not memory resident, the read was
	 * started above, so just wait for the read to complete.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, cred, bpp));
	return (biowait(bp));
}

/*
 * We could optimize this by keeping track of where the last read-ahead
 * was, but it would involve adding fields to the vnode.  For now, let's
 * just get it working.
 *
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 *
 * There are 4 or 5 cases depending on how you count:
 *	Desired block is in the cache:
 *	    1 Not sequential access (0 I/Os).
 *	    2 Access is sequential, do read-ahead (1 ASYNC).
 *	Desired block is not in cache:
 *	    3 Not sequential access (1 SYNC).
 *	    4 Sequential access, next block is contiguous (1 SYNC).
 *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
 *
 * There are potentially two buffers that require I/O.
 *	bp is the block requested.
 *	rbp is the read-ahead block.
 *	If either is NULL, then you don't have to do the I/O.
 */
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rbp;
	daddr_t blkno, ioblkno;
	long flags;
	int error, num_ra, alreadyincore;

#ifdef DIAGNOSTIC
	if (size == 0)
		panic("cluster_read: size = 0");
#endif

	error = 0;
	flags = B_READ;
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
	if (bp->b_flags & (B_CACHE | B_DONE | B_DELWRI)) {
		/*
		 * Desired block is in cache; do any readahead ASYNC.
		 * Case 1, 2.
		 */
		trace(TR_BREADHIT, pack(vp, size), lblkno);
		flags |= B_ASYNC;
		ioblkno = lblkno +
		    (lblkno < vp->v_ralen ? vp->v_ralen >> 1 : vp->v_ralen);
		alreadyincore = (int)incore(vp, ioblkno);
		bp = NULL;
	} else {
		/* Block wasn't in cache, case 3, 4, 5. */
		trace(TR_BREADMISS, pack(vp, size), lblkno);
		ioblkno = lblkno;
		bp->b_flags |= flags;
		alreadyincore = 0;
		curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
	}
	/*
	 * XXX
	 * Replace 1 with a window size based on some permutation of
	 * maxcontig and rot_delay.  This will let you figure out how
	 * many blocks you should read-ahead (case 2, 4, 5).
	 *
	 * If the access isn't sequential, cut the window size in half.
	 */
	rbp = NULL;
	if (lblkno != vp->v_lastr + 1 && lblkno != 0)
		vp->v_ralen = max(vp->v_ralen >> 1, 1);
	else if ((ioblkno + 1) * size < filesize && !alreadyincore &&
	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra))) {
		/*
		 * Reading sequentially, and the next block is not in the
		 * cache.  We are going to try reading ahead.  If this is
		 * the first read of a file, then limit read-ahead to a
		 * single block, else read as much as we're allowed.
		 */
		if (num_ra > vp->v_ralen) {
			num_ra = vp->v_ralen;
			vp->v_ralen = min(MAXPHYS / size, vp->v_ralen << 1);
		} else
			vp->v_ralen = num_ra + 1;


		if (num_ra)			/* case 2, 4 */
			rbp = cluster_rbuild(vp, filesize,
			    bp, ioblkno, blkno, size, num_ra, flags);
		else if (lblkno != 0 && ioblkno == lblkno) {
			/* Case 5: check how many blocks to read ahead */
			++ioblkno;
			if ((ioblkno + 1) * size > filesize ||
			    (error = VOP_BMAP(vp,
			    ioblkno, NULL, &blkno, &num_ra)))
				goto skip_readahead;
			flags |= B_ASYNC;
			if (num_ra)
				rbp = cluster_rbuild(vp, filesize,
				    NULL, ioblkno, blkno, size, num_ra, flags);
			else {
				rbp = getblk(vp, ioblkno, size, 0, 0);
				rbp->b_flags |= flags;
				rbp->b_blkno = blkno;
			}
		} else if (lblkno != 0) {
			/* case 2; read ahead single block */
			rbp = getblk(vp, ioblkno, size, 0, 0);
			rbp->b_flags |= flags;
			rbp->b_blkno = blkno;
		} else if (bp)			/* case 1, 3, block 0 */
			bp->b_blkno = blkno;
		/* Case 1 on block 0; not really doing sequential I/O */

		if (rbp == bp)			/* case 4 */
			rbp = NULL;
		else if (rbp) {			/* case 2, 5 */
			trace(TR_BREADMISSRA,
			    pack(vp, (num_ra + 1) * size), ioblkno);
			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		}
	}

	/* XXX Kirk, do we need to make sure the bp has creds? */
skip_readahead:
	if (bp)
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else
			error = VOP_STRATEGY(bp);

	if (rbp)
		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else
			(void) VOP_STRATEGY(rbp);

	if (bp)
		return(biowait(bp));
	return(error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
	struct vnode *vp;
	u_quad_t filesize;
	struct buf *bp;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	long flags;
{
	struct cluster_save *b_save;
	struct buf *tbp;
	daddr_t bn;
	int i, inc;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
	if (size * (lbn + run + 1) > filesize)
		--run;
	if (run == 0) {
		if (!bp) {
			bp = getblk(vp, lbn, size, 0, 0);
			bp->b_blkno = blkno;
			bp->b_flags |= flags;
		}
		return(bp);
	}

	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
	if (bp->b_flags & (B_DONE | B_DELWRI))
		return (bp);

	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bufsize = b_save->bs_bcount = size;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	inc = size / DEV_BSIZE;
	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
		if (incore(vp, lbn + i)) {
			if (i == 1) {
				bp->b_saveaddr = b_save->bs_saveaddr;
				bp->b_flags &= ~B_CALL;
				bp->b_iodone = NULL;
				allocbuf(bp, size);
				free(b_save, M_SEGMENT);
			} else
				allocbuf(bp, size * i);
			break;
		}
		tbp = getblk(vp, lbn + i, 0, 0, 0);
		tbp->b_bcount = tbp->b_bufsize = size;
		tbp->b_blkno = bn;
		{
			daddr_t temp;
			VOP_BMAP(tbp->b_vp, tbp->b_lblkno, NULL, &temp, NULL);
			if (temp != bn) {
				printf("Block: %d Assigned address: %x Bmap address: %x\n",
				    tbp->b_lblkno, tbp->b_blkno, temp);
				panic("cluster_rbuild: wrong disk address");
			}
		}
		tbp->b_flags |= flags | B_READ | B_ASYNC;
		++b_save->bs_nchildren;
		b_save->bs_children[i - 1] = tbp;
	}
	if (!(bp->b_flags & B_ASYNC))
		vp->v_ralen = max(vp->v_ralen - 1, 1);
	return(bp);
}

/*
 * Either get a new buffer or grow the existing one.
 */
struct buf *
cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
	struct vnode *vp;
	struct buf *bp;
	long flags;
	daddr_t blkno;
	daddr_t lblkno;
	long size;
	int run;
{
	if (!bp) {
		bp = getblk(vp, lblkno, size, 0, 0);
		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			bp->b_blkno = blkno;
			return(bp);
		}
	}
	allocbuf(bp, run * size);
	bp->b_blkno = blkno;
	bp->b_iodone = cluster_callback;
	bp->b_flags |= flags | B_CALL;
	return(bp);
}

/*
 * Cleanup after a clustered read or write.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct cluster_save *b_save;
	struct buf **tbp;
	long bsize;
	caddr_t cp;
	daddr_t daddr;
	b_save = (struct cluster_save *)(bp->b_saveaddr);
	bp->b_saveaddr = b_save->bs_saveaddr;

	cp = bp->b_un.b_addr + b_save->bs_bufsize;
	daddr = bp->b_blkno + b_save->bs_bufsize / DEV_BSIZE;
	for (tbp = b_save->bs_children; b_save->bs_nchildren--; ++tbp) {
		pagemove(cp, (*tbp)->b_un.b_addr, (*tbp)->b_bufsize);
		cp += (*tbp)->b_bufsize;
		bp->b_bufsize -= (*tbp)->b_bufsize;
		if ((*tbp)->b_blkno != daddr) {
			struct inode *ip;
			printf("cluster_callback: bad disk address:\n");
			printf("Clustered Block: %d DiskAddr: %x bytes left: %d\n",
			    bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
			printf("\toriginal size: %d flags: %x\n", bp->b_bcount,
			    bp->b_flags);
			printf("Child Block: %d DiskAddr: %x bytes: %d\n",
			    (*tbp)->b_lblkno, (*tbp)->b_blkno,
			    (*tbp)->b_bufsize);
			ip = VTOI((*tbp)->b_vp);
			printf("daddr: %x i_size %qd\n", daddr, ip->i_size);
			if ((*tbp)->b_lblkno < NDADDR)
				printf("Child block pointer from inode: %x\n",
				    ip->i_din.di_db[(*tbp)->b_lblkno]);
			spl0();
			panic ("cluster_callback: bad disk address");
		}
		daddr += (*tbp)->b_bufsize / DEV_BSIZE;
		biodone(*tbp);
	}
#ifdef DIAGNOSTIC
	if (bp->b_bufsize != b_save->bs_bufsize)
		panic ("cluster_callback: more space to reclaim");
#endif
	bp->b_bcount = bp->b_bufsize;
	bp->b_iodone = NULL;
	free(b_save, M_SEGMENT);
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else
		wakeup((caddr_t)bp);
}

/*
 * Synchronous write.
 * Release buffer on completion.
 */
bwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */
	register int flag;
	int s, error = 0;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if (flag & B_ASYNC) {
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
	}
	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	s = splbio();
	bp->b_vp->v_numoutput++;
	bp->b_flags |= B_WRITEINPROG;
	splx(s);
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await I/O completion.
	 * If the write was "delayed", then we put the buffer on
	 * the queue of blocks awaiting I/O completion status.
	 */
	if ((flag & B_ASYNC) == 0) {
		error = biowait(bp);
		if ((flag&B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
		if (bp->b_flags & B_EINTR) {
			bp->b_flags &= ~B_EINTR;
			error = EINTR;
		}
		brelse(bp);
	} else if (flag & B_DELWRI) {
		s = splbio();
		bp->b_flags |= B_AGE;
		splx(s);
	}
	return (error);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}


/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
	}
	/*
	 * If this is a tape drive, the write must be initiated.
	 */
	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
		bawrite(bp);
	} else {
		bp->b_flags |= (B_DONE | B_DELWRI);
		brelse(bp);
	}
}

/*
 * Asynchronous write.
 * Start I/O on a buffer, but do not wait for it to complete.
 * The buffer is released when the I/O completes.
 */
bawrite(bp)
	register struct buf *bp;
{

	/*
	 * Setting the ASYNC flag causes bwrite to return
	 * after starting the I/O.
	 */
	bp->b_flags |= B_ASYNC;
	(void) VOP_BWRITE(bp);
}

/*
 * Do clustered write for FFS.
 *
 * Three cases:
 *	1.	Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int clen;

	vp = bp->b_vp;
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + bp->b_bcount / DEV_BSIZE)) {
		if (vp->v_clen != 0)
			/*
			 * Write is not sequential.
			 */
			cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart,
			    vp->v_lastw - vp->v_cstart + 1, lbn);
		/*
		 * Consider beginning a cluster.
		 */
		if ((lbn + 1) * bp->b_bcount == filesize)
			/* End of file, make cluster as large as possible */
			clen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
		else if (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &clen)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		} else
			clen = 0;
		vp->v_clen = clen;
		if (clen == 0) {		/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {			/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
		    vp->v_clen + 1, lbn);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the
		 * I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}


/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block.
 */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
	struct vnode *vp;
	struct buf *last_bp;
	long size;
	daddr_t start_lbn;
	int len;
	daddr_t lbn;
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp;
	caddr_t cp;
	int i, s;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_wbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
redo:
	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
		++start_lbn;
		--len;
	}

	/* Get more memory for current buffer */
	if (len <= 1) {
		if (last_bp) {
			bawrite(last_bp);
		} else if (len) {
			bp = getblk(vp, start_lbn, size, 0, 0);
			bawrite(bp);
		}
		return;
	}

	bp = getblk(vp, start_lbn, size, 0, 0);
	if (!(bp->b_flags & B_DELWRI)) {
		++start_lbn;
		--len;
		brelse(bp);
		goto redo;
	}

	--len;
	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bcount = bp->b_bcount;
	b_save->bs_bufsize = bp->b_bufsize;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;


	bp->b_flags |= B_CALL;
	bp->b_iodone = cluster_callback;
	cp = bp->b_un.b_addr + bp->b_bufsize;
	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
		if (!incore(vp, start_lbn) || start_lbn == lbn)
			break;

		if (last_bp == NULL || start_lbn != last_bp->b_lblkno) {
			tbp = getblk(vp, start_lbn, size, 0, 0);
#ifdef DIAGNOSTIC
			if (tbp->b_bcount != tbp->b_bufsize)
				panic("cluster_wbuild: Buffer too big");
#endif
			if (!(tbp->b_flags & B_DELWRI)) {
				brelse(tbp);
				break;
			}
		} else
			tbp = last_bp;

		++b_save->bs_nchildren;

		/* Move memory from children to parent */
		if (tbp->b_blkno != (bp->b_blkno + bp->b_bufsize / DEV_BSIZE)) {
			printf("Clustered Block: %d addr %x bufsize: %d\n",
			    bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
			printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
			    tbp->b_blkno);
			panic("Clustered write to wrong blocks");
		}

		pagemove(tbp->b_un.b_daddr, cp, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;

		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= B_ASYNC;
		s = splbio();
		reassignbuf(tbp, tbp->b_vp);	/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		b_save->bs_children[i] = tbp;

		cp += tbp->b_bufsize;
	}

	if (i == 0) {
		/* None to cluster */
		bp->b_saveaddr = b_save->bs_saveaddr;
		bp->b_flags &= ~B_CALL;
		bp->b_iodone = NULL;
		free(b_save, M_SEGMENT);
	}
	bawrite(bp);
	if (i < len) {
		len -= i + 1;
		start_lbn += 1;
		goto redo;
	}
}

/*
 * Release a buffer.
 * Even if the buffer is dirty, no I/O is started.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct queue_entry *flist;
	int s;

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	/*
	 * If a process is waiting for the buffer, or
	 * is waiting for a free buffer, awaken it.
	 */
	if (bp->b_flags & B_WANTED)
		wakeup((caddr_t)bp);
	if (needbuffer) {
		needbuffer = 0;
		wakeup((caddr_t)&needbuffer);
	}
	/*
	 * Retry I/O for locked buffers rather than invalidating them.
	 */
	s = splbio();
	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
		bp->b_flags &= ~B_ERROR;
	/*
	 * Disassociate buffers that are no longer valid.
	 */
	if (bp->b_flags & (B_NOCACHE | B_ERROR))
		bp->b_flags |= B_INVAL;
	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
		if (bp->b_vp)
			brelvp(bp);
		bp->b_flags &= ~B_DELWRI;
	}
	/*
	 * Stick the buffer back on a free list.
	 */
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bufqueues[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bufqueues[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bufqueues[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bufqueues[BQ_AGE];
		else
			flist = &bufqueues[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
	splx(s);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	register struct buf *bp;

	for (bp = BUFHASH(vp, blkno)->le_next; bp; bp = bp->b_hash.qe_next)
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0)
			return (bp);
	return (NULL);
}

/*
 * Check to see if a block is currently memory resident.
 * If it is resident, return it. If it is not resident,
 * allocate a new buffer and assign it to the block.
 */
struct buf *
getblk(vp, blkno, size, slpflag, slptimeo)
	register struct vnode *vp;
	daddr_t blkno;
	int size, slpflag, slptimeo;
{
	register struct buf *bp;
	struct list_entry *dp;
	int s, error;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * Search the cache for the block. If the buffer is found,
	 * but it is currently locked, then we must wait for it to
	 * become available.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->le_next; bp; bp = bp->b_hash.qe_next) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp)
			continue;
		s = splbio();
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = tsleep((caddr_t)bp, slpflag | (PRIBIO + 1),
				"getblk", slptimeo);
			splx(s);
			if (error)
				return (NULL);
			goto loop;
		}
		/*
		 * The test for B_INVAL is moved down here, since there
		 * are cases where B_INVAL is set before VOP_BWRITE() is
		 * called and for NFS, the process cannot be allowed to
		 * allocate a new buffer for the same block until the write
		 * back to the server has been completed. (ie. B_BUSY clears)
		 */
		if (bp->b_flags & B_INVAL) {
			splx(s);
			continue;
		}
		bremfree(bp);
		bp->b_flags |= B_BUSY;
		splx(s);
		if (bp->b_bcount != size) {
			printf("getblk: stray size");
			bp->b_flags |= B_INVAL;
			VOP_BWRITE(bp);
			goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	/*
	 * The loop back to the top when getnewbuf() fails is because
	 * stateless filesystems like NFS have no node locks. Thus,
	 * there is a slight chance that more than one process will
	 * try and getnewbuf() for the same block concurrently when
	 * the first sleeps in getnewbuf(). So after a sleep, go back
	 * up to the top to check the hash lists again.
	 */
	if ((bp = getnewbuf(slpflag, slptimeo)) == 0)
		goto loop;
	bremhash(bp);
	bgetvp(vp, bp);
	bp->b_bcount = 0;
	bp->b_lblkno = blkno;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Allocate a buffer.
 * The caller will assign it to a block.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
	while ((bp = getnewbuf(0, 0)) == NULL)
		/* void */;
	bp->b_flags |= B_INVAL;
	bremhash(bp);
	binshash(bp, &invalhash);
	bp->b_bcount = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	allocbuf(bp, size);
	return (bp);
}

/*
 * Expand or contract the actual memory allocated to a buffer.
 * If no memory is available, release buffer and take error exit.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take, s;

	sizealloc = roundup(size, CLBYTES);
	/*
	 * Buffer size does not change
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		if ((ep = bufqueues[BQ_EMPTY].qe_next) == NULL)
			goto out;
		s = splbio();
		bremfree(ep);
		ep->b_flags |= B_BUSY;
		splx(s);
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
		    (int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed. Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		while ((bp = getnewbuf(0, 0)) == NULL)
			/* void */;
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
		    &tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		if (bp->b_bufsize <= 0) {
			bremhash(bp);
			binshash(bp, &invalhash);
			bp->b_dev = NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
	return (1);
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf(slpflag, slptimeo)
	int slpflag, slptimeo;
{
	register struct buf *bp;
	register struct queue_entry *dp;
	register struct ucred *cred;
	int s;
	struct buf *abp;
	static int losecnt = 0;

loop:
	s = splbio();
	abp = NULL;
	for (dp = &bufqueues[BQ_AGE]; dp > bufqueues; dp--) {
		for (bp = dp->qe_next; bp; bp = bp->b_freelist.qe_next) {
			if (abp == NULL)
				abp = bp;
			if ((bp->b_flags & B_DELWRI) &&
			    bp->b_vp && VOP_ISLOCKED(bp->b_vp))
				continue;
			goto found;
		}
	}
	if (dp == bufqueues) {		/* no free blocks */
		if (abp) {
			bp = abp;
			bp->b_flags |= B_XXX;
			if (losecnt++ < 20) {
				vprint("skipping blkno check", bp->b_vp);
				printf("\tlblkno %d, blkno %d\n",
				    bp->b_lblkno, bp->b_blkno);
			}
			goto found;
		}
		needbuffer = 1;
		(void) tsleep((caddr_t)&needbuffer, slpflag | (PRIBIO + 1),
			"getnewbuf", slptimeo);
		splx(s);
		return (NULL);
	}
found:
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	splx(s);
	if (bp->b_flags & B_DELWRI) {
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	bp->b_flags = B_BUSY;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Wait for I/O to complete.
 *
 * Extract and return any errors associated with the I/O.
 * If the error flag is set, but no specific error is
 * given, return EIO.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so. Otherwise, awaken waiting processes.
 */
void
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

int
count_lock_queue()
{
	register struct buf *bp;
	register int ret;

	for (ret = 0, bp = (struct buf *)bufqueues[BQ_LOCKED].qe_next;
	    bp; bp = (struct buf *)bp->b_freelist.qe_next)
		++ret;
	return(ret);
}

#ifdef DIAGNOSTIC
/*
 * Print out statistics on the current allocation of the buffer pool.
 * Can be enabled to print out on every ``sync'' by setting "syncprt"
 * above.
 */
void
vfs_bufstats()
{
	int s, i, j, count;
	register struct buf *bp;
	register struct queue_entry *dp;
	int counts[MAXBSIZE/CLBYTES+1];
	static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" };

	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
		count = 0;
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			counts[j] = 0;
		s = splbio();
		for (bp = dp->qe_next; bp; bp = bp->b_freelist.qe_next) {
			counts[bp->b_bufsize/CLBYTES]++;
			count++;
		}
		splx(s);
		printf("%s: total-%d", bname[i], count);
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			if (counts[j] != 0)
				printf(", %d-%d", j * CLBYTES, counts[j]);
		printf("\n");
	}
}
#endif /* DIAGNOSTIC */