/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 * Modifications/enhancements:
 *	Copyright (c) 1995 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
 * $FreeBSD: src/sys/kern/vfs_cluster.c,v 1.92.2.9 2001/11/18 07:10:59 dillon Exp $
 * $DragonFly: src/sys/kern/vfs_cluster.c,v 1.40 2008/07/14 03:09:00 dillon Exp $
 */

#include "opt_debug_cluster.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <sys/sysctl.h>

#include <sys/buf2.h>
#include <vm/vm_page2.h>

#include <machine/limits.h>

#if defined(CLUSTERDEBUG)
#include <sys/sysctl.h>
static int rcluster = 0;
SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "");
#endif

static MALLOC_DEFINE(M_SEGMENT, "cluster_save", "cluster_save buffer");

static struct cluster_save *
	cluster_collectbufs (struct vnode *vp, struct buf *last_bp,
			     int blksize);
static struct buf *
	cluster_rbuild (struct vnode *vp, off_t filesize, off_t loffset,
			off_t doffset, int blksize, int run,
			struct buf *fbp);
static void cluster_callback (struct bio *);
static void cluster_setram (struct buf *);

static int write_behind = 1;
SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0,
    "Cluster write-behind setting");
static int max_readahead = 2 * 1024 * 1024;
SYSCTL_INT(_vfs, OID_AUTO, max_readahead, CTLFLAG_RW, &max_readahead, 0,
    "Limit in bytes for desired cluster read-ahead");

extern vm_page_t bogus_page;

extern int cluster_pbuf_freecnt;

/*
 * This replaces bread.
 *
 *	filesize	- read-ahead @ blksize will not cross this boundary
 *	loffset		- loffset for returned *bpp
 *	blksize		- blocksize for returned *bpp and read-ahead bps
 *	minreq		- minimum (not a hard minimum) in bytes, typically
 *			  reflects a higher level uio resid.
 *	maxreq		- maximum (sequential heuristic) in bytes
 *			  (highest typically ~2MB)
 *	bpp		- return buffer (*bpp) for (loffset,blksize)
 */
int
cluster_readx(struct vnode *vp, off_t filesize, off_t loffset,
	      int blksize, size_t minreq, size_t maxreq, struct buf **bpp)
{
	struct buf *bp, *rbp, *reqbp;
	off_t origoffset;
	off_t doffset;
	int error;
	int i;
	int maxra;
	int maxrbuild;

	error = 0;

	/*
	 * Calculate the desired read-ahead in blksize'd blocks (maxra).
	 * To do this we calculate maxreq.
	 *
	 * maxreq typically starts out as a sequential heuristic.  If the
	 * high level uio/resid is bigger (minreq), we pop maxreq up to
	 * minreq.  This represents the case where random I/O is being
	 * performed by userland issuing big read()'s.
	 *
	 * Then we limit maxreq to max_readahead to ensure it is a reasonable
	 * value.
	 *
	 * Finally we must ensure that (loffset + maxreq) does not cross the
	 * boundary (filesize) for the current blocksize.  If we allowed it
	 * to cross we could end up with buffers past the boundary with the
	 * wrong block size (HAMMER large-data areas use mixed block sizes).
	 * minreq is also absolutely limited to filesize.
	 */
	if (maxreq < minreq)
		maxreq = minreq;
	/* minreq not used beyond this point */

	if (maxreq > max_readahead) {
		maxreq = max_readahead;
		if (maxreq > 16 * 1024 * 1024)
			maxreq = 16 * 1024 * 1024;
	}
	if (maxreq < blksize)
		maxreq = blksize;
	if (loffset + maxreq > filesize) {
		if (loffset > filesize)
			maxreq = 0;
		else
			maxreq = filesize - loffset;
	}

	maxra = (int)(maxreq / blksize);

	/*
	 * Get the requested block.
	 */
	if (*bpp)
		reqbp = bp = *bpp;
	else
		*bpp = reqbp = bp = getblk(vp, loffset, blksize, 0, 0);
	origoffset = loffset;

	/*
	 * Calculate the maximum cluster size for a single I/O, used
	 * by cluster_rbuild().
	 */
	maxrbuild = vmaxiosize(vp) / blksize;

	/*
	 * If it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
		/*
		 * Not sequential, do not do any read-ahead
		 */
		if (maxra <= 1)
			return 0;

		/*
		 * No read-ahead mark, do not do any read-ahead
		 * yet.
		 */
		if ((bp->b_flags & B_RAM) == 0)
			return 0;

		/*
		 * We hit a read-ahead-mark, figure out how much read-ahead
		 * to do (maxra) and where to start (loffset).
		 *
		 * Shortcut the scan.  Typically the way this works is that
		 * we've built up all the blocks in between except for the
		 * last in previous iterations, so if the second-to-last
		 * block is present we just skip ahead to it.
		 *
		 * This algorithm has O(1) cpu in the steady state no
		 * matter how large maxra is.
		 */
		bp->b_flags &= ~B_RAM;

		if (findblk(vp, loffset + (maxra - 2) * blksize, FINDBLK_TEST))
			i = maxra - 1;
		else
			i = 1;
		while (i < maxra) {
			if (findblk(vp, loffset + i * blksize,
				    FINDBLK_TEST) == NULL) {
				break;
			}
			++i;
		}

		/*
		 * We got everything or everything is in the cache, no
		 * point continuing.
		 */
		if (i >= maxra)
			return 0;
		maxra -= i;
		loffset += i * blksize;
		reqbp = bp = NULL;
	} else {
		__debugvar off_t firstread = bp->b_loffset;
		int nblks;

		/*
		 * Set-up synchronous read for bp.
		 */
		bp->b_cmd = BUF_CMD_READ;
		bp->b_bio1.bio_done = biodone_sync;
		bp->b_bio1.bio_flags |= BIO_SYNC;

		KASSERT(firstread != NOOFFSET,
			("cluster_read: no buffer offset"));

		/*
		 * nblks is our cluster_rbuild request size, limited
		 * primarily by the device.
		 */
		if ((nblks = maxra) > maxrbuild)
			nblks = maxrbuild;

		if (nblks > 1) {
			int burstbytes;

			error = VOP_BMAP(vp, loffset, &doffset,
					 &burstbytes, NULL, BUF_CMD_READ);
			if (error)
				goto single_block_read;
			if (nblks > burstbytes / blksize)
				nblks = burstbytes / blksize;
			if (doffset == NOOFFSET)
				goto single_block_read;
			if (nblks <= 1)
				goto single_block_read;

			bp = cluster_rbuild(vp, filesize, loffset,
					    doffset, blksize, nblks, bp);
			loffset += bp->b_bufsize;
			maxra -= bp->b_bufsize / blksize;
		} else {
single_block_read:
			/*
			 * If it isn't in the cache, then get a chunk from
			 * disk if sequential, otherwise just get the block.
			 */
			cluster_setram(bp);
			loffset += blksize;
			--maxra;
		}
	}

	/*
	 * If B_CACHE was not set issue bp.  bp will either be an
	 * asynchronous cluster buf or a synchronous single-buf.
	 * If it is a single buf it will be the same as reqbp.
	 *
	 * NOTE: Once an async cluster buf is issued bp becomes invalid.
	 */
	if (bp) {
#if defined(CLUSTERDEBUG)
		if (rcluster)
			kprintf("S(%012jx,%d,%d)\n",
			    (intmax_t)bp->b_loffset, bp->b_bcount, maxra);
#endif
		if ((bp->b_flags & B_CLUSTER) == 0)
			vfs_busy_pages(vp, bp);
		bp->b_flags &= ~(B_ERROR|B_INVAL);
		vn_strategy(vp, &bp->b_bio1);
		error = 0;
		/* bp invalid now */
	}

	/*
	 * If we have been doing sequential I/O, then do some read-ahead.
	 * The code above us should have positioned us at the next likely
	 * offset.
	 *
	 * Only mess with buffers which we can immediately lock.  HAMMER
	 * will do device-readahead irrespective of what the blocks
	 * represent.
	 */
	while (error == 0 && maxra > 0) {
		int burstbytes;
		int tmp_error;
		int nblks;

		rbp = getblk(vp, loffset, blksize,
			     GETBLK_SZMATCH|GETBLK_NOWAIT, 0);
		if (rbp == NULL)
			goto no_read_ahead;
		if ((rbp->b_flags & B_CACHE)) {
			bqrelse(rbp);
			goto no_read_ahead;
		}

		/*
		 * An error from the read-ahead bmap has nothing to do
		 * with the caller's original request.
		 */
		tmp_error = VOP_BMAP(vp, loffset, &doffset,
				     &burstbytes, NULL, BUF_CMD_READ);
		if (tmp_error || doffset == NOOFFSET) {
			rbp->b_flags |= B_INVAL;
			brelse(rbp);
			rbp = NULL;
			goto no_read_ahead;
		}
		if ((nblks = maxra) > maxrbuild)
			nblks = maxrbuild;
		if (nblks > burstbytes / blksize)
			nblks = burstbytes / blksize;

		/*
		 * rbp: async read
		 */
		rbp->b_cmd = BUF_CMD_READ;
		/*rbp->b_flags |= B_AGE*/;
		cluster_setram(rbp);

		if (nblks > 1) {
			rbp = cluster_rbuild(vp, filesize, loffset,
					     doffset, blksize,
					     nblks, rbp);
		} else {
			rbp->b_bio2.bio_offset = doffset;
		}

#if defined(CLUSTERDEBUG)
		if (rcluster) {
			if (bp) {
				kprintf("A+(%012jx,%d,%jd) "
					"doff=%012jx minr=%zd ra=%d\n",
				    (intmax_t)loffset, rbp->b_bcount,
				    (intmax_t)(loffset - origoffset),
				    (intmax_t)doffset, minreq, maxra);
			} else {
				kprintf("A-(%012jx,%d,%jd) "
					"doff=%012jx minr=%zd ra=%d\n",
				    (intmax_t)rbp->b_loffset, rbp->b_bcount,
				    (intmax_t)(loffset - origoffset),
				    (intmax_t)doffset, minreq, maxra);
			}
		}
#endif
		rbp->b_flags &= ~(B_ERROR|B_INVAL);

		if ((rbp->b_flags & B_CLUSTER) == 0)
			vfs_busy_pages(vp, rbp);
		BUF_KERNPROC(rbp);
		loffset += rbp->b_bufsize;
		maxra -= rbp->b_bufsize / blksize;
		vn_strategy(vp, &rbp->b_bio1);
		/* rbp invalid now */
	}

	/*
	 * Wait for our original buffer to complete its I/O.  reqbp will
	 * be NULL if the original buffer was B_CACHE.  We are returning
	 * (*bpp) which is the same as reqbp when reqbp != NULL.
	 */
no_read_ahead:
	if (reqbp) {
		KKASSERT(reqbp->b_bio1.bio_flags & BIO_SYNC);
		error = biowait(&reqbp->b_bio1, "clurd");
	}
	return (error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 *
 * This function either returns a cluster buf or it returns fbp.  fbp is
 * already expected to be set up as a synchronous or asynchronous request.
 *
 * If a cluster buf is returned it will always be async.
 */
static struct buf *
cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset, off_t doffset,
	       int blksize, int run, struct buf *fbp)
{
	struct buf *bp, *tbp;
	off_t boffset;
	int i, j;
	int maxiosize = vmaxiosize(vp);

	/*
	 * avoid a division
	 */
	while (loffset + run * blksize > filesize) {
		--run;
	}

	tbp = fbp;
	tbp->b_bio2.bio_offset = doffset;
	if ((tbp->b_flags & B_MALLOC) ||
	    ((tbp->b_flags & B_VMIO) == 0) || (run <= 1)) {
		return tbp;
	}

	bp = trypbuf_kva(&cluster_pbuf_freecnt);
	if (bp == NULL) {
		return tbp;
	}

	/*
	 * We are synthesizing a buffer out of vm_page_t's, but
	 * if the block size is not page aligned then the starting
	 * address may not be either.  Inherit the b_data offset
	 * from the original buffer.
	 */
	bp->b_data = (char *)((vm_offset_t)bp->b_data |
	    ((vm_offset_t)tbp->b_data & PAGE_MASK));
	bp->b_flags |= B_CLUSTER | B_VMIO;
	bp->b_cmd = BUF_CMD_READ;
	bp->b_bio1.bio_done = cluster_callback;		/* default to async */
	bp->b_bio1.bio_caller_info1.cluster_head = NULL;
	bp->b_bio1.bio_caller_info2.cluster_tail = NULL;
	bp->b_loffset = loffset;
	bp->b_bio2.bio_offset = doffset;
	KASSERT(bp->b_loffset != NOOFFSET,
		("cluster_rbuild: no buffer offset"));

	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_xio.xio_npages = 0;

	for (boffset = doffset, i = 0; i < run; ++i, boffset += blksize) {
		if (i) {
			if ((bp->b_xio.xio_npages * PAGE_SIZE) +
			    round_page(blksize) > maxiosize) {
				break;
			}

			/*
			 * Shortcut some checks and try to avoid buffers that
			 * would block in the lock.  The same checks have to
			 * be made again after we officially get the buffer.
			 */
			tbp = getblk(vp, loffset + i * blksize, blksize,
				     GETBLK_SZMATCH|GETBLK_NOWAIT, 0);
			if (tbp == NULL)
				break;
			for (j = 0; j < tbp->b_xio.xio_npages; j++) {
				if (tbp->b_xio.xio_pages[j]->valid)
					break;
			}
			if (j != tbp->b_xio.xio_npages) {
				bqrelse(tbp);
				break;
			}

			/*
			 * Stop scanning if the buffer is fully valid
			 * (marked B_CACHE), or locked (may be doing a
			 * background write), or if the buffer is not
			 * VMIO backed.  The clustering code can only deal
			 * with VMIO-backed buffers.
			 */
			if ((tbp->b_flags & (B_CACHE|B_LOCKED)) ||
			    (tbp->b_flags & B_VMIO) == 0 ||
			    (LIST_FIRST(&tbp->b_dep) != NULL &&
			     buf_checkread(tbp))
			) {
				bqrelse(tbp);
				break;
			}

			/*
			 * The buffer must be completely invalid in order to
			 * take part in the cluster.  If it is partially valid
			 * then we stop.
			 */
			for (j = 0; j < tbp->b_xio.xio_npages; j++) {
				if (tbp->b_xio.xio_pages[j]->valid)
					break;
			}
			if (j != tbp->b_xio.xio_npages) {
				bqrelse(tbp);
				break;
			}

			/*
			 * Set a read-ahead mark as appropriate
			 */
			if (i == 1 || i == (run - 1))
				cluster_setram(tbp);

			/*
			 * Depress the priority of buffers not explicitly
			 * requested.
			 */
			/* tbp->b_flags |= B_AGE; */

			/*
			 * Set the block number if it isn't set, otherwise
			 * if it is make sure it matches the block number we
			 * expect.
			 */
			if (tbp->b_bio2.bio_offset == NOOFFSET) {
				tbp->b_bio2.bio_offset = boffset;
			} else if (tbp->b_bio2.bio_offset != boffset) {
				brelse(tbp);
				break;
			}
		}

		/*
		 * The passed-in tbp (i == 0) will already be set up for
		 * async or sync operation.  All other tbp's acquired in
		 * our loop are set up for async operation.
		 */
		tbp->b_cmd = BUF_CMD_READ;
		BUF_KERNPROC(tbp);
		cluster_append(&bp->b_bio1, tbp);
		for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
			vm_page_t m;

			m = tbp->b_xio.xio_pages[j];
			vm_page_busy_wait(m, FALSE, "clurpg");
			vm_page_io_start(m);
			vm_page_wakeup(m);
			vm_object_pip_add(m->object, 1);
			if ((bp->b_xio.xio_npages == 0) ||
			    (bp->b_xio.xio_pages[bp->b_xio.xio_npages-1] != m)) {
				bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
				bp->b_xio.xio_npages++;
			}
			if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
				tbp->b_xio.xio_pages[j] = bogus_page;
		}
		/*
		 * XXX shouldn't this be += size for both, like in
		 * cluster_wbuild()?
		 *
		 * Don't inherit tbp->b_bufsize as it may be larger due to
		 * a non-page-aligned size.  Instead just aggregate using
		 * 'size'.
		 */
		if (tbp->b_bcount != blksize)
			kprintf("warning: tbp->b_bcount wrong %d vs %d\n",
				tbp->b_bcount, blksize);
		if (tbp->b_bufsize != blksize)
			kprintf("warning: tbp->b_bufsize wrong %d vs %d\n",
				tbp->b_bufsize, blksize);
		bp->b_bcount += blksize;
		bp->b_bufsize += blksize;
	}

	/*
	 * Fully valid pages in the cluster are already good and do not need
	 * to be re-read from disk.  Replace the page with bogus_page
	 */
	for (j = 0; j < bp->b_xio.xio_npages; j++) {
		if ((bp->b_xio.xio_pages[j]->valid & VM_PAGE_BITS_ALL) ==
		    VM_PAGE_BITS_ALL) {
			bp->b_xio.xio_pages[j] = bogus_page;
		}
	}
	if (bp->b_bufsize > bp->b_kvasize) {
		panic("cluster_rbuild: b_bufsize(%d) > b_kvasize(%d)",
		      bp->b_bufsize, bp->b_kvasize);
	}
	pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
		    (vm_page_t *)bp->b_xio.xio_pages, bp->b_xio.xio_npages);
	BUF_KERNPROC(bp);
	return (bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 *
 * The returned bio is &bp->b_bio1
 */
void
cluster_callback(struct bio *bio)
{
	struct buf *bp = bio->bio_buf;
	struct buf *tbp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.  A short read (EOF)
	 * is a critical error.
	 */
	if (bp->b_flags & B_ERROR) {
		error = bp->b_error;
	} else if (bp->b_bcount != bp->b_bufsize) {
		panic("cluster_callback: unexpected EOF on cluster %p!", bio);
	}

	pmap_qremove(trunc_page((vm_offset_t)bp->b_data),
		     bp->b_xio.xio_npages);
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.  Since the memory map
	 * is the same, no actual copying is required.
	 */
	while ((tbp = bio->bio_caller_info1.cluster_head) != NULL) {
		bio->bio_caller_info1.cluster_head = tbp->b_cluster_next;
		if (error) {
			tbp->b_flags |= B_ERROR | B_IODEBUG;
			tbp->b_error = error;
		} else {
			tbp->b_dirtyoff = tbp->b_dirtyend = 0;
			tbp->b_flags &= ~(B_ERROR|B_INVAL);
			tbp->b_flags |= B_IODEBUG;
			/*
			 * XXX the bdwrite()/bqrelse() issued during
			 * cluster building clears B_RELBUF (see bqrelse()
			 * comment).  If direct I/O was specified, we have
			 * to restore it here to allow the buffer and VM
			 * to be freed.
			 */
			if (tbp->b_flags & B_DIRECT)
				tbp->b_flags |= B_RELBUF;
		}
		biodone(&tbp->b_bio1);
	}
	relpbuf(bp, &cluster_pbuf_freecnt);
}

/*
 * cluster_wbuild_wb:
 *
 *	Implement modified write build for cluster.
 *
 *	write_behind = 0	write behind disabled
 *	write_behind = 1	write behind normal (default)
 *	write_behind = 2	write behind backed-off
 */

static __inline int
cluster_wbuild_wb(struct vnode *vp, int blksize, off_t start_loffset, int len)
{
	int r = 0;

	switch(write_behind) {
	case 2:
		if (start_loffset < len)
			break;
		start_loffset -= len;
		/* fall through */
	case 1:
		r = cluster_wbuild(vp, blksize, start_loffset, len);
		/* fall through */
	default:
		/* fall through */
		break;
	}
	return(r);
}

/*
 * Do clustered write for FFS.
 *
 * Three cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(struct buf *bp, off_t filesize, int blksize, int seqcount)
{
	struct vnode *vp;
	off_t loffset;
	int maxclen, cursize;
	int async;

	vp = bp->b_vp;
	if (vp->v_type == VREG)
		async = vp->v_mount->mnt_flag & MNT_ASYNC;
	else
		async = 0;
	loffset = bp->b_loffset;
	KASSERT(bp->b_loffset != NOOFFSET,
		("cluster_write: no buffer offset"));

	/* Initialize vnode to beginning of file. */
	if (loffset == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || loffset != vp->v_lastw + blksize ||
	    bp->b_bio2.bio_offset == NOOFFSET ||
	    (bp->b_bio2.bio_offset != vp->v_lasta + blksize)) {
		maxclen = vmaxiosize(vp);
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its last
			 * write, or we have reached our maximum cluster size,
			 * then push the previous cluster.  Otherwise try
			 * reallocating to make it sequential.
			 *
			 * Change to algorithm: only push previous cluster if
			 * it was sequential from the point of view of the
			 * seqcount heuristic, otherwise leave the buffer
			 * intact so we can potentially optimize the I/O
			 * later on in the buf_daemon or update daemon
			 * flush.
			 */
			cursize = vp->v_lastw - vp->v_cstart + blksize;
			if (bp->b_loffset + blksize != filesize ||
			    loffset != vp->v_lastw + blksize ||
			    vp->v_clen <= cursize) {
				if (!async && seqcount > 0) {
					cluster_wbuild_wb(vp, blksize,
						vp->v_cstart, cursize);
				}
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp, blksize);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster
					 * if *really* writing sequentially
					 * in the logical file (seqcount > 1),
					 * otherwise delay it in the hopes that
					 * the low level disk driver can
					 * optimize the write ordering.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					kfree(buflist, M_SEGMENT);
					if (seqcount > 1) {
						cluster_wbuild_wb(vp,
						    blksize, vp->v_cstart,
						    cursize);
					}
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					kfree(buflist, M_SEGMENT);
					vp->v_lastw = loffset;
					vp->v_lasta = bp->b_bio2.bio_offset;
					return;
				}
			}
		}
		/*
		 * Consider beginning a cluster.  If at end of file, make
		 * cluster as large as possible, otherwise find size of
		 * existing cluster.
		 */
		if ((vp->v_type == VREG) &&
		    bp->b_loffset + blksize != filesize &&
		    (bp->b_bio2.bio_offset == NOOFFSET) &&
		    (VOP_BMAP(vp, loffset, &bp->b_bio2.bio_offset,
			      &maxclen, NULL, BUF_CMD_WRITE) ||
		     bp->b_bio2.bio_offset == NOOFFSET)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_bio2.bio_offset;
			vp->v_cstart = loffset + blksize;
			vp->v_lastw = loffset;
			return;
		}
		if (maxclen > blksize)
			vp->v_clen = maxclen - blksize;
		else
			vp->v_clen = 0;
		if (!async && vp->v_clen == 0) {	/* I/O not contiguous */
			vp->v_cstart = loffset + blksize;
			bawrite(bp);
		} else {	/* Wait for rest of cluster */
			vp->v_cstart = loffset;
			bdwrite(bp);
		}
	} else if (loffset == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out if seqcount tells us we
		 * are operating sequentially, otherwise let the buf or
		 * update daemon handle it.
		 */
		bdwrite(bp);
		if (seqcount > 1)
			cluster_wbuild_wb(vp, blksize, vp->v_cstart,
					  vp->v_clen + blksize);
		vp->v_clen = 0;
		vp->v_cstart = loffset + blksize;
	} else if (vm_page_count_severe()) {
		/*
		 * We are low on memory, get it going NOW
		 */
		bawrite(bp);
	} else {
		/*
		 * In the middle of a cluster, so just delay the I/O for now.
		 */
		bdwrite(bp);
	}
	vp->v_lastw = loffset;
	vp->v_lasta = bp->b_bio2.bio_offset;
}

/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block (if last_bp == NULL).
 */
int
cluster_wbuild(struct vnode *vp, int blksize, off_t start_loffset, int bytes)
{
	struct buf *bp, *tbp;
	int i, j;
	int totalwritten = 0;
	int maxiosize = vmaxiosize(vp);

	while (bytes > 0) {
		/*
		 * If the buffer is not delayed-write (i.e. dirty), or it
		 * is delayed-write but either locked or inval, it cannot
		 * partake in the clustered write.
		 */
		tbp = findblk(vp, start_loffset, FINDBLK_NBLOCK);
		if (tbp == NULL ||
		    (tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) !=
		     B_DELWRI ||
		    (LIST_FIRST(&tbp->b_dep) && buf_checkwrite(tbp))) {
			if (tbp)
				BUF_UNLOCK(tbp);
			start_loffset += blksize;
			bytes -= blksize;
			continue;
		}
		bremfree(tbp);
		KKASSERT(tbp->b_cmd == BUF_CMD_DONE);

		/*
		 * Extra memory in the buffer, punt on this buffer.
		 * XXX we could handle this in most cases, but we would
		 * have to push the extra memory down to after our max
		 * possible cluster size and then potentially pull it back
		 * up if the cluster was terminated prematurely--too much
		 * hassle.
		 */
		if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) ||
		    (tbp->b_bcount != tbp->b_bufsize) ||
		    (tbp->b_bcount != blksize) ||
		    (bytes == blksize) ||
		    ((bp = getpbuf_kva(&cluster_pbuf_freecnt)) == NULL)) {
			totalwritten += tbp->b_bufsize;
			bawrite(tbp);
			start_loffset += blksize;
			bytes -= blksize;
			continue;
		}

		/*
		 * Set up the pbuf.  Track our append point with b_bcount
		 * and b_bufsize.  b_bufsize is not used by the device but
		 * our caller uses it to loop clusters and we use it to
		 * detect a premature EOF on the block device.
		 */
		bp->b_bcount = 0;
		bp->b_bufsize = 0;
		bp->b_xio.xio_npages = 0;
		bp->b_loffset = tbp->b_loffset;
		bp->b_bio2.bio_offset = tbp->b_bio2.bio_offset;

		/*
		 * We are synthesizing a buffer out of vm_page_t's, but
		 * if the block size is not page aligned then the starting
		 * address may not be either.  Inherit the b_data offset
		 * from the original buffer.
		 */
		bp->b_data = (char *)((vm_offset_t)bp->b_data |
		    ((vm_offset_t)tbp->b_data & PAGE_MASK));
		bp->b_flags &= ~B_ERROR;
		bp->b_flags |= B_CLUSTER | B_BNOCLIP |
			(tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
		bp->b_bio1.bio_caller_info1.cluster_head = NULL;
		bp->b_bio1.bio_caller_info2.cluster_tail = NULL;

		/*
		 * From this location in the file, scan forward to see
		 * if there are buffers with adjacent data that need to
		 * be written as well.
		 */
		for (i = 0; i < bytes; (i += blksize), (start_loffset += blksize)) {
			if (i != 0) { /* If not the first buffer */
				tbp = findblk(vp, start_loffset,
					      FINDBLK_NBLOCK);
				/*
				 * Buffer not found or could not be locked
				 * non-blocking.
				 */
				if (tbp == NULL)
					break;

				/*
				 * If it IS in core, but has different
				 * characteristics, then don't cluster
				 * with it.
				 */
				if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
				     B_INVAL | B_DELWRI | B_NEEDCOMMIT))
				    != (B_DELWRI | B_CLUSTEROK |
				     (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
				    (tbp->b_flags & B_LOCKED) ||
				    (LIST_FIRST(&tbp->b_dep) &&
				     buf_checkwrite(tbp))
				) {
					BUF_UNLOCK(tbp);
					break;
				}

				/*
				 * Check that the combined cluster
				 * would make sense with regard to pages
				 * and would not be too large
				 */
				if ((tbp->b_bcount != blksize) ||
				    ((bp->b_bio2.bio_offset + i) !=
				      tbp->b_bio2.bio_offset) ||
				    ((tbp->b_xio.xio_npages + bp->b_xio.xio_npages) >
				      (maxiosize / PAGE_SIZE))) {
					BUF_UNLOCK(tbp);
					break;
				}
				/*
				 * Ok, it's passed all the tests,
				 * so remove it from the free list
				 * and mark it busy. We will use it.
				 */
				bremfree(tbp);
				KKASSERT(tbp->b_cmd == BUF_CMD_DONE);
			} /* end of code for non-first buffers only */

			/*
			 * If the IO is via the VM then we do some
			 * special VM hackery (yuck).  Since the buffer's
			 * block size may not be page-aligned it is possible
			 * for a page to be shared between two buffers.  We
			 * have to get rid of the duplication when building
			 * the cluster.
			 */
			if (tbp->b_flags & B_VMIO) {
				vm_page_t m;

				if (i != 0) { /* if not first buffer */
					for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
						m = tbp->b_xio.xio_pages[j];
						if (m->flags & PG_BUSY) {
							bqrelse(tbp);
							goto finishcluster;
						}
					}
				}

				for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
					m = tbp->b_xio.xio_pages[j];
					vm_page_busy_wait(m, FALSE, "clurpg");
					vm_page_io_start(m);
					vm_page_wakeup(m);
					vm_object_pip_add(m->object, 1);
					if ((bp->b_xio.xio_npages == 0) ||
					    (bp->b_xio.xio_pages[bp->b_xio.xio_npages - 1] != m)) {
						bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
						bp->b_xio.xio_npages++;
					}
				}
			}
			bp->b_bcount += blksize;
			bp->b_bufsize += blksize;

			bundirty(tbp);
			tbp->b_flags &= ~B_ERROR;
			tbp->b_cmd = BUF_CMD_WRITE;
			BUF_KERNPROC(tbp);
			cluster_append(&bp->b_bio1, tbp);

			/*
			 * check for latent dependencies to be handled
			 */
			if (LIST_FIRST(&tbp->b_dep) != NULL)
				buf_start(tbp);
		}
	finishcluster:
		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
			    (vm_page_t *)bp->b_xio.xio_pages,
			    bp->b_xio.xio_npages);
		if (bp->b_bufsize > bp->b_kvasize) {
			panic("cluster_wbuild: b_bufsize(%d) > b_kvasize(%d)\n",
			      bp->b_bufsize, bp->b_kvasize);
		}
		totalwritten += bp->b_bufsize;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bufsize;
		bp->b_bio1.bio_done = cluster_callback;
		bp->b_cmd = BUF_CMD_WRITE;

		vfs_busy_pages(vp, bp);
		bsetrunningbufspace(bp, bp->b_bufsize);
		BUF_KERNPROC(bp);
		vn_strategy(vp, &bp->b_bio1);

		bytes -= i;
	}
	return totalwritten;
}

/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 */
static struct cluster_save *
cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int blksize)
{
	struct cluster_save *buflist;
	struct buf *bp;
	off_t loffset;
	int i, len;

	len = (int)(vp->v_lastw - vp->v_cstart + blksize) / blksize;
	buflist = kmalloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
			  M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **)(buflist + 1);
	for (loffset = vp->v_cstart, i = 0; i < len; (loffset += blksize), i++) {
		(void)bread(vp, loffset, last_bp->b_bcount, &bp);
		buflist->bs_children[i] = bp;
		if (bp->b_bio2.bio_offset == NOOFFSET) {
			VOP_BMAP(bp->b_vp, bp->b_loffset,
				 &bp->b_bio2.bio_offset,
				 NULL, NULL, BUF_CMD_WRITE);
		}
	}
	buflist->bs_children[i] = bp = last_bp;
	if (bp->b_bio2.bio_offset == NOOFFSET) {
		VOP_BMAP(bp->b_vp, bp->b_loffset, &bp->b_bio2.bio_offset,
			 NULL, NULL, BUF_CMD_WRITE);
	}
	buflist->bs_nchildren = i + 1;
	return (buflist);
}

/*
 * Append tbp to the cluster list hung off the bio, using the
 * caller_info head/tail pointers.
 */
void
cluster_append(struct bio *bio, struct buf *tbp)
{
	tbp->b_cluster_next = NULL;
	if (bio->bio_caller_info1.cluster_head == NULL) {
		bio->bio_caller_info1.cluster_head = tbp;
		bio->bio_caller_info2.cluster_tail = tbp;
	} else {
		bio->bio_caller_info2.cluster_tail->b_cluster_next = tbp;
		bio->bio_caller_info2.cluster_tail = tbp;
	}
}

/*
 * Set the read-ahead mark on the buffer and on its first page.
 */
static void
cluster_setram(struct buf *bp)
{
	bp->b_flags |= B_RAM;
	if (bp->b_xio.xio_npages)
		vm_page_flag_set(bp->b_xio.xio_pages[0], PG_RAM);
}