/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 * Modifications/enhancements:
 *	Copyright (c) 1995 John S. Dyson.  All rights reserved.
 * Copyright (c) 2012-2013 Matthew Dillon.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "opt_debug_cluster.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <sys/sysctl.h>

#include <sys/buf2.h>
#include <vm/vm_page2.h>

#include <machine/limits.h>

/*
 * Cluster tracking cache - replaces the original vnode v_* fields which had
 * limited utility and were not MP safe.
 *
 * The cluster tracking cache is a simple 4-way set-associative non-chained
 * cache.  It is capable of tracking up to four zones separated by 1MB or
 * more per vnode.
 *
 * NOTE: We want this structure to be cache-line friendly so the iterator
 *	 is embedded rather than in a separate array.
 *
 * NOTE: A cluster cache entry can become stale when a vnode is recycled.
 *	 For now we treat the values as heuristic but also self-consistent.
 *	 i.e. the values cannot be completely random and cannot be SMP unsafe
 *	 or the cluster code might end up clustering non-contiguous buffers
 *	 at the wrong offsets.
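 *
 * Example (illustrative offsets): sequential writes in the first 1MB of
 * a file share one cache entry, while a second stream at offset 3MB in
 * the same vnode hashes to the same 4-entry set but occupies a separate
 * entry, so both streams can cluster independently.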
 */
struct cluster_cache {
	struct vnode *vp;
	u_int	locked;
	off_t	v_lastw;		/* last write (end) (write cluster) */
	off_t	v_cstart;		/* start block (beg) of cluster */
	off_t	v_lasta;		/* last allocation (end) */
	u_int	v_clen;			/* length of current cluster */
	u_int	iterator;
} __cachealign;

typedef struct cluster_cache cluster_cache_t;

#define CLUSTER_CACHE_SIZE	512
#define CLUSTER_CACHE_MASK	(CLUSTER_CACHE_SIZE - 1)

#define CLUSTER_ZONE		((off_t)(1024 * 1024))

cluster_cache_t cluster_array[CLUSTER_CACHE_SIZE];

#if defined(CLUSTERDEBUG)
static int rcluster = 0;
SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "");
#endif

static MALLOC_DEFINE(M_SEGMENT, "cluster_save", "cluster_save buffer");

static struct cluster_save *
	cluster_collectbufs (cluster_cache_t *cc, struct vnode *vp,
				struct buf *last_bp, int blksize);
static struct buf *
	cluster_rbuild (struct vnode *vp, off_t filesize, off_t loffset,
			    off_t doffset, int blksize, int run,
			    struct buf *fbp, int *srp);
static void cluster_callback (struct bio *);
static void cluster_setram (struct buf *);
static void cluster_clrram (struct buf *);
static int cluster_wbuild(struct vnode *vp, struct buf **bpp, int blksize,
			    off_t start_loffset, int bytes);

static int write_behind = 1;
SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0,
    "Cluster write-behind setting");
static quad_t write_behind_minfilesize = 10 * 1024 * 1024;
SYSCTL_QUAD(_vfs, OID_AUTO, write_behind_minfilesize, CTLFLAG_RW,
    &write_behind_minfilesize, 0, "Cluster write-behind setting");
static int max_readahead = 2 * 1024 * 1024;
SYSCTL_INT(_vfs, OID_AUTO, max_readahead, CTLFLAG_RW, &max_readahead, 0,
    "Limit in bytes for desired cluster read-ahead");

extern vm_page_t	bogus_page;

/*
 * nblks is our cluster_rbuild request size.  The approximate number of
 * physical read-ahead requests is maxra / nblks.  The physical request
 * size is limited by the device (maxrbuild).  We also do not want to make
 * the request size too big or it will mess up the B_RAM streaming.
 */
static __inline
int
calc_rbuild_reqsize(int maxra, int maxrbuild)
{
	int nblks;

	if ((nblks = maxra / 4) > maxrbuild)
		nblks = maxrbuild;
	if (nblks < 1)
		nblks = maxra;
	return nblks;
}

/*
 * Acquire/release cluster cache (can return dummy entry)
 */
static
cluster_cache_t *
cluster_getcache(cluster_cache_t *dummy, struct vnode *vp, off_t loffset)
{
	cluster_cache_t *cc;
	size_t hv;
	int i;
	int xact;

	hv = (size_t)(intptr_t)vp ^ (size_t)(intptr_t)vp / sizeof(*vp);
	hv &= CLUSTER_CACHE_MASK & ~3;
	cc = &cluster_array[hv];

	xact = -1;
	for (i = 0; i < 4; ++i) {
		if (cc[i].vp != vp)
			continue;
		if (rounddown2(cc[i].v_cstart ^ loffset, CLUSTER_ZONE) == 0) {
			xact = i;
			break;
		}
	}
	if (xact >= 0 && atomic_swap_int(&cc[xact].locked, 1) == 0) {
		if (cc[xact].vp == vp &&
		    rounddown2(cc[xact].v_cstart ^ loffset,
			       CLUSTER_ZONE) == 0) {
			return(&cc[xact]);
		}
		atomic_swap_int(&cc[xact].locked, 0);
	}

	/*
	 * New entry.  If we can't acquire the cache line then use the
	 * passed-in dummy element and reset all fields.
	 *
	 * When we are able to acquire the cache line we only clear the
	 * fields if the vp does not match.
	 * This allows us to multi-zone a vp and for excessive zones /
	 * partial clusters to be retired.
	 */
	i = cc->iterator++ & 3;
	cc += i;
	if (atomic_swap_int(&cc->locked, 1) != 0) {
		cc = dummy;
		cc->locked = 1;
		cc->vp = NULL;
	}
	if (cc->vp != vp) {
		cc->vp = vp;
		cc->v_lasta = 0;
		cc->v_clen = 0;
		cc->v_cstart = 0;
		cc->v_lastw = 0;
	}
	return(cc);
}

static
void
cluster_putcache(cluster_cache_t *cc)
{
	atomic_swap_int(&cc->locked, 0);
}

/*
 * This replaces bread(), providing a synchronous read of the requested
 * buffer plus asynchronous read-ahead within the specified bounds.
 *
 * The caller may pre-populate *bpp if it already has the requested buffer
 * in-hand, else must set *bpp to NULL.  Note that the cluster_read() inline
 * sets *bpp to NULL and then calls cluster_readx() for compatibility.
 *
 * filesize	- read-ahead @ blksize will not cross this boundary
 * loffset	- loffset for returned *bpp
 * blksize	- blocksize for returned *bpp and read-ahead bps
 * minreq	- minimum (not a hard minimum) in bytes, typically reflects
 *		  a higher level uio resid.
 * maxreq	- maximum (sequential heuristic) in bytes (highest typ ~2MB)
 * bpp		- return buffer (*bpp) for (loffset,blksize)
 */
int
cluster_readx(struct vnode *vp, off_t filesize, off_t loffset, int blksize,
	      int bflags, size_t minreq, size_t maxreq,
	      struct buf **bpp)
{
	struct buf *bp, *rbp, *reqbp;
	off_t origoffset;
	off_t doffset;
	int error;
	int i;
	int maxra;
	int maxrbuild;
	int sr;
	int blkflags = (bflags & B_KVABIO) ? GETBLK_KVABIO : 0;

	sr = 0;

	/*
	 * Calculate the desired read-ahead in blksize'd blocks (maxra).
	 * To do this we calculate maxreq.
	 *
	 * maxreq typically starts out as a sequential heuristic.  If the
	 * high level uio/resid is bigger (minreq), we pop maxreq up to
	 * minreq.  This represents the case where random I/O is being
	 * performed and userland is issuing big read()'s.
	 *
	 * Then we limit maxreq to max_readahead to ensure it is a reasonable
	 * value.
	 *
	 * Finally we must ensure that (loffset + maxreq) does not cross the
	 * boundary (filesize) for the current blocksize.  If we allowed it
	 * to cross we could end up with buffers past the boundary with the
	 * wrong block size (HAMMER large-data areas use mixed block sizes).
	 * minreq is also absolutely limited to filesize.
	 */
	if (maxreq < minreq)
		maxreq = minreq;
	/* minreq not used beyond this point */

	if (maxreq > max_readahead) {
		maxreq = max_readahead;
		if (maxreq > 16 * 1024 * 1024)
			maxreq = 16 * 1024 * 1024;
	}
	if (maxreq < blksize)
		maxreq = blksize;
	if (loffset + maxreq > filesize) {
		if (loffset > filesize)
			maxreq = 0;
		else
			maxreq = filesize - loffset;
	}

	maxra = (int)(maxreq / blksize);

	/*
	 * Get the requested block.
	 */
	if (*bpp)
		reqbp = bp = *bpp;
	else
		*bpp = reqbp = bp = getblk(vp, loffset, blksize, blkflags, 0);
	origoffset = loffset;

	/*
	 * Calculate the maximum cluster size for a single I/O, used
	 * by cluster_rbuild().
	 */
	maxrbuild = vmaxiosize(vp) / blksize;

	/*
	 * If it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
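	 *
	 * (For example, a cached hit without B_RAM set returns immediately
	 * with no new read-ahead, while hitting the B_RAM marker placed
	 * roughly mid-cluster triggers the next overlapping burst.)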
	 */
	if (bp->b_flags & B_CACHE) {
		/*
		 * Not sequential, do not do any read-ahead
		 */
		if (maxra <= 1)
			return 0;

		/*
		 * No read-ahead mark, do not do any read-ahead
		 * yet.
		 */
		if ((bp->b_flags & B_RAM) == 0)
			return 0;

		/*
		 * We hit a read-ahead-mark, figure out how much read-ahead
		 * to do (maxra) and where to start (loffset).
		 *
		 * Typically the way this works is that B_RAM is set in the
		 * middle of the cluster and triggers an overlapping
		 * read-ahead of 1/2 a cluster more blocks.  This ensures
		 * that the cluster read-ahead scales with the read-ahead
		 * count and is thus better able to absorb the caller's
		 * latency.
		 *
		 * Estimate where the next unread block will be by assuming
		 * that the B_RAM's are placed at the half-way point.
		 */
		bp->b_flags &= ~B_RAM;

		i = maxra / 2;
		rbp = findblk(vp, loffset + i * blksize, FINDBLK_TEST);
		if (rbp == NULL || (rbp->b_flags & B_CACHE) == 0) {
			while (i) {
				--i;
				rbp = findblk(vp, loffset + i * blksize,
					      FINDBLK_TEST);
				if (rbp) {
					++i;
					break;
				}
			}
		} else {
			while (i < maxra) {
				rbp = findblk(vp, loffset + i * blksize,
					      FINDBLK_TEST);
				if (rbp == NULL)
					break;
				++i;
			}
		}

		/*
		 * We got everything or everything is in the cache, no
		 * point continuing.
		 */
		if (i >= maxra)
			return 0;

		/*
		 * Calculate where to start the read-ahead and how much
		 * to do.  Generally speaking we want to read-ahead by
		 * (maxra) when we've found a read-ahead mark.  We do
		 * not want to reduce maxra here as it will cause
		 * successive read-ahead I/O's to be smaller and smaller.
		 *
		 * However, we have to make sure we don't break the
		 * filesize limitation for the clustered operation.
		 */
		loffset += i * blksize;
		reqbp = bp = NULL;

		if (loffset >= filesize)
			return 0;
		if (loffset + maxra * blksize > filesize) {
			maxreq = filesize - loffset;
			maxra = (int)(maxreq / blksize);
		}

		/*
		 * Set RAM on first read-ahead block since we still have
		 * approximately maxra/2 blocks ahead of us that are already
		 * cached or in-progress.
		 */
		sr = 1;
	} else {
		/*
		 * Start block is not valid, we will want to do a
		 * full read-ahead.
		 */
		__debugvar off_t firstread = bp->b_loffset;
		int nblks;

		/*
		 * Set-up synchronous read for bp.
		 */
		bp->b_cmd = BUF_CMD_READ;
		bp->b_bio1.bio_done = biodone_sync;
		bp->b_bio1.bio_flags |= BIO_SYNC;

		KASSERT(firstread != NOOFFSET,
			("cluster_read: no buffer offset"));

		nblks = calc_rbuild_reqsize(maxra, maxrbuild);

		/*
		 * Set RAM half-way through the full-cluster.
		 */
		sr = (maxra + 1) / 2;

		if (nblks > 1) {
			int burstbytes;

			error = VOP_BMAP(vp, loffset, &doffset,
					 &burstbytes, NULL, BUF_CMD_READ);
			if (error)
				goto single_block_read;
			if (nblks > burstbytes / blksize)
				nblks = burstbytes / blksize;
			if (doffset == NOOFFSET)
				goto single_block_read;
			if (nblks <= 1)
				goto single_block_read;

			bp = cluster_rbuild(vp, filesize, loffset,
					    doffset, blksize, nblks, bp, &sr);
			loffset += bp->b_bufsize;
			maxra -= bp->b_bufsize / blksize;
		} else {
single_block_read:
			/*
			 * If it isn't in the cache, then get a chunk from
			 * disk if sequential, otherwise just get the block.
			 */
			loffset += blksize;
			--maxra;
		}
	}

	/*
	 * If B_CACHE was not set issue bp.  bp will either be an
	 * asynchronous cluster buf or a synchronous single-buf.
	 * If it is a single buf it will be the same as reqbp.
	 *
	 * NOTE: Once an async cluster buf is issued bp becomes invalid.
	 */
	if (bp) {
#if defined(CLUSTERDEBUG)
		if (rcluster)
			kprintf("S(%012jx,%d,%d)\n",
			    (intmax_t)bp->b_loffset, bp->b_bcount, maxra);
#endif
		if ((bp->b_flags & B_CLUSTER) == 0)
			vfs_busy_pages(vp, bp);
		bp->b_flags &= ~(B_ERROR | B_INVAL | B_NOTMETA);
		bp->b_flags |= bflags;
		vn_strategy(vp, &bp->b_bio1);
		/* bp invalid now */
		bp = NULL;
	}

#if defined(CLUSTERDEBUG)
	if (rcluster)
		kprintf("cluster_rd %016jx/%d maxra=%d sr=%d\n",
			(intmax_t)loffset, blksize, maxra, sr);
#endif

	/*
	 * If we have been doing sequential I/O, then do some read-ahead.
	 * The code above us should have positioned us at the next likely
	 * offset.
	 *
	 * Only mess with buffers which we can immediately lock.  HAMMER
	 * will do device-readahead irrespective of what the blocks
	 * represent.
	 *
	 * Set B_RAM on the first buffer (the next likely offset needing
	 * read-ahead), under the assumption that there are still
	 * approximately maxra/2 blocks good ahead of us.
	 */
	while (maxra > 0) {
		int burstbytes;
		int nblks;

		rbp = getblk(vp, loffset, blksize,
			     GETBLK_SZMATCH | GETBLK_NOWAIT | GETBLK_KVABIO,
			     0);
#if defined(CLUSTERDEBUG)
		if (rcluster) {
			kprintf("read-ahead %016jx rbp=%p ",
				(intmax_t)loffset, rbp);
		}
#endif
		if (rbp == NULL)
			goto no_read_ahead;
		if ((rbp->b_flags & B_CACHE)) {
			bqrelse(rbp);
			goto no_read_ahead;
		}

		/*
		 * If BMAP is not supported or has an issue, we still do
		 * (maxra) read-ahead, but we do not try to use rbuild.
		 */
		error = VOP_BMAP(vp, loffset, &doffset,
				 &burstbytes, NULL, BUF_CMD_READ);
		if (error || doffset == NOOFFSET) {
			nblks = 1;
			doffset = NOOFFSET;
		} else {
			nblks = calc_rbuild_reqsize(maxra, maxrbuild);
			if (nblks > burstbytes / blksize)
				nblks = burstbytes / blksize;
		}
		rbp->b_cmd = BUF_CMD_READ;

		if (nblks > 1) {
			rbp = cluster_rbuild(vp, filesize, loffset,
					     doffset, blksize,
					     nblks, rbp, &sr);
		} else {
			rbp->b_bio2.bio_offset = doffset;
			if (--sr == 0)
				cluster_setram(rbp);
		}

		rbp->b_flags &= ~(B_ERROR | B_INVAL | B_NOTMETA);
		rbp->b_flags |= bflags;

		if ((rbp->b_flags & B_CLUSTER) == 0)
			vfs_busy_pages(vp, rbp);
		BUF_KERNPROC(rbp);
		loffset += rbp->b_bufsize;
		maxra -= rbp->b_bufsize / blksize;
		vn_strategy(vp, &rbp->b_bio1);
		/* rbp invalid now */
	}

	/*
	 * Wait for our original buffer to complete its I/O.  reqbp will
	 * be NULL if the original buffer was B_CACHE.  We are returning
	 * (*bpp) which is the same as reqbp when reqbp != NULL.
	 */
no_read_ahead:
	if (reqbp) {
		KKASSERT(reqbp->b_bio1.bio_flags & BIO_SYNC);
		error = biowait(&reqbp->b_bio1, "clurd");
	} else {
		error = 0;
	}
	return (error);
}

/*
 * This replaces breadcb(), providing an asynchronous read of the requested
 * buffer with a callback, plus an asynchronous read-ahead within the
 * specified bounds.
 *
 * The callback must check whether BIO_DONE is set in the bio and issue
 * the bpdone(bp, 0) if it isn't.
 * The callback is responsible for clearing BIO_DONE and disposing of the
 * I/O (bqrelse()ing it).
 *
 * filesize	- read-ahead @ blksize will not cross this boundary
 * loffset	- loffset for returned *bpp
 * blksize	- blocksize for returned *bpp and read-ahead bps
 * minreq	- minimum (not a hard minimum) in bytes, typically reflects
 *		  a higher level uio resid.
 * maxreq	- maximum (sequential heuristic) in bytes (highest typ ~2MB)
 * bpp		- return buffer (*bpp) for (loffset,blksize)
 */
void
cluster_readcb(struct vnode *vp, off_t filesize, off_t loffset, int blksize,
	       int bflags, size_t minreq, size_t maxreq,
	       void (*func)(struct bio *), void *arg)
{
	struct buf *bp, *rbp, *reqbp;
	off_t origoffset;
	off_t doffset;
	int i;
	int maxra;
	int maxrbuild;
	int sr;
	int blkflags = (bflags & B_KVABIO) ? GETBLK_KVABIO : 0;

	sr = 0;

	/*
	 * Calculate the desired read-ahead in blksize'd blocks (maxra).
	 * To do this we calculate maxreq.
	 *
	 * maxreq typically starts out as a sequential heuristic.  If the
	 * high level uio/resid is bigger (minreq), we pop maxreq up to
	 * minreq.  This represents the case where random I/O is being
	 * performed and userland is issuing big read()'s.
	 *
	 * Then we limit maxreq to max_readahead to ensure it is a reasonable
	 * value.
	 *
	 * Finally we must ensure that (loffset + maxreq) does not cross the
	 * boundary (filesize) for the current blocksize.  If we allowed it
	 * to cross we could end up with buffers past the boundary with the
	 * wrong block size (HAMMER large-data areas use mixed block sizes).
	 * minreq is also absolutely limited to filesize.
	 */
	if (maxreq < minreq)
		maxreq = minreq;
	/* minreq not used beyond this point */

	if (maxreq > max_readahead) {
		maxreq = max_readahead;
		if (maxreq > 16 * 1024 * 1024)
			maxreq = 16 * 1024 * 1024;
	}
	if (maxreq < blksize)
		maxreq = blksize;
	if (loffset + maxreq > filesize) {
		if (loffset > filesize)
			maxreq = 0;
		else
			maxreq = filesize - loffset;
	}

	maxra = (int)(maxreq / blksize);

	/*
	 * Get the requested block.
	 */
	reqbp = bp = getblk(vp, loffset, blksize, blkflags, 0);
	origoffset = loffset;

	/*
	 * Calculate the maximum cluster size for a single I/O, used
	 * by cluster_rbuild().
	 */
	maxrbuild = vmaxiosize(vp) / blksize;

	/*
	 * If it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
		/*
		 * Setup for func() call whether we do read-ahead or not.
		 */
		bp->b_bio1.bio_caller_info1.ptr = arg;
		bp->b_bio1.bio_flags |= BIO_DONE;

		/*
		 * Not sequential, do not do any read-ahead
		 */
		if (maxra <= 1)
			goto no_read_ahead;

		/*
		 * No read-ahead mark, do not do any read-ahead
		 * yet.
		 */
		if ((bp->b_flags & B_RAM) == 0)
			goto no_read_ahead;
		bp->b_flags &= ~B_RAM;

		/*
		 * We hit a read-ahead-mark, figure out how much read-ahead
		 * to do (maxra) and where to start (loffset).
		 *
		 * Shortcut the scan.  Typically the way this works is that
		 * we've built up all the blocks in between except for the
		 * last in previous iterations, so if the second-to-last
		 * block is present we just skip ahead to it.
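		 *
		 * (Illustrative numbers: with maxra = 16, a steady
		 * sequential reader finds the block at index 14 already
		 * present and starts the scan at i = 15 rather than
		 * walking all 16 candidates.)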
		 *
		 * This algorithm has O(1) cpu in the steady state no
		 * matter how large maxra is.
		 */
		if (findblk(vp, loffset + (maxra - 2) * blksize, FINDBLK_TEST))
			i = maxra - 1;
		else
			i = 1;
		while (i < maxra) {
			if (findblk(vp, loffset + i * blksize,
				    FINDBLK_TEST) == NULL) {
				break;
			}
			++i;
		}

		/*
		 * We got everything or everything is in the cache, no
		 * point continuing.
		 */
		if (i >= maxra)
			goto no_read_ahead;

		/*
		 * Calculate where to start the read-ahead and how much
		 * to do.  Generally speaking we want to read-ahead by
		 * (maxra) when we've found a read-ahead mark.  We do
		 * not want to reduce maxra here as it will cause
		 * successive read-ahead I/O's to be smaller and smaller.
		 *
		 * However, we have to make sure we don't break the
		 * filesize limitation for the clustered operation.
		 */
		loffset += i * blksize;
		bp = NULL;
		/* leave reqbp intact to force function callback */

		if (loffset >= filesize)
			goto no_read_ahead;
		if (loffset + maxra * blksize > filesize) {
			maxreq = filesize - loffset;
			maxra = (int)(maxreq / blksize);
		}
		sr = 1;
	} else {
		/*
		 * bp is not valid, no prior cluster in progress so get a
		 * full cluster read-ahead going.
		 */
		__debugvar off_t firstread = bp->b_loffset;
		int nblks;
		int error;

		/*
		 * Set-up the read for bp.  It runs asynchronously with
		 * the caller's callback.
		 */
		bp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL | B_NOTMETA);
		bp->b_flags |= bflags;
		bp->b_cmd = BUF_CMD_READ;
		bp->b_bio1.bio_done = func;
		bp->b_bio1.bio_caller_info1.ptr = arg;
		BUF_KERNPROC(bp);
		reqbp = NULL;	/* don't func() reqbp, it's running async */

		KASSERT(firstread != NOOFFSET,
			("cluster_read: no buffer offset"));

		/*
		 * nblks is our cluster_rbuild request size, limited
		 * primarily by the device.
		 */
		nblks = calc_rbuild_reqsize(maxra, maxrbuild);

		/*
		 * Set RAM half-way through the full-cluster.
		 */
		sr = (maxra + 1) / 2;

		if (nblks > 1) {
			int burstbytes;

			error = VOP_BMAP(vp, loffset, &doffset,
					 &burstbytes, NULL, BUF_CMD_READ);
			if (error)
				goto single_block_read;
			if (nblks > burstbytes / blksize)
				nblks = burstbytes / blksize;
			if (doffset == NOOFFSET)
				goto single_block_read;
			if (nblks <= 1)
				goto single_block_read;

			bp = cluster_rbuild(vp, filesize, loffset,
					    doffset, blksize, nblks, bp, &sr);
			loffset += bp->b_bufsize;
			maxra -= bp->b_bufsize / blksize;
		} else {
single_block_read:
			/*
			 * If it isn't in the cache, then get a chunk from
			 * disk if sequential, otherwise just get the block.
			 */
			loffset += blksize;
			--maxra;
		}
	}

	/*
	 * If bp != NULL then B_CACHE was *NOT* set and bp must be issued.
	 * bp will either be an asynchronous cluster buf or an asynchronous
	 * single-buf.
	 *
	 * NOTE: Once an async cluster buf is issued bp becomes invalid.
	 */
	if (bp) {
#if defined(CLUSTERDEBUG)
		if (rcluster)
			kprintf("S(%012jx,%d,%d)\n",
			    (intmax_t)bp->b_loffset, bp->b_bcount, maxra);
#endif
		if ((bp->b_flags & B_CLUSTER) == 0)
			vfs_busy_pages(vp, bp);
		bp->b_flags &= ~(B_ERROR | B_INVAL | B_NOTMETA);
		bp->b_flags |= bflags;
		vn_strategy(vp, &bp->b_bio1);
		/* bp invalid now */
		bp = NULL;
	}

#if defined(CLUSTERDEBUG)
	if (rcluster)
		kprintf("cluster_rd %016jx/%d maxra=%d sr=%d\n",
			(intmax_t)loffset, blksize, maxra, sr);
#endif

	/*
	 * If we have been doing sequential I/O, then do some read-ahead.
	 * The code above us should have positioned us at the next likely
	 * offset.
	 *
	 * Only mess with buffers which we can immediately lock.  HAMMER
	 * will do device-readahead irrespective of what the blocks
	 * represent.
	 */
	while (maxra > 0) {
		int burstbytes;
		int error;
		int nblks;

		rbp = getblk(vp, loffset, blksize,
			     GETBLK_SZMATCH | GETBLK_NOWAIT | GETBLK_KVABIO,
			     0);
		if (rbp == NULL)
			goto no_read_ahead;
		if ((rbp->b_flags & B_CACHE)) {
			bqrelse(rbp);
			goto no_read_ahead;
		}

		/*
		 * If BMAP is not supported or has an issue, we still do
		 * (maxra) read-ahead, but we do not try to use rbuild.
		 */
		error = VOP_BMAP(vp, loffset, &doffset,
				 &burstbytes, NULL, BUF_CMD_READ);
		if (error || doffset == NOOFFSET) {
			nblks = 1;
			doffset = NOOFFSET;
		} else {
			nblks = calc_rbuild_reqsize(maxra, maxrbuild);
			if (nblks > burstbytes / blksize)
				nblks = burstbytes / blksize;
		}
		rbp->b_cmd = BUF_CMD_READ;

		if (nblks > 1) {
			rbp = cluster_rbuild(vp, filesize, loffset,
					     doffset, blksize,
					     nblks, rbp, &sr);
		} else {
			rbp->b_bio2.bio_offset = doffset;
			if (--sr == 0)
				cluster_setram(rbp);
		}

		rbp->b_flags &= ~(B_ERROR | B_INVAL | B_NOTMETA);
		rbp->b_flags |= bflags;

		if ((rbp->b_flags & B_CLUSTER) == 0)
			vfs_busy_pages(vp, rbp);
		BUF_KERNPROC(rbp);
		loffset += rbp->b_bufsize;
		maxra -= rbp->b_bufsize / blksize;
		vn_strategy(vp, &rbp->b_bio1);
		/* rbp invalid now */
	}

	/*
	 * If reqbp is non-NULL it had B_CACHE set and we issue the
	 * function callback synchronously.
	 *
	 * Note that we may start additional asynchronous I/O before doing
	 * the func() callback for the B_CACHE case.
	 */
no_read_ahead:
	if (reqbp)
		func(&reqbp->b_bio1);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 *
 * This function either returns a cluster buf or it returns fbp.  fbp is
 * already expected to be set up as a synchronous or asynchronous request.
 *
 * If a cluster buf is returned it will always be async.
 *
 * (*srp) counts down original blocks to determine where B_RAM should be
 * set.  Set B_RAM when *srp drops to 0.  If (*srp) starts at 0, B_RAM will
 * not be set on any buffer.  Make sure B_RAM is cleared on any other
 * buffers to prevent degenerate read-aheads from being generated.
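 *
 * (Worked example with made-up numbers: if *srp enters as 4 and the
 * cluster ends up covering 6 blocks, blocks 1-3 get B_RAM cleared,
 * block 4 gets B_RAM set as *srp reaches 0, and blocks 5-6 are cleared
 * again as *srp goes negative.)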
 */
static struct buf *
cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset, off_t doffset,
	       int blksize, int run, struct buf *fbp, int *srp)
{
	struct buf *bp, *tbp;
	off_t boffset;
	int i, j;
	int maxiosize = vmaxiosize(vp);

	/*
	 * avoid a division
	 */
	while (loffset + run * blksize > filesize) {
		--run;
	}

	tbp = fbp;
	tbp->b_bio2.bio_offset = doffset;
	if (((tbp->b_flags & B_VMIO) == 0) || (run <= 1)) {
		if (--*srp == 0)
			cluster_setram(tbp);
		else
			cluster_clrram(tbp);
		return tbp;
	}

	/*
	 * Get a pbuf, limit cluster I/O on a per-device basis.  If
	 * doing cluster I/O for a file, limit cluster I/O on a
	 * per-mount basis.
	 */
	if (vp->v_type == VCHR || vp->v_type == VBLK)
		bp = trypbuf_kva(&vp->v_pbuf_count);
	else
		bp = trypbuf_kva(&vp->v_mount->mnt_pbuf_count);

	if (bp == NULL)
		return tbp;

	/*
	 * We are synthesizing a buffer out of vm_page_t's, but
	 * if the block size is not page aligned then the starting
	 * address may not be either.  Inherit the b_data offset
	 * from the original buffer.
	 */
	bp->b_vp = vp;
	bp->b_data = (char *)((vm_offset_t)bp->b_data |
			      ((vm_offset_t)tbp->b_data & PAGE_MASK));
	bp->b_flags |= B_CLUSTER | B_VMIO | B_KVABIO;
	bp->b_cmd = BUF_CMD_READ;
	bp->b_bio1.bio_done = cluster_callback;		/* default to async */
	bp->b_bio1.bio_caller_info1.cluster_head = NULL;
	bp->b_bio1.bio_caller_info2.cluster_tail = NULL;
	bp->b_loffset = loffset;
	bp->b_bio2.bio_offset = doffset;
	KASSERT(bp->b_loffset != NOOFFSET,
		("cluster_rbuild: no buffer offset"));

	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_xio.xio_npages = 0;

	for (boffset = doffset, i = 0; i < run; ++i, boffset += blksize) {
		if (i) {
			if ((bp->b_xio.xio_npages * PAGE_SIZE) +
			    round_page(blksize) > maxiosize) {
				break;
			}

			/*
			 * Shortcut some checks and try to avoid buffers that
			 * would block in the lock.  The same checks have to
			 * be made again after we officially get the buffer.
			 */
			tbp = getblk(vp, loffset + i * blksize, blksize,
				     GETBLK_SZMATCH |
				     GETBLK_NOWAIT |
				     GETBLK_KVABIO,
				     0);
			if (tbp == NULL)
				break;
			for (j = 0; j < tbp->b_xio.xio_npages; j++) {
				if (tbp->b_xio.xio_pages[j]->valid)
					break;
			}
			if (j != tbp->b_xio.xio_npages) {
				bqrelse(tbp);
				break;
			}

			/*
			 * Stop scanning if the buffer is fully valid
			 * (marked B_CACHE), or locked (may be doing a
			 * background write), or if the buffer is not
			 * VMIO backed.  The clustering code can only deal
			 * with VMIO-backed buffers.
			 */
			if ((tbp->b_flags & (B_CACHE|B_LOCKED)) ||
			    (tbp->b_flags & B_VMIO) == 0 ||
			    (LIST_FIRST(&tbp->b_dep) != NULL &&
			     buf_checkread(tbp))
			) {
				bqrelse(tbp);
				break;
			}

			/*
			 * The buffer must be completely invalid in order to
			 * take part in the cluster.  If it is partially valid
			 * then we stop.
			 */
			for (j = 0; j < tbp->b_xio.xio_npages; j++) {
				if (tbp->b_xio.xio_pages[j]->valid)
					break;
			}
			if (j != tbp->b_xio.xio_npages) {
				bqrelse(tbp);
				break;
			}

			/*
			 * Depress the priority of buffers not explicitly
			 * requested.
			 */
			/* tbp->b_flags |= B_AGE; */

			/*
			 * Set the block number if it isn't set, otherwise
			 * if it is make sure it matches the block number we
			 * expect.
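			 *
			 * (boffset advances by blksize each iteration, so
			 * a pre-mapped tbp whose bio2 offset disagrees
			 * means the file is not physically contiguous here
			 * and the cluster must be terminated.)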
			 */
			if (tbp->b_bio2.bio_offset == NOOFFSET) {
				tbp->b_bio2.bio_offset = boffset;
			} else if (tbp->b_bio2.bio_offset != boffset) {
				brelse(tbp);
				break;
			}
		}

		/*
		 * Set B_RAM if (*srp) is 1.  B_RAM is only set on one buffer
		 * in the cluster, including potentially the first buffer
		 * once we start streaming the read-aheads.
		 */
		if (--*srp == 0)
			cluster_setram(tbp);
		else
			cluster_clrram(tbp);

		/*
		 * The passed-in tbp (i == 0) will already be set up for
		 * async or sync operation.  All other tbp's acquired in
		 * our loop are set up for async operation.
		 */
		tbp->b_cmd = BUF_CMD_READ;
		BUF_KERNPROC(tbp);
		cluster_append(&bp->b_bio1, tbp);
		for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
			vm_page_t m;

			m = tbp->b_xio.xio_pages[j];
			vm_page_busy_wait(m, FALSE, "clurpg");
			vm_page_io_start(m);
			vm_page_wakeup(m);
			vm_object_pip_add(m->object, 1);
			if ((bp->b_xio.xio_npages == 0) ||
			    (bp->b_xio.xio_pages[bp->b_xio.xio_npages-1] != m)) {
				bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
				bp->b_xio.xio_npages++;
			}
			if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) {
				tbp->b_xio.xio_pages[j] = bogus_page;
				tbp->b_flags |= B_HASBOGUS;
			}
		}
		/*
		 * XXX shouldn't this be += size for both, like in
		 * cluster_wbuild()?
		 *
		 * Don't inherit tbp->b_bufsize as it may be larger due to
		 * a non-page-aligned size.  Instead just aggregate using
		 * 'size'.
		 */
		if (tbp->b_bcount != blksize) {
			kprintf("warning: tbp->b_bcount wrong %d vs %d\n",
				tbp->b_bcount, blksize);
		}
		if (tbp->b_bufsize != blksize) {
			kprintf("warning: tbp->b_bufsize wrong %d vs %d\n",
				tbp->b_bufsize, blksize);
		}
		bp->b_bcount += blksize;
		bp->b_bufsize += blksize;
	}

	/*
	 * Fully valid pages in the cluster are already good and do not need
	 * to be re-read from disk.  Replace the page with bogus_page
	 */
	for (j = 0; j < bp->b_xio.xio_npages; j++) {
		if ((bp->b_xio.xio_pages[j]->valid & VM_PAGE_BITS_ALL) ==
		    VM_PAGE_BITS_ALL) {
			bp->b_xio.xio_pages[j] = bogus_page;
			bp->b_flags |= B_HASBOGUS;
		}
	}
	if (bp->b_bufsize > bp->b_kvasize) {
		panic("cluster_rbuild: b_bufsize(%d) > b_kvasize(%d)",
		      bp->b_bufsize, bp->b_kvasize);
	}
	pmap_qenter_noinval(trunc_page((vm_offset_t)bp->b_data),
			    (vm_page_t *)bp->b_xio.xio_pages,
			    bp->b_xio.xio_npages);
	BUF_KERNPROC(bp);
	return (bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 *
 * The returned bio is &bp->b_bio1
 */
static void
cluster_callback(struct bio *bio)
{
	struct buf *bp = bio->bio_buf;
	struct buf *tbp;
	struct buf *next;
	struct vnode *vp;
	int error = 0;
	int bpflags;

	/*
	 * Must propagate errors to all the components.  A short read (EOF)
	 * is a critical error.
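	 *
	 * (The pbuf was assembled with b_bcount equal to b_bufsize, so a
	 * mismatch on completion means the device truncated the transfer.)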
	 */
	if (bp->b_flags & B_ERROR) {
		error = bp->b_error;
	} else if (bp->b_bcount != bp->b_bufsize) {
		panic("cluster_callback: unexpected EOF on cluster %p!", bio);
	}

	pmap_qremove_noinval(trunc_page((vm_offset_t)bp->b_data),
			     bp->b_xio.xio_npages);

	/*
	 * Retrieve the cluster head and dispose of the cluster buffer.
	 * The vp is only valid while we hold one or more cluster elements,
	 * so we have to do this before disposing of them.
	 */
	tbp = bio->bio_caller_info1.cluster_head;
	bio->bio_caller_info1.cluster_head = NULL;
	bpflags = bp->b_flags;
	vp = bp->b_vp;
	bp->b_vp = NULL;

	if (vp->v_type == VCHR || vp->v_type == VBLK)
		relpbuf(bp, &vp->v_pbuf_count);
	else
		relpbuf(bp, &vp->v_mount->mnt_pbuf_count);
	bp = NULL;	/* SAFETY */

	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.  Since the memory map
	 * is the same, no actual copying is required.
	 *
	 * (And we already disposed of the larger cluster buffer)
	 */
	while (tbp) {
		next = tbp->b_cluster_next;
		if (error) {
			tbp->b_flags |= B_ERROR | B_IOISSUED;
			tbp->b_error = error;
		} else {
			tbp->b_dirtyoff = tbp->b_dirtyend = 0;
			tbp->b_flags &= ~(B_ERROR | B_INVAL);
			if (tbp->b_cmd == BUF_CMD_READ) {
				tbp->b_flags = (tbp->b_flags & ~B_NOTMETA) |
					       (bpflags & B_NOTMETA);
			}
			tbp->b_flags |= B_IOISSUED;

			/*
			 * XXX the bdwrite()/bqrelse() issued during
			 * cluster building clears B_RELBUF (see bqrelse()
			 * comment).  If direct I/O was specified, we have
			 * to restore it here to allow the buffer and VM
			 * to be freed.
			 */
			if (tbp->b_flags & B_DIRECT)
				tbp->b_flags |= B_RELBUF;

			/*
			 * XXX I think biodone() below will do this, but do
			 * it here anyway for consistency.
			 */
			if (tbp->b_cmd == BUF_CMD_WRITE)
				bundirty(tbp);
		}
		biodone(&tbp->b_bio1);
		tbp = next;
	}
}

/*
 * Implement modified write build for cluster.
 *
 *	write_behind = 0	write behind disabled
 *	write_behind = 1	write behind normal (default)
 *	write_behind = 2	write behind backed-off
 *
 * In addition, write_behind is only activated for files that have
 * grown past a certain size (default 10MB).  Otherwise temporary files
 * wind up generating a lot of unnecessary disk I/O.
 */
static __inline int
cluster_wbuild_wb(struct vnode *vp, int blksize, off_t start_loffset, int len)
{
	int r = 0;

	switch(write_behind) {
	case 2:
		if (start_loffset < len)
			break;
		start_loffset -= len;
		/* fall through */
	case 1:
		if (vp->v_filesize >= write_behind_minfilesize) {
			r = cluster_wbuild(vp, NULL, blksize,
					   start_loffset, len);
		}
		break;
	default:
		break;
	}
	return(r);
}

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1.	Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 *
 * WARNING! vnode fields are not locked and must ONLY be used heuristically.
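 *
 * (Typical sequential append: the first write at a new offset begins a
 * cluster at v_cstart, each subsequent write extends it via bdwrite(),
 * and once v_clen is reached cluster_wbuild_wb() pushes the accumulated
 * range as a single larger I/O.)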
 */
void
cluster_write(struct buf *bp, off_t filesize, int blksize, int seqcount)
{
	struct vnode *vp;
	off_t loffset;
	int maxclen, cursize;
	int async;
	cluster_cache_t dummy;
	cluster_cache_t *cc;

	vp = bp->b_vp;
	if (vp->v_type == VREG)
		async = vp->v_mount->mnt_flag & MNT_ASYNC;
	else
		async = 0;
	loffset = bp->b_loffset;
	KASSERT(bp->b_loffset != NOOFFSET,
		("cluster_write: no buffer offset"));

	cc = cluster_getcache(&dummy, vp, loffset);

	/*
	 * Initialize vnode to beginning of file.
	 */
	if (loffset == 0)
		cc->v_lasta = cc->v_clen = cc->v_cstart = cc->v_lastw = 0;

	if (cc->v_clen == 0 || loffset != cc->v_lastw ||
	    (bp->b_bio2.bio_offset != NOOFFSET &&
	     (bp->b_bio2.bio_offset != cc->v_lasta))) {
		/*
		 * Next block is not logically sequential, or, if physical
		 * block offsets are available, not physically sequential.
		 *
		 * If physical block offsets are not available we only
		 * get here if we weren't logically sequential.
		 */
		maxclen = vmaxiosize(vp);
		if (cc->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its last
			 * write, or we have reached our maximum cluster size,
			 * then push the previous cluster.  Otherwise try
			 * reallocating to make it sequential.
			 *
			 * Change to algorithm: only push previous cluster if
			 * it was sequential from the point of view of the
			 * seqcount heuristic, otherwise leave the buffer
			 * intact so we can potentially optimize the I/O
			 * later on in the buf_daemon or update daemon
			 * flush.
			 */
			cursize = cc->v_lastw - cc->v_cstart;
			if (bp->b_loffset + blksize < filesize ||
			    loffset != cc->v_lastw ||
			    cc->v_clen <= cursize) {
				if (!async && seqcount > 0) {
					cluster_wbuild_wb(vp, blksize,
						cc->v_cstart, cursize);
				}
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(cc, vp,
							      bp, blksize);
				endbp = &buflist->bs_children
					[buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster
					 * if *really* writing sequentially
					 * in the logical file (seqcount > 1),
					 * otherwise delay it in the hopes that
					 * the low level disk driver can
					 * optimize the write ordering.
					 *
					 * NOTE: We do not brelse the last
					 *	 element which is bp, and we
					 *	 do not return here.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					kfree(buflist, M_SEGMENT);
					if (seqcount > 1) {
						cluster_wbuild_wb(vp,
						    blksize, cc->v_cstart,
						    cursize);
					}
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					kfree(buflist, M_SEGMENT);
					cc->v_lastw = loffset + blksize;
					cc->v_lasta = bp->b_bio2.bio_offset +
						      blksize;
					cluster_putcache(cc);
					return;
				}
			}
		}

		/*
		 * Consider beginning a cluster.  If at end of file, make
		 * cluster as large as possible, otherwise find size of
		 * existing cluster.
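		 *
		 * (On success the VOP_BMAP() below also returns the
		 * physically contiguous run length, in bytes, through
		 * maxclen; that run becomes the new cluster length
		 * v_clen.)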
		 */
		if ((vp->v_type == VREG) &&
		    bp->b_loffset + blksize < filesize &&
		    (bp->b_bio2.bio_offset == NOOFFSET) &&
		    (VOP_BMAP(vp, loffset, &bp->b_bio2.bio_offset,
			      &maxclen, NULL, BUF_CMD_WRITE) ||
		     bp->b_bio2.bio_offset == NOOFFSET)) {
			bdwrite(bp);
			cc->v_clen = 0;
			cc->v_lasta = bp->b_bio2.bio_offset + blksize;
			cc->v_cstart = loffset;
			cc->v_lastw = loffset + blksize;
			cluster_putcache(cc);
			return;
		}
		if (maxclen > blksize)
			cc->v_clen = maxclen;
		else
			cc->v_clen = blksize;
		if (!async && cc->v_clen == 0) {   /* I/O not contiguous */
			cc->v_cstart = loffset;
			bdwrite(bp);
		} else {			/* Wait for rest of cluster */
			cc->v_cstart = loffset;
			bdwrite(bp);
		}
	} else if (loffset == cc->v_cstart + cc->v_clen) {
		/*
		 * At end of cluster, write it out if seqcount tells us we
		 * are operating sequentially, otherwise let the buf or
		 * update daemon handle it.
		 */
		bdwrite(bp);
		if (seqcount > 1)
			cluster_wbuild_wb(vp, blksize, cc->v_cstart,
					  cc->v_clen + blksize);
		cc->v_clen = 0;
		cc->v_cstart = loffset;
	} else if (vm_page_count_severe() &&
		   bp->b_loffset + blksize < filesize) {
		/*
		 * We are low on memory, get it going NOW.  However, do not
		 * try to push out a partial block at the end of the file
		 * as this could lead to extremely non-optimal write activity.
		 */
		bawrite(bp);
	} else {
		/*
		 * In the middle of a cluster, so just delay the I/O for now.
		 */
		bdwrite(bp);
	}
	cc->v_lastw = loffset + blksize;
	cc->v_lasta = bp->b_bio2.bio_offset + blksize;
	cluster_putcache(cc);
}

/*
 * This is the clustered version of bawrite().  It works similarly to
 * cluster_write() except I/O on the buffer is guaranteed to occur.
 */
int
cluster_awrite(struct buf *bp)
{
	int total;

	/*
	 * Don't bother if it isn't clusterable.
	 */
	if ((bp->b_flags & B_CLUSTEROK) == 0 ||
	    bp->b_vp == NULL ||
	    (bp->b_vp->v_flag & VOBJBUF) == 0) {
		total = bp->b_bufsize;
		bawrite(bp);
		return (total);
	}

	total = cluster_wbuild(bp->b_vp, &bp, bp->b_bufsize,
			       bp->b_loffset, vmaxiosize(bp->b_vp));

	/*
	 * If bp is still non-NULL then cluster_wbuild() did not initiate
	 * I/O on it and we must do so here to provide the API guarantee.
	 */
	if (bp)
		bawrite(bp);

	return total;
}

/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block (if last_bp == NULL).
 *
 * cluster_wbuild() normally does not guarantee anything.  If bpp is
 * non-NULL and cluster_wbuild() is able to incorporate it into the
 * I/O it will set *bpp to NULL, otherwise it will leave it alone and
 * the caller must dispose of *bpp.
 */
static int
cluster_wbuild(struct vnode *vp, struct buf **bpp,
	       int blksize, off_t start_loffset, int bytes)
{
	struct buf *bp, *tbp;
	int i, j;
	int totalwritten = 0;
	int must_initiate;
	int maxiosize = vmaxiosize(vp);

	while (bytes > 0) {
		/*
		 * If the buffer matches the passed locked & removed buffer
		 * we use the passed buffer (which might not be B_DELWRI).
		 *
		 * Otherwise locate the buffer and determine if it is
		 * compatible.
		 */
		if (bpp && (*bpp)->b_loffset == start_loffset) {
			tbp = *bpp;
			*bpp = NULL;
			bpp = NULL;
		} else {
			tbp = findblk(vp, start_loffset, FINDBLK_NBLOCK |
							 FINDBLK_KVABIO);
			if (tbp == NULL ||
			    (tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) !=
			     B_DELWRI ||
			    (LIST_FIRST(&tbp->b_dep) && buf_checkwrite(tbp))) {
				if (tbp)
					BUF_UNLOCK(tbp);
				start_loffset += blksize;
				bytes -= blksize;
				continue;
			}
			bremfree(tbp);
		}
		KKASSERT(tbp->b_cmd == BUF_CMD_DONE);

		/*
		 * Extra memory in the buffer, punt on this buffer.
		 * XXX we could handle this in most cases, but we would
		 * have to push the extra memory down to after our max
		 * possible cluster size and then potentially pull it back
		 * up if the cluster was terminated prematurely--too much
		 * hassle.
		 */
		if ((tbp->b_flags & B_CLUSTEROK) == 0 ||
		    tbp->b_bcount != tbp->b_bufsize ||
		    tbp->b_bcount != blksize ||
		    bytes == blksize) {
			totalwritten += tbp->b_bufsize;
			bawrite(tbp);
			start_loffset += blksize;
			bytes -= blksize;
			continue;
		}

		/*
		 * Get a pbuf, limit cluster I/O on a per-device basis.  If
		 * doing cluster I/O for a file, limit cluster I/O on a
		 * per-mount basis.
		 *
		 * HAMMER and other filesystems may attempt to queue a massive
		 * amount of write I/O, using trypbuf() here easily results in
		 * a situation where the I/O stream becomes non-clustered.
		 */
		if (vp->v_type == VCHR || vp->v_type == VBLK)
			bp = getpbuf_kva(&vp->v_pbuf_count);
		else
			bp = getpbuf_kva(&vp->v_mount->mnt_pbuf_count);

		/*
		 * Set up the pbuf.  Track our append point with b_bcount
		 * and b_bufsize.  b_bufsize is not used by the device but
		 * our caller uses it to loop clusters and we use it to
		 * detect a premature EOF on the block device.
		 */
		bp->b_bcount = 0;
		bp->b_bufsize = 0;
		bp->b_xio.xio_npages = 0;
		bp->b_loffset = tbp->b_loffset;
		bp->b_bio2.bio_offset = tbp->b_bio2.bio_offset;
		bp->b_vp = vp;

		/*
		 * We are synthesizing a buffer out of vm_page_t's, but
		 * if the block size is not page aligned then the starting
		 * address may not be either.  Inherit the b_data offset
		 * from the original buffer.
		 */
		bp->b_data = (char *)((vm_offset_t)bp->b_data |
				      ((vm_offset_t)tbp->b_data & PAGE_MASK));
		bp->b_flags &= ~(B_ERROR | B_NOTMETA);
		bp->b_flags |= B_CLUSTER | B_BNOCLIP | B_KVABIO |
			       (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT |
						B_NOTMETA));
		bp->b_bio1.bio_caller_info1.cluster_head = NULL;
		bp->b_bio1.bio_caller_info2.cluster_tail = NULL;

		/*
		 * From this location in the file, scan forward to see
		 * if there are buffers with adjacent data that need to
		 * be written as well.
		 *
		 * IO *must* be initiated on index 0 at this point
		 * (particularly when called from cluster_awrite()).
		 */
		for (i = 0; i < bytes; (i += blksize), (start_loffset += blksize)) {
			if (i == 0) {
				must_initiate = 1;
			} else {
				/*
				 * Not first buffer.
				 */
				must_initiate = 0;
				tbp = findblk(vp, start_loffset,
					      FINDBLK_NBLOCK | FINDBLK_KVABIO);
				/*
				 * Buffer not found or could not be locked
				 * non-blocking.
				 */
				if (tbp == NULL)
					break;

				/*
				 * If it IS in core, but has different
				 * characteristics, then don't cluster
				 * with it.
				 */
				if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
				     B_INVAL | B_DELWRI | B_NEEDCOMMIT))
				    != (B_DELWRI | B_CLUSTEROK |
				     (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
				    (tbp->b_flags & B_LOCKED)
				) {
					BUF_UNLOCK(tbp);
					break;
				}

				/*
				 * Check that the combined cluster
				 * would make sense with regard to pages
				 * and would not be too large
				 *
				 * WARNING! buf_checkwrite() must be the last
				 *	    check made.  If it returns 0 then
				 *	    we must initiate the I/O.
				 */
				if ((tbp->b_bcount != blksize) ||
				    ((bp->b_bio2.bio_offset + i) !=
				     tbp->b_bio2.bio_offset) ||
				    ((tbp->b_xio.xio_npages + bp->b_xio.xio_npages) >
				     (maxiosize / PAGE_SIZE)) ||
				    (LIST_FIRST(&tbp->b_dep) &&
				     buf_checkwrite(tbp))
				) {
					BUF_UNLOCK(tbp);
					break;
				}
				if (LIST_FIRST(&tbp->b_dep))
					must_initiate = 1;
				/*
				 * Ok, it's passed all the tests,
				 * so remove it from the free list
				 * and mark it busy.  We will use it.
				 */
				bremfree(tbp);
				KKASSERT(tbp->b_cmd == BUF_CMD_DONE);
			}

			/*
			 * If the IO is via the VM then we do some
			 * special VM hackery (yuck).  Since the buffer's
			 * block size may not be page-aligned it is possible
			 * for a page to be shared between two buffers.  We
			 * have to get rid of the duplication when building
			 * the cluster.
			 */
			if (tbp->b_flags & B_VMIO) {
				vm_page_t m;

				/*
				 * Try to avoid deadlocks with the VM system.
				 * However, we cannot abort the I/O if
				 * must_initiate is non-zero.
				 */
				if (must_initiate == 0) {
					for (j = 0;
					     j < tbp->b_xio.xio_npages;
					     ++j) {
						m = tbp->b_xio.xio_pages[j];
						if (m->busy_count &
						    PBUSY_LOCKED) {
							bqrelse(tbp);
							goto finishcluster;
						}
					}
				}

				for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
					m = tbp->b_xio.xio_pages[j];
					vm_page_busy_wait(m, FALSE, "clurpg");
					vm_page_io_start(m);
					vm_page_wakeup(m);
					vm_object_pip_add(m->object, 1);
					if ((bp->b_xio.xio_npages == 0) ||
					    (bp->b_xio.xio_pages[bp->b_xio.xio_npages - 1] != m)) {
						bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
						bp->b_xio.xio_npages++;
					}
				}
			}
			bp->b_bcount += blksize;
			bp->b_bufsize += blksize;

			/*
			 * NOTE: see bwrite/bawrite code for why we no longer
			 *	 undirty tbp here.
			 *
			 *	 bundirty(tbp); REMOVED
			 */
			tbp->b_flags &= ~B_ERROR;
			tbp->b_cmd = BUF_CMD_WRITE;
			BUF_KERNPROC(tbp);
			cluster_append(&bp->b_bio1, tbp);

			/*
			 * check for latent dependencies to be handled
			 */
			if (LIST_FIRST(&tbp->b_dep) != NULL)
				buf_start(tbp);
		}
finishcluster:
		pmap_qenter_noinval(trunc_page((vm_offset_t)bp->b_data),
				    (vm_page_t *)bp->b_xio.xio_pages,
				    bp->b_xio.xio_npages);
		if (bp->b_bufsize > bp->b_kvasize) {
			panic("cluster_wbuild: b_bufsize(%d) "
			      "> b_kvasize(%d)\n",
			      bp->b_bufsize, bp->b_kvasize);
		}
		totalwritten += bp->b_bufsize;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bufsize;
		bp->b_bio1.bio_done = cluster_callback;
		bp->b_cmd = BUF_CMD_WRITE;

		vfs_busy_pages(vp, bp);
		bsetrunningbufspace(bp, bp->b_bufsize);
		BUF_KERNPROC(bp);
		vn_strategy(vp, &bp->b_bio1);

		bytes -= i;
	}
	return totalwritten;
}

/*
 * Collect together all the buffers in a cluster, plus add one
 * additional buffer passed-in.
 *
 * Only pre-existing buffers whose block size matches blksize are collected.
 * (this is primarily because HAMMER1 uses varying block sizes and we don't
 * want to override its choices).
 *
 * This code will not try to collect buffers that it cannot lock, otherwise
 * it might deadlock against SMP-friendly filesystems.
 */
static struct cluster_save *
cluster_collectbufs(cluster_cache_t *cc, struct vnode *vp,
		    struct buf *last_bp, int blksize)
{
	struct cluster_save *buflist;
	struct buf *bp;
	off_t loffset;
	int i, len;
	int j;
	int k;

	len = (int)(cc->v_lastw - cc->v_cstart) / blksize;
	KKASSERT(len > 0);
	buflist = kmalloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
			  M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **)(buflist + 1);
	for (loffset = cc->v_cstart, i = 0, j = 0;
	     i < len;
	     (loffset += blksize), i++) {
		bp = getcacheblk(vp, loffset,
				 last_bp->b_bcount, GETBLK_SZMATCH |
						    GETBLK_NOWAIT);
		buflist->bs_children[i] = bp;
		if (bp == NULL) {
			j = i + 1;
		} else if (bp->b_bio2.bio_offset == NOOFFSET) {
			VOP_BMAP(bp->b_vp, bp->b_loffset,
				 &bp->b_bio2.bio_offset,
				 NULL, NULL, BUF_CMD_WRITE);
		}
	}

	/*
	 * Get rid of gaps
	 */
	for (k = 0; k < j; ++k) {
		if (buflist->bs_children[k]) {
			bqrelse(buflist->bs_children[k]);
			buflist->bs_children[k] = NULL;
		}
	}
	if (j != 0) {
		if (j != i) {
			bcopy(buflist->bs_children + j,
			      buflist->bs_children + 0,
			      sizeof(buflist->bs_children[0]) * (i - j));
		}
		i -= j;
	}
	buflist->bs_children[i] = bp = last_bp;
	if (bp->b_bio2.bio_offset == NOOFFSET) {
		VOP_BMAP(bp->b_vp, bp->b_loffset, &bp->b_bio2.bio_offset,
			 NULL, NULL, BUF_CMD_WRITE);
	}
	buflist->bs_nchildren = i + 1;
	return (buflist);
}

void
cluster_append(struct bio *bio, struct buf *tbp)
{
	tbp->b_cluster_next = NULL;
	if (bio->bio_caller_info1.cluster_head == NULL) {
		bio->bio_caller_info1.cluster_head = tbp;
		bio->bio_caller_info2.cluster_tail = tbp;
	} else {
		bio->bio_caller_info2.cluster_tail->b_cluster_next = tbp;
		bio->bio_caller_info2.cluster_tail = tbp;
	}
}

static
void
cluster_setram(struct buf *bp)
{
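	/*
	 * Mark both the buffer (B_RAM) and, when VMIO pages are attached,
	 * the first backing page (PG_RAM) so the read-ahead hint is also
	 * visible at the VM page level.
	 */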
	bp->b_flags |= B_RAM;
	if (bp->b_xio.xio_npages)
		vm_page_flag_set(bp->b_xio.xio_pages[0], PG_RAM);
}

static
void
cluster_clrram(struct buf *bp)
{
	bp->b_flags &= ~B_RAM;
	if (bp->b_xio.xio_npages)
		vm_page_flag_clear(bp->b_xio.xio_pages[0], PG_RAM);
}