/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 * Modifications/enhancements:
 * 	Copyright (c) 1995 John S. Dyson.  All rights reserved.
 * Copyright (c) 2012-2013 Matthew Dillon.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "opt_debug_cluster.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <sys/sysctl.h>

#include <sys/buf2.h>
#include <vm/vm_page2.h>

#include <machine/limits.h>

/*
 * Cluster tracking cache - replaces the original vnode v_* fields which had
 * limited utility and were not MP safe.
 *
 * The cluster tracking cache is a simple 4-way set-associative non-chained
 * cache.  It is capable of tracking up to four zones separated by 1MB or
 * more per vnode.
 *
 * NOTE: We want this structure to be cache-line friendly so the iterator
 *	 is embedded rather than in a separate array.
 *
 * NOTE: A cluster cache entry can become stale when a vnode is recycled.
 *	 For now we treat the values as heuristic but also self-consistent,
 *	 i.e. the values cannot be completely random and cannot be SMP unsafe
 *	 or the cluster code might end up clustering non-contiguous buffers
 *	 at the wrong offsets.
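 *
 * With CLUSTER_CACHE_SIZE at 512 the array below forms 128 sets of four
 * entries.  cluster_getcache() hashes the vnode pointer, masks the hash
 * with (CLUSTER_CACHE_MASK & ~3) to select a set, and then scans the four
 * entries in that set linearly.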
 */
struct cluster_cache {
	struct vnode *vp;
	u_int	locked;
	off_t	v_lastw;		/* last write (write cluster) */
	off_t	v_cstart;		/* start block of cluster */
	off_t	v_lasta;		/* last allocation */
	u_int	v_clen;			/* length of current cluster */
	u_int	iterator;
} __cachealign;

typedef struct cluster_cache cluster_cache_t;

#define CLUSTER_CACHE_SIZE	512
#define CLUSTER_CACHE_MASK	(CLUSTER_CACHE_SIZE - 1)

#define CLUSTER_ZONE		((off_t)(1024 * 1024))

cluster_cache_t cluster_array[CLUSTER_CACHE_SIZE];

#if defined(CLUSTERDEBUG)
#include <sys/sysctl.h>
static int rcluster = 0;
SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, "");
#endif

static MALLOC_DEFINE(M_SEGMENT, "cluster_save", "cluster_save buffer");

static struct cluster_save *
	cluster_collectbufs (cluster_cache_t *cc, struct vnode *vp,
				struct buf *last_bp, int blksize);
static struct buf *
	cluster_rbuild (struct vnode *vp, off_t filesize, off_t loffset,
			    off_t doffset, int blksize, int run,
			    struct buf *fbp, int *srp);
static void cluster_callback (struct bio *);
static void cluster_setram (struct buf *);
static void cluster_clrram (struct buf *);
static int cluster_wbuild(struct vnode *vp, struct buf **bpp, int blksize,
			    off_t start_loffset, int bytes);

static int write_behind = 1;
SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0,
    "Cluster write-behind setting");
static quad_t write_behind_minfilesize = 10 * 1024 * 1024;
SYSCTL_QUAD(_vfs, OID_AUTO, write_behind_minfilesize, CTLFLAG_RW,
    &write_behind_minfilesize, 0, "Cluster write-behind setting");
static int max_readahead = 2 * 1024 * 1024;
SYSCTL_INT(_vfs, OID_AUTO, max_readahead, CTLFLAG_RW, &max_readahead, 0,
    "Limit in bytes for desired cluster read-ahead");

extern vm_page_t bogus_page;

extern int cluster_pbuf_freecnt;

/*
 * nblks is our cluster_rbuild request size.  The approximate number of
 * physical read-ahead requests is maxra / nblks.  The physical request
 * size is limited by the device (maxrbuild).  We also do not want to make
 * the request size too big or it will mess up the B_RAM streaming.
 */
static __inline
int
calc_rbuild_reqsize(int maxra, int maxrbuild)
{
	int nblks;

	if ((nblks = maxra / 4) > maxrbuild)
		nblks = maxrbuild;
	if (nblks < 1)
		nblks = maxra;
	return nblks;
}

/*
 * Acquire/release cluster cache (can return dummy entry)
 */
static
cluster_cache_t *
cluster_getcache(cluster_cache_t *dummy, struct vnode *vp, off_t loffset)
{
	cluster_cache_t *cc;
	size_t hv;
	int i;
	int xact;

	hv = (size_t)(intptr_t)vp ^ (size_t)(intptr_t)vp / sizeof(*vp);
	hv &= CLUSTER_CACHE_MASK & ~3;
	cc = &cluster_array[hv];

	xact = -1;
	for (i = 0; i < 4; ++i) {
		if (cc[i].vp != vp)
			continue;
		if (((cc[i].v_cstart ^ loffset) & ~(CLUSTER_ZONE - 1)) == 0) {
			xact = i;
			break;
		}
	}
	if (xact >= 0 && atomic_swap_int(&cc[xact].locked, 1) == 0) {
		if (cc[xact].vp == vp &&
		    ((cc[xact].v_cstart ^ loffset) & ~(CLUSTER_ZONE - 1)) == 0) {
			return(&cc[xact]);
		}
		atomic_swap_int(&cc[xact].locked, 0);
	}

	/*
	 * New entry.  If we can't acquire the cache line then use the
	 * passed-in dummy element and reset all fields.
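	 * (The dummy element lives on the caller's stack, e.g. in
	 * cluster_write(), so falling back to it simply means no heuristic
	 * state is retained for this zone across calls.)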
	 *
	 * When we are able to acquire the cache line we only clear the
	 * fields if the vp does not match.  This allows us to multi-zone
	 * a vp and for excessive zones / partial clusters to be retired.
	 */
	i = cc->iterator++ & 3;
	cc += i;
	if (atomic_swap_int(&cc->locked, 1) != 0) {
		cc = dummy;
		cc->locked = 1;
		cc->vp = NULL;
	}
	if (cc->vp != vp) {
		cc->vp = vp;
		cc->v_lasta = 0;
		cc->v_clen = 0;
		cc->v_cstart = 0;
		cc->v_lastw = 0;
	}
	return(cc);
}

static
void
cluster_putcache(cluster_cache_t *cc)
{
	atomic_swap_int(&cc->locked, 0);
}

/*
 * This replaces bread(), providing a synchronous read of the requested
 * buffer plus asynchronous read-ahead within the specified bounds.
 *
 * The caller may pre-populate *bpp if it already has the requested buffer
 * in-hand, else must set *bpp to NULL.  Note that the cluster_read() inline
 * sets *bpp to NULL and then calls cluster_readx() for compatibility.
 *
 * filesize	- read-ahead @ blksize will not cross this boundary
 * loffset	- loffset for returned *bpp
 * blksize	- blocksize for returned *bpp and read-ahead bps
 * minreq	- minimum (not a hard minimum) in bytes, typically reflects
 *		  a higher level uio resid.
 * maxreq	- maximum (sequential heuristic) in bytes (highest typ ~2MB)
 * bpp		- return buffer (*bpp) for (loffset,blksize)
 */
int
cluster_readx(struct vnode *vp, off_t filesize, off_t loffset,
	      int blksize, size_t minreq, size_t maxreq, struct buf **bpp)
{
	struct buf *bp, *rbp, *reqbp;
	off_t origoffset;
	off_t doffset;
	int error;
	int i;
	int maxra;
	int maxrbuild;
	int sr;

	sr = 0;

	/*
	 * Calculate the desired read-ahead in blksize'd blocks (maxra).
	 * To do this we calculate maxreq.
	 *
	 * maxreq typically starts out as a sequential heuristic.  If the
	 * high level uio/resid is bigger (minreq), we pop maxreq up to
	 * minreq.  This represents the case where random I/O is being
	 * performed and the userland is issuing big read()'s.
	 *
	 * Then we limit maxreq to max_readahead to ensure it is a reasonable
	 * value.
	 *
	 * Finally we must ensure that (loffset + maxreq) does not cross the
	 * boundary (filesize) for the current blocksize.  If we allowed it
	 * to cross we could end up with buffers past the boundary with the
	 * wrong block size (HAMMER large-data areas use mixed block sizes).
	 * minreq is also absolutely limited to filesize.
	 */
	if (maxreq < minreq)
		maxreq = minreq;
	/* minreq not used beyond this point */

	if (maxreq > max_readahead) {
		maxreq = max_readahead;
		if (maxreq > 16 * 1024 * 1024)
			maxreq = 16 * 1024 * 1024;
	}
	if (maxreq < blksize)
		maxreq = blksize;
	if (loffset + maxreq > filesize) {
		if (loffset > filesize)
			maxreq = 0;
		else
			maxreq = filesize - loffset;
	}

	maxra = (int)(maxreq / blksize);

	/*
	 * Get the requested block.
	 */
	if (*bpp)
		reqbp = bp = *bpp;
	else
		*bpp = reqbp = bp = getblk(vp, loffset, blksize, 0, 0);
	origoffset = loffset;

	/*
	 * Calculate the maximum cluster size for a single I/O, used
	 * by cluster_rbuild().
	 */
	maxrbuild = vmaxiosize(vp) / blksize;

	/*
	 * If it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
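	 *
	 * For example, with the default 2MB max_readahead and a 16KB
	 * blksize, maxra works out to 128 blocks, and when a B_RAM mark
	 * is hit roughly half of those blocks are assumed to already be
	 * cached ahead of us.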
	 */
	if (bp->b_flags & B_CACHE) {
		/*
		 * Not sequential, do not do any read-ahead
		 */
		if (maxra <= 1)
			return 0;

		/*
		 * No read-ahead mark, do not do any read-ahead
		 * yet.
		 */
		if ((bp->b_flags & B_RAM) == 0)
			return 0;

		/*
		 * We hit a read-ahead-mark, figure out how much read-ahead
		 * to do (maxra) and where to start (loffset).
		 *
		 * Typically the way this works is that B_RAM is set in the
		 * middle of the cluster and triggers an overlapping
		 * read-ahead of 1/2 a cluster more blocks.  This ensures
		 * that the cluster read-ahead scales with the read-ahead
		 * count and is thus better able to absorb the caller's
		 * latency.
		 *
		 * Estimate where the next unread block will be by assuming
		 * that the B_RAM's are placed at the half-way point.
		 */
		bp->b_flags &= ~B_RAM;

		i = maxra / 2;
		rbp = findblk(vp, loffset + i * blksize, FINDBLK_TEST);
		if (rbp == NULL || (rbp->b_flags & B_CACHE) == 0) {
			while (i) {
				--i;
				rbp = findblk(vp, loffset + i * blksize,
					      FINDBLK_TEST);
				if (rbp) {
					++i;
					break;
				}
			}
		} else {
			while (i < maxra) {
				rbp = findblk(vp, loffset + i * blksize,
					      FINDBLK_TEST);
				if (rbp == NULL)
					break;
				++i;
			}
		}

		/*
		 * We got everything or everything is in the cache, no
		 * point continuing.
		 */
		if (i >= maxra)
			return 0;

		/*
		 * Calculate where to start the read-ahead and how much
		 * to do.  Generally speaking we want to read-ahead by
		 * (maxra) when we've found a read-ahead mark.  We do
		 * not want to reduce maxra here as it will cause
		 * successive read-ahead I/O's to be smaller and smaller.
		 *
		 * However, we have to make sure we don't break the
		 * filesize limitation for the clustered operation.
		 */
		loffset += i * blksize;
		reqbp = bp = NULL;

		if (loffset >= filesize)
			return 0;
		if (loffset + maxra * blksize > filesize) {
			maxreq = filesize - loffset;
			maxra = (int)(maxreq / blksize);
		}

		/*
		 * Set RAM on first read-ahead block since we still have
		 * approximately maxra/2 blocks ahead of us that are already
		 * cached or in-progress.
		 */
		sr = 1;
	} else {
		/*
		 * Start block is not valid, we will want to do a
		 * full read-ahead.
		 */
		__debugvar off_t firstread = bp->b_loffset;
		int nblks;

		/*
		 * Set-up synchronous read for bp.
		 */
		bp->b_cmd = BUF_CMD_READ;
		bp->b_bio1.bio_done = biodone_sync;
		bp->b_bio1.bio_flags |= BIO_SYNC;

		KASSERT(firstread != NOOFFSET,
			("cluster_read: no buffer offset"));

		nblks = calc_rbuild_reqsize(maxra, maxrbuild);

		/*
		 * Set RAM half-way through the full-cluster.
		 */
		sr = (maxra + 1) / 2;

		if (nblks > 1) {
			int burstbytes;

			error = VOP_BMAP(vp, loffset, &doffset,
					 &burstbytes, NULL, BUF_CMD_READ);
			if (error)
				goto single_block_read;
			if (nblks > burstbytes / blksize)
				nblks = burstbytes / blksize;
			if (doffset == NOOFFSET)
				goto single_block_read;
			if (nblks <= 1)
				goto single_block_read;

			bp = cluster_rbuild(vp, filesize, loffset,
					    doffset, blksize, nblks, bp, &sr);
			loffset += bp->b_bufsize;
			maxra -= bp->b_bufsize / blksize;
		} else {
single_block_read:
			/*
			 * If it isn't in the cache, then get a chunk from
			 * disk if sequential, otherwise just get the block.
			 */
			loffset += blksize;
			--maxra;
		}
	}

	/*
	 * If B_CACHE was not set issue bp.  bp will either be an
	 * asynchronous cluster buf or a synchronous single-buf.
	 * If it is a single buf it will be the same as reqbp.
	 *
	 * NOTE: Once an async cluster buf is issued bp becomes invalid.
	 */
	if (bp) {
#if defined(CLUSTERDEBUG)
		if (rcluster)
			kprintf("S(%012jx,%d,%d)\n",
			    (intmax_t)bp->b_loffset, bp->b_bcount, maxra);
#endif
		if ((bp->b_flags & B_CLUSTER) == 0)
			vfs_busy_pages(vp, bp);
		bp->b_flags &= ~(B_ERROR|B_INVAL);
		vn_strategy(vp, &bp->b_bio1);
		/* bp invalid now */
		bp = NULL;
	}

#if defined(CLUSTERDEBUG)
	if (rcluster)
		kprintf("cluster_rd %016jx/%d maxra=%d sr=%d\n",
			loffset, blksize, maxra, sr);
#endif

	/*
	 * If we have been doing sequential I/O, then do some read-ahead.
	 * The code above us should have positioned us at the next likely
	 * offset.
	 *
	 * Only mess with buffers which we can immediately lock.  HAMMER
	 * will do device-readahead irrespective of what the blocks
	 * represent.
	 *
	 * Set B_RAM on the first buffer (the next likely offset needing
	 * read-ahead), under the assumption that there are still
	 * approximately maxra/2 blocks good ahead of us.
	 */
	while (maxra > 0) {
		int burstbytes;
		int nblks;

		rbp = getblk(vp, loffset, blksize,
			     GETBLK_SZMATCH|GETBLK_NOWAIT, 0);
#if defined(CLUSTERDEBUG)
		if (rcluster) {
			kprintf("read-ahead %016jx rbp=%p ",
				loffset, rbp);
		}
#endif
		if (rbp == NULL)
			goto no_read_ahead;
		if ((rbp->b_flags & B_CACHE)) {
			bqrelse(rbp);
			goto no_read_ahead;
		}

		/*
		 * If BMAP is not supported or has an issue, we still do
		 * (maxra) read-ahead, but we do not try to use rbuild.
		 */
		error = VOP_BMAP(vp, loffset, &doffset,
				 &burstbytes, NULL, BUF_CMD_READ);
		if (error || doffset == NOOFFSET) {
			nblks = 1;
			doffset = NOOFFSET;
		} else {
			nblks = calc_rbuild_reqsize(maxra, maxrbuild);
			if (nblks > burstbytes / blksize)
				nblks = burstbytes / blksize;
		}
		rbp->b_cmd = BUF_CMD_READ;

		if (nblks > 1) {
			rbp = cluster_rbuild(vp, filesize, loffset,
					     doffset, blksize,
					     nblks, rbp, &sr);
		} else {
			rbp->b_bio2.bio_offset = doffset;
			if (--sr == 0)
				cluster_setram(rbp);
		}

		rbp->b_flags &= ~(B_ERROR|B_INVAL);

		if ((rbp->b_flags & B_CLUSTER) == 0)
			vfs_busy_pages(vp, rbp);
		BUF_KERNPROC(rbp);
		loffset += rbp->b_bufsize;
		maxra -= rbp->b_bufsize / blksize;
		vn_strategy(vp, &rbp->b_bio1);
		/* rbp invalid now */
	}

	/*
	 * Wait for our original buffer to complete its I/O.  reqbp will
	 * be NULL if the original buffer was B_CACHE.  We are returning
	 * (*bpp) which is the same as reqbp when reqbp != NULL.
	 */
no_read_ahead:
	if (reqbp) {
		KKASSERT(reqbp->b_bio1.bio_flags & BIO_SYNC);
		error = biowait(&reqbp->b_bio1, "clurd");
	} else {
		error = 0;
	}
	return (error);
}

/*
 * This replaces breadcb(), providing an asynchronous read of the requested
 * buffer with a callback, plus an asynchronous read-ahead within the
 * specified bounds.
 *
 * The callback must check whether BIO_DONE is set in the bio and issue
 * bpdone(bp, 0) if it isn't.  The callback is responsible for clearing
 * BIO_DONE and disposing of the I/O (bqrelse()ing it).
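 *
 * A minimal callback sketch following that contract (hypothetical
 * filesystem code, not part of this file) might look like:
 *
 *	static void
 *	myfs_read_callback(struct bio *bio)
 *	{
 *		struct buf *bp = bio->bio_buf;
 *
 *		if ((bio->bio_flags & BIO_DONE) == 0)
 *			bpdone(bp, 0);
 *		bio->bio_flags &= ~BIO_DONE;
 *		bqrelse(bp);
 *	}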
 *
 * filesize	- read-ahead @ blksize will not cross this boundary
 * loffset	- loffset for returned *bpp
 * blksize	- blocksize for returned *bpp and read-ahead bps
 * minreq	- minimum (not a hard minimum) in bytes, typically reflects
 *		  a higher level uio resid.
 * maxreq	- maximum (sequential heuristic) in bytes (highest typ ~2MB)
 * bpp		- return buffer (*bpp) for (loffset,blksize)
 */
void
cluster_readcb(struct vnode *vp, off_t filesize, off_t loffset,
	       int blksize, size_t minreq, size_t maxreq,
	       void (*func)(struct bio *), void *arg)
{
	struct buf *bp, *rbp, *reqbp;
	off_t origoffset;
	off_t doffset;
	int i;
	int maxra;
	int maxrbuild;
	int sr;

	sr = 0;

	/*
	 * Calculate the desired read-ahead in blksize'd blocks (maxra).
	 * To do this we calculate maxreq.
	 *
	 * maxreq typically starts out as a sequential heuristic.  If the
	 * high level uio/resid is bigger (minreq), we pop maxreq up to
	 * minreq.  This represents the case where random I/O is being
	 * performed and the userland is issuing big read()'s.
	 *
	 * Then we limit maxreq to max_readahead to ensure it is a reasonable
	 * value.
	 *
	 * Finally we must ensure that (loffset + maxreq) does not cross the
	 * boundary (filesize) for the current blocksize.  If we allowed it
	 * to cross we could end up with buffers past the boundary with the
	 * wrong block size (HAMMER large-data areas use mixed block sizes).
	 * minreq is also absolutely limited to filesize.
	 */
	if (maxreq < minreq)
		maxreq = minreq;
	/* minreq not used beyond this point */

	if (maxreq > max_readahead) {
		maxreq = max_readahead;
		if (maxreq > 16 * 1024 * 1024)
			maxreq = 16 * 1024 * 1024;
	}
	if (maxreq < blksize)
		maxreq = blksize;
	if (loffset + maxreq > filesize) {
		if (loffset > filesize)
			maxreq = 0;
		else
			maxreq = filesize - loffset;
	}

	maxra = (int)(maxreq / blksize);

	/*
	 * Get the requested block.
	 */
	reqbp = bp = getblk(vp, loffset, blksize, 0, 0);
	origoffset = loffset;

	/*
	 * Calculate the maximum cluster size for a single I/O, used
	 * by cluster_rbuild().
	 */
	maxrbuild = vmaxiosize(vp) / blksize;

	/*
	 * If it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
		/*
		 * Setup for func() call whether we do read-ahead or not.
		 */
		bp->b_bio1.bio_caller_info1.ptr = arg;
		bp->b_bio1.bio_flags |= BIO_DONE;

		/*
		 * Not sequential, do not do any read-ahead
		 */
		if (maxra <= 1)
			goto no_read_ahead;

		/*
		 * No read-ahead mark, do not do any read-ahead
		 * yet.
		 */
		if ((bp->b_flags & B_RAM) == 0)
			goto no_read_ahead;
		bp->b_flags &= ~B_RAM;

		/*
		 * We hit a read-ahead-mark, figure out how much read-ahead
		 * to do (maxra) and where to start (loffset).
		 *
		 * Shortcut the scan.  Typically the way this works is that
		 * we've built up all the blocks in between except for the
		 * last in previous iterations, so if the second-to-last
		 * block is present we just skip ahead to it.
		 *
		 * This algorithm has O(1) cpu in the steady state no
		 * matter how large maxra is.
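		 *
		 * For example, with maxra = 32, a single findblk() on the
		 * block at index (maxra - 2) lets the scan start at
		 * i = maxra - 1 instead of testing each of the intermediate
		 * blocks individually when the steady state holds.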
		 */
		if (findblk(vp, loffset + (maxra - 2) * blksize, FINDBLK_TEST))
			i = maxra - 1;
		else
			i = 1;
		while (i < maxra) {
			if (findblk(vp, loffset + i * blksize,
				    FINDBLK_TEST) == NULL) {
				break;
			}
			++i;
		}

		/*
		 * We got everything or everything is in the cache, no
		 * point continuing.
		 */
		if (i >= maxra)
			goto no_read_ahead;

		/*
		 * Calculate where to start the read-ahead and how much
		 * to do.  Generally speaking we want to read-ahead by
		 * (maxra) when we've found a read-ahead mark.  We do
		 * not want to reduce maxra here as it will cause
		 * successive read-ahead I/O's to be smaller and smaller.
		 *
		 * However, we have to make sure we don't break the
		 * filesize limitation for the clustered operation.
		 */
		loffset += i * blksize;
		bp = NULL;
		/* leave reqbp intact to force function callback */

		if (loffset >= filesize)
			goto no_read_ahead;
		if (loffset + maxra * blksize > filesize) {
			maxreq = filesize - loffset;
			maxra = (int)(maxreq / blksize);
		}
		sr = 1;
	} else {
		/*
		 * bp is not valid, no prior cluster in progress so get a
		 * full cluster read-ahead going.
		 */
		__debugvar off_t firstread = bp->b_loffset;
		int nblks;
		int error;

		/*
		 * Set-up asynchronous read for bp, completed via func().
		 */
		bp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL);
		bp->b_cmd = BUF_CMD_READ;
		bp->b_bio1.bio_done = func;
		bp->b_bio1.bio_caller_info1.ptr = arg;
		BUF_KERNPROC(bp);
		reqbp = NULL;	/* don't func() reqbp, it's running async */

		KASSERT(firstread != NOOFFSET,
			("cluster_read: no buffer offset"));

		/*
		 * nblks is our cluster_rbuild request size, limited
		 * primarily by the device.
		 */
		nblks = calc_rbuild_reqsize(maxra, maxrbuild);

		/*
		 * Set RAM half-way through the full-cluster.
		 */
		sr = (maxra + 1) / 2;

		if (nblks > 1) {
			int burstbytes;

			error = VOP_BMAP(vp, loffset, &doffset,
					 &burstbytes, NULL, BUF_CMD_READ);
			if (error)
				goto single_block_read;
			if (nblks > burstbytes / blksize)
				nblks = burstbytes / blksize;
			if (doffset == NOOFFSET)
				goto single_block_read;
			if (nblks <= 1)
				goto single_block_read;

			bp = cluster_rbuild(vp, filesize, loffset,
					    doffset, blksize, nblks, bp, &sr);
			loffset += bp->b_bufsize;
			maxra -= bp->b_bufsize / blksize;
		} else {
single_block_read:
			/*
			 * If it isn't in the cache, then get a chunk from
			 * disk if sequential, otherwise just get the block.
			 */
			loffset += blksize;
			--maxra;
		}
	}

	/*
	 * If bp != NULL then B_CACHE was *NOT* set and bp must be issued.
	 * bp will either be an asynchronous cluster buf or an asynchronous
	 * single-buf.
	 *
	 * NOTE: Once an async cluster buf is issued bp becomes invalid.
	 */
	if (bp) {
#if defined(CLUSTERDEBUG)
		if (rcluster)
			kprintf("S(%012jx,%d,%d)\n",
			    (intmax_t)bp->b_loffset, bp->b_bcount, maxra);
#endif
		if ((bp->b_flags & B_CLUSTER) == 0)
			vfs_busy_pages(vp, bp);
		bp->b_flags &= ~(B_ERROR|B_INVAL);
		vn_strategy(vp, &bp->b_bio1);
		/* bp invalid now */
		bp = NULL;
	}

#if defined(CLUSTERDEBUG)
	if (rcluster)
		kprintf("cluster_rd %016jx/%d maxra=%d sr=%d\n",
			loffset, blksize, maxra, sr);
#endif

	/*
	 * If we have been doing sequential I/O, then do some read-ahead.
	 * The code above us should have positioned us at the next likely
	 * offset.
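	 *
	 * Each pass through the loop below issues one asynchronous buf,
	 * either a cluster buf built by cluster_rbuild() or a single
	 * block, until maxra is consumed or a buffer cannot be locked
	 * without blocking.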
	 *
	 * Only mess with buffers which we can immediately lock.  HAMMER
	 * will do device-readahead irrespective of what the blocks
	 * represent.
	 */
	while (maxra > 0) {
		int burstbytes;
		int error;
		int nblks;

		rbp = getblk(vp, loffset, blksize,
			     GETBLK_SZMATCH|GETBLK_NOWAIT, 0);
		if (rbp == NULL)
			goto no_read_ahead;
		if ((rbp->b_flags & B_CACHE)) {
			bqrelse(rbp);
			goto no_read_ahead;
		}

		/*
		 * If BMAP is not supported or has an issue, we still do
		 * (maxra) read-ahead, but we do not try to use rbuild.
		 */
		error = VOP_BMAP(vp, loffset, &doffset,
				 &burstbytes, NULL, BUF_CMD_READ);
		if (error || doffset == NOOFFSET) {
			nblks = 1;
			doffset = NOOFFSET;
		} else {
			nblks = calc_rbuild_reqsize(maxra, maxrbuild);
			if (nblks > burstbytes / blksize)
				nblks = burstbytes / blksize;
		}
		rbp->b_cmd = BUF_CMD_READ;

		if (nblks > 1) {
			rbp = cluster_rbuild(vp, filesize, loffset,
					     doffset, blksize,
					     nblks, rbp, &sr);
		} else {
			rbp->b_bio2.bio_offset = doffset;
			if (--sr == 0)
				cluster_setram(rbp);
		}

		rbp->b_flags &= ~(B_ERROR|B_INVAL);

		if ((rbp->b_flags & B_CLUSTER) == 0)
			vfs_busy_pages(vp, rbp);
		BUF_KERNPROC(rbp);
		loffset += rbp->b_bufsize;
		maxra -= rbp->b_bufsize / blksize;
		vn_strategy(vp, &rbp->b_bio1);
		/* rbp invalid now */
	}

	/*
	 * If reqbp is non-NULL it had B_CACHE set and we issue the
	 * function callback synchronously.
	 *
	 * Note that we may start additional asynchronous I/O before doing
	 * the func() callback for the B_CACHE case.
	 */
no_read_ahead:
	if (reqbp)
		func(&reqbp->b_bio1);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 *
 * This function either returns a cluster buf or it returns fbp.  fbp is
 * already expected to be set up as a synchronous or asynchronous request.
 *
 * If a cluster buf is returned it will always be async.
 *
 * (*srp) counts down original blocks to determine where B_RAM should be set.
 * Set B_RAM when *srp drops to 0.  If (*srp) starts at 0, B_RAM will not be
 * set on any buffer.  Make sure B_RAM is cleared on any other buffers to
 * prevent degenerate read-aheads from being generated.
 */
static struct buf *
cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset, off_t doffset,
	       int blksize, int run, struct buf *fbp, int *srp)
{
	struct buf *bp, *tbp;
	off_t boffset;
	int i, j;
	int maxiosize = vmaxiosize(vp);

	/*
	 * avoid a division
	 */
	while (loffset + run * blksize > filesize) {
		--run;
	}

	tbp = fbp;
	tbp->b_bio2.bio_offset = doffset;
	if ((tbp->b_flags & B_MALLOC) ||
	    ((tbp->b_flags & B_VMIO) == 0) || (run <= 1)) {
		if (--*srp == 0)
			cluster_setram(tbp);
		else
			cluster_clrram(tbp);
		return tbp;
	}

	bp = trypbuf_kva(&cluster_pbuf_freecnt);
	if (bp == NULL) {
		return tbp;
	}

	/*
	 * We are synthesizing a buffer out of vm_page_t's, but
	 * if the block size is not page aligned then the starting
	 * address may not be either.  Inherit the b_data offset
	 * from the original buffer.
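	 *
	 * e.g. for a 6KB block that begins 2KB into a page, the low
	 * PAGE_MASK bits of tbp->b_data are or'd into bp->b_data so the
	 * KVA mapping established later by pmap_qenter() lines up with
	 * the original buffer's page offset.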
	 */
	bp->b_data = (char *)((vm_offset_t)bp->b_data |
			      ((vm_offset_t)tbp->b_data & PAGE_MASK));
	bp->b_flags |= B_CLUSTER | B_VMIO;
	bp->b_cmd = BUF_CMD_READ;
	bp->b_bio1.bio_done = cluster_callback;		/* default to async */
	bp->b_bio1.bio_caller_info1.cluster_head = NULL;
	bp->b_bio1.bio_caller_info2.cluster_tail = NULL;
	bp->b_loffset = loffset;
	bp->b_bio2.bio_offset = doffset;
	KASSERT(bp->b_loffset != NOOFFSET,
		("cluster_rbuild: no buffer offset"));

	bp->b_bcount = 0;
	bp->b_bufsize = 0;
	bp->b_xio.xio_npages = 0;

	for (boffset = doffset, i = 0; i < run; ++i, boffset += blksize) {
		if (i) {
			if ((bp->b_xio.xio_npages * PAGE_SIZE) +
			    round_page(blksize) > maxiosize) {
				break;
			}

			/*
			 * Shortcut some checks and try to avoid buffers that
			 * would block in the lock.  The same checks have to
			 * be made again after we officially get the buffer.
			 */
			tbp = getblk(vp, loffset + i * blksize, blksize,
				     GETBLK_SZMATCH|GETBLK_NOWAIT, 0);
			if (tbp == NULL)
				break;
			for (j = 0; j < tbp->b_xio.xio_npages; j++) {
				if (tbp->b_xio.xio_pages[j]->valid)
					break;
			}
			if (j != tbp->b_xio.xio_npages) {
				bqrelse(tbp);
				break;
			}

			/*
			 * Stop scanning if the buffer is fully valid
			 * (marked B_CACHE), or locked (may be doing a
			 * background write), or if the buffer is not
			 * VMIO backed.  The clustering code can only deal
			 * with VMIO-backed buffers.
			 */
			if ((tbp->b_flags & (B_CACHE|B_LOCKED)) ||
			    (tbp->b_flags & B_VMIO) == 0 ||
			    (LIST_FIRST(&tbp->b_dep) != NULL &&
			     buf_checkread(tbp))
			) {
				bqrelse(tbp);
				break;
			}

			/*
			 * The buffer must be completely invalid in order to
			 * take part in the cluster.  If it is partially valid
			 * then we stop.
			 */
			for (j = 0; j < tbp->b_xio.xio_npages; j++) {
				if (tbp->b_xio.xio_pages[j]->valid)
					break;
			}
			if (j != tbp->b_xio.xio_npages) {
				bqrelse(tbp);
				break;
			}

			/*
			 * Depress the priority of buffers not explicitly
			 * requested.
			 */
			/* tbp->b_flags |= B_AGE; */

			/*
			 * Set the block number if it isn't set, otherwise
			 * if it is set make sure it matches the block number
			 * we expect.
			 */
			if (tbp->b_bio2.bio_offset == NOOFFSET) {
				tbp->b_bio2.bio_offset = boffset;
			} else if (tbp->b_bio2.bio_offset != boffset) {
				brelse(tbp);
				break;
			}
		}

		/*
		 * Set B_RAM if (*srp) is 1.  B_RAM is only set on one buffer
		 * in the cluster, including potentially the first buffer
		 * once we start streaming the read-aheads.
		 */
		if (--*srp == 0)
			cluster_setram(tbp);
		else
			cluster_clrram(tbp);

		/*
		 * The passed-in tbp (i == 0) will already be set up for
		 * async or sync operation.  All other tbp's acquired in
		 * our loop are set up for async operation.
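		 *
		 * BUF_KERNPROC() disassociates the buffer lock from the
		 * current thread so each component buffer can be completed
		 * (via cluster_callback()) from a different context.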
		 */
		tbp->b_cmd = BUF_CMD_READ;
		BUF_KERNPROC(tbp);
		cluster_append(&bp->b_bio1, tbp);
		for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
			vm_page_t m;

			m = tbp->b_xio.xio_pages[j];
			vm_page_busy_wait(m, FALSE, "clurpg");
			vm_page_io_start(m);
			vm_page_wakeup(m);
			vm_object_pip_add(m->object, 1);
			if ((bp->b_xio.xio_npages == 0) ||
			    (bp->b_xio.xio_pages[bp->b_xio.xio_npages-1] != m)) {
				bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
				bp->b_xio.xio_npages++;
			}
			if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) {
				tbp->b_xio.xio_pages[j] = bogus_page;
				tbp->b_flags |= B_HASBOGUS;
			}
		}
		/*
		 * XXX shouldn't this be += size for both, like in
		 * cluster_wbuild()?
		 *
		 * Don't inherit tbp->b_bufsize as it may be larger due to
		 * a non-page-aligned size.  Instead just aggregate using
		 * 'size'.
		 */
		if (tbp->b_bcount != blksize)
			kprintf("warning: tbp->b_bcount wrong %d vs %d\n", tbp->b_bcount, blksize);
		if (tbp->b_bufsize != blksize)
			kprintf("warning: tbp->b_bufsize wrong %d vs %d\n", tbp->b_bufsize, blksize);
		bp->b_bcount += blksize;
		bp->b_bufsize += blksize;
	}

	/*
	 * Fully valid pages in the cluster are already good and do not need
	 * to be re-read from disk.  Replace the page with bogus_page.
	 */
	for (j = 0; j < bp->b_xio.xio_npages; j++) {
		if ((bp->b_xio.xio_pages[j]->valid & VM_PAGE_BITS_ALL) ==
		    VM_PAGE_BITS_ALL) {
			bp->b_xio.xio_pages[j] = bogus_page;
			bp->b_flags |= B_HASBOGUS;
		}
	}
	if (bp->b_bufsize > bp->b_kvasize) {
		panic("cluster_rbuild: b_bufsize(%d) > b_kvasize(%d)",
		      bp->b_bufsize, bp->b_kvasize);
	}
	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
		    (vm_page_t *)bp->b_xio.xio_pages, bp->b_xio.xio_npages);
	BUF_KERNPROC(bp);
	return (bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 *
 * The returned bio is &bp->b_bio1
 */
static void
cluster_callback(struct bio *bio)
{
	struct buf *bp = bio->bio_buf;
	struct buf *tbp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.  A short read (EOF)
	 * is a critical error.
	 */
	if (bp->b_flags & B_ERROR) {
		error = bp->b_error;
	} else if (bp->b_bcount != bp->b_bufsize) {
		panic("cluster_callback: unexpected EOF on cluster %p!", bio);
	}

	pmap_qremove(trunc_page((vm_offset_t) bp->b_data),
		     bp->b_xio.xio_npages);
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.  Since the memory map
	 * is the same, no actual copying is required.
	 */
	while ((tbp = bio->bio_caller_info1.cluster_head) != NULL) {
		bio->bio_caller_info1.cluster_head = tbp->b_cluster_next;
		if (error) {
			tbp->b_flags |= B_ERROR | B_IOISSUED;
			tbp->b_error = error;
		} else {
			tbp->b_dirtyoff = tbp->b_dirtyend = 0;
			tbp->b_flags &= ~(B_ERROR|B_INVAL);
			tbp->b_flags |= B_IOISSUED;
			/*
			 * XXX the bdwrite()/bqrelse() issued during
			 * cluster building clears B_RELBUF (see bqrelse()
			 * comment).  If direct I/O was specified, we have
			 * to restore it here to allow the buffer and VM
			 * to be freed.
			 */
			if (tbp->b_flags & B_DIRECT)
				tbp->b_flags |= B_RELBUF;

			/*
			 * XXX I think biodone() below will do this, but do
			 * it here anyway for consistency.
			 */
			if (tbp->b_cmd == BUF_CMD_WRITE)
				bundirty(tbp);
		}
		biodone(&tbp->b_bio1);
	}
	relpbuf(bp, &cluster_pbuf_freecnt);
}

/*
 * Implement modified write build for cluster.
 *
 *	write_behind = 0	write behind disabled
 *	write_behind = 1	write behind normal (default)
 *	write_behind = 2	write behind backed-off
 *
 * In addition, write_behind is only activated for files that have
 * grown past a certain size (default 10MB).  Otherwise temporary files
 * wind up generating a lot of unnecessary disk I/O.
 */
static __inline int
cluster_wbuild_wb(struct vnode *vp, int blksize, off_t start_loffset, int len)
{
	int r = 0;

	switch(write_behind) {
	case 2:
		if (start_loffset < len)
			break;
		start_loffset -= len;
		/* fall through */
	case 1:
		if (vp->v_filesize >= write_behind_minfilesize) {
			r = cluster_wbuild(vp, NULL, blksize,
					   start_loffset, len);
		}
		/* fall through */
	default:
		/* fall through */
		break;
	}
	return(r);
}

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1.	Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 *
 * WARNING! vnode fields are not locked and must ONLY be used heuristically.
 */
void
cluster_write(struct buf *bp, off_t filesize, int blksize, int seqcount)
{
	struct vnode *vp;
	off_t loffset;
	int maxclen, cursize;
	int async;
	cluster_cache_t dummy;
	cluster_cache_t *cc;

	vp = bp->b_vp;
	if (vp->v_type == VREG)
		async = vp->v_mount->mnt_flag & MNT_ASYNC;
	else
		async = 0;
	loffset = bp->b_loffset;
	KASSERT(bp->b_loffset != NOOFFSET,
		("cluster_write: no buffer offset"));

	cc = cluster_getcache(&dummy, vp, loffset);

	/*
	 * Initialize vnode to beginning of file.
	 */
	if (loffset == 0)
		cc->v_lasta = cc->v_clen = cc->v_cstart = cc->v_lastw = 0;

	if (cc->v_clen == 0 || loffset != cc->v_lastw + blksize ||
	    (bp->b_bio2.bio_offset != NOOFFSET &&
	     (bp->b_bio2.bio_offset != cc->v_lasta + blksize))) {
		/*
		 * Next block is not logically sequential, or, if physical
		 * block offsets are available, not physically sequential.
		 *
		 * If physical block offsets are not available we only
		 * get here if we weren't logically sequential.
		 */
		maxclen = vmaxiosize(vp);
		if (cc->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its last
			 * write, or we have reached our maximum cluster size,
			 * then push the previous cluster.  Otherwise try
			 * reallocating to make it sequential.
			 *
			 * Change to algorithm: only push previous cluster if
			 * it was sequential from the point of view of the
			 * seqcount heuristic, otherwise leave the buffer
			 * intact so we can potentially optimize the I/O
			 * later on in the buf_daemon or update daemon
			 * flush.
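			 *
			 * (cursize below measures what has accumulated so
			 *  far: e.g. with 16KB blocks, v_cstart at offset 0
			 *  and v_lastw at 112KB, cursize works out to 128KB,
			 *  i.e. eight blocks including the one at v_lastw.)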
			 */
			cursize = cc->v_lastw - cc->v_cstart + blksize;
			if (bp->b_loffset + blksize < filesize ||
			    loffset != cc->v_lastw + blksize ||
			    cc->v_clen <= cursize) {
				if (!async && seqcount > 0) {
					cluster_wbuild_wb(vp, blksize,
						cc->v_cstart, cursize);
				}
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(cc, vp,
							      bp, blksize);
				endbp = &buflist->bs_children
					[buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster
					 * if *really* writing sequentially
					 * in the logical file (seqcount > 1),
					 * otherwise delay it in the hopes that
					 * the low level disk driver can
					 * optimize the write ordering.
					 *
					 * NOTE: We do not brelse the last
					 *	 element which is bp, and we
					 *	 do not return here.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					kfree(buflist, M_SEGMENT);
					if (seqcount > 1) {
						cluster_wbuild_wb(vp,
						    blksize, cc->v_cstart,
						    cursize);
					}
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					kfree(buflist, M_SEGMENT);
					cc->v_lastw = loffset;
					cc->v_lasta = bp->b_bio2.bio_offset;
					cluster_putcache(cc);
					return;
				}
			}
		}

		/*
		 * Consider beginning a cluster.  If at end of file, make
		 * cluster as large as possible, otherwise find size of
		 * existing cluster.
		 */
		if ((vp->v_type == VREG) &&
		    bp->b_loffset + blksize < filesize &&
		    (bp->b_bio2.bio_offset == NOOFFSET) &&
		    (VOP_BMAP(vp, loffset, &bp->b_bio2.bio_offset,
			      &maxclen, NULL, BUF_CMD_WRITE) ||
		     bp->b_bio2.bio_offset == NOOFFSET)) {
			bdwrite(bp);
			cc->v_clen = 0;
			cc->v_lasta = bp->b_bio2.bio_offset;
			cc->v_cstart = loffset + blksize;
			cc->v_lastw = loffset;
			cluster_putcache(cc);
			return;
		}
		if (maxclen > blksize)
			cc->v_clen = maxclen - blksize;
		else
			cc->v_clen = 0;
		if (!async && cc->v_clen == 0) {	/* I/O not contiguous */
			cc->v_cstart = loffset + blksize;
			bdwrite(bp);
		} else {			/* Wait for rest of cluster */
			cc->v_cstart = loffset;
			bdwrite(bp);
		}
	} else if (loffset == cc->v_cstart + cc->v_clen) {
		/*
		 * At end of cluster, write it out if seqcount tells us we
		 * are operating sequentially, otherwise let the buf or
		 * update daemon handle it.
		 */
		bdwrite(bp);
		if (seqcount > 1)
			cluster_wbuild_wb(vp, blksize, cc->v_cstart,
					  cc->v_clen + blksize);
		cc->v_clen = 0;
		cc->v_cstart = loffset + blksize;
	} else if (vm_page_count_severe() &&
		   bp->b_loffset + blksize < filesize) {
		/*
		 * We are low on memory, get it going NOW.  However, do not
		 * try to push out a partial block at the end of the file
		 * as this could lead to extremely non-optimal write activity.
		 */
		bawrite(bp);
	} else {
		/*
		 * In the middle of a cluster, so just delay the I/O for now.
		 */
		bdwrite(bp);
	}
	cc->v_lastw = loffset;
	cc->v_lasta = bp->b_bio2.bio_offset;
	cluster_putcache(cc);
}

/*
 * This is the clustered version of bawrite().  It works similarly to
 * cluster_write() except I/O on the buffer is guaranteed to occur.
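 *
 * The return value is the number of bytes for which write I/O was
 * initiated, including the passed buffer itself.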
 */
int
cluster_awrite(struct buf *bp)
{
	int total;

	/*
	 * Don't bother if it isn't clusterable.
	 */
	if ((bp->b_flags & B_CLUSTEROK) == 0 ||
	    bp->b_vp == NULL ||
	    (bp->b_vp->v_flag & VOBJBUF) == 0) {
		total = bp->b_bufsize;
		bawrite(bp);
		return (total);
	}

	total = cluster_wbuild(bp->b_vp, &bp, bp->b_bufsize,
			       bp->b_loffset, vmaxiosize(bp->b_vp));

	/*
	 * If bp is still non-NULL then cluster_wbuild() did not initiate
	 * I/O on it and we must do so here to provide the API guarantee.
	 */
	if (bp)
		bawrite(bp);

	return total;
}

/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block (if last_bp == NULL).
 *
 * cluster_wbuild() normally does not guarantee anything.  If bpp is
 * non-NULL and cluster_wbuild() is able to incorporate it into the
 * I/O it will set *bpp to NULL, otherwise it will leave it alone and
 * the caller must dispose of *bpp.
 */
static int
cluster_wbuild(struct vnode *vp, struct buf **bpp,
	       int blksize, off_t start_loffset, int bytes)
{
	struct buf *bp, *tbp;
	int i, j;
	int totalwritten = 0;
	int must_initiate;
	int maxiosize = vmaxiosize(vp);

	while (bytes > 0) {
		/*
		 * If the buffer matches the passed locked & removed buffer
		 * we use the passed buffer (which might not be B_DELWRI).
		 *
		 * Otherwise locate the buffer and determine if it is
		 * compatible.
		 */
		if (bpp && (*bpp)->b_loffset == start_loffset) {
			tbp = *bpp;
			*bpp = NULL;
			bpp = NULL;
		} else {
			tbp = findblk(vp, start_loffset, FINDBLK_NBLOCK);
			if (tbp == NULL ||
			    (tbp->b_flags & (B_LOCKED | B_INVAL | B_DELWRI)) !=
			     B_DELWRI ||
			    (LIST_FIRST(&tbp->b_dep) && buf_checkwrite(tbp))) {
				if (tbp)
					BUF_UNLOCK(tbp);
				start_loffset += blksize;
				bytes -= blksize;
				continue;
			}
			bremfree(tbp);
		}
		KKASSERT(tbp->b_cmd == BUF_CMD_DONE);

		/*
		 * Extra memory in the buffer, punt on this buffer.
		 * XXX we could handle this in most cases, but we would
		 * have to push the extra memory down to after our max
		 * possible cluster size and then potentially pull it back
		 * up if the cluster was terminated prematurely--too much
		 * hassle.
		 */
		if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) ||
		    (tbp->b_bcount != tbp->b_bufsize) ||
		    (tbp->b_bcount != blksize) ||
		    (bytes == blksize) ||
		    ((bp = getpbuf_kva(&cluster_pbuf_freecnt)) == NULL)) {
			totalwritten += tbp->b_bufsize;
			bawrite(tbp);
			start_loffset += blksize;
			bytes -= blksize;
			continue;
		}

		/*
		 * Set up the pbuf.  Track our append point with b_bcount
		 * and b_bufsize.  b_bufsize is not used by the device but
		 * our caller uses it to loop clusters and we use it to
		 * detect a premature EOF on the block device.
		 */
		bp->b_bcount = 0;
		bp->b_bufsize = 0;
		bp->b_xio.xio_npages = 0;
		bp->b_loffset = tbp->b_loffset;
		bp->b_bio2.bio_offset = tbp->b_bio2.bio_offset;

		/*
		 * We are synthesizing a buffer out of vm_page_t's, but
		 * if the block size is not page aligned then the starting
		 * address may not be either.  Inherit the b_data offset
		 * from the original buffer.
		 */
		bp->b_data = (char *)((vm_offset_t)bp->b_data |
				      ((vm_offset_t)tbp->b_data & PAGE_MASK));
		bp->b_flags &= ~B_ERROR;
		bp->b_flags |= B_CLUSTER | B_BNOCLIP |
			       (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
		bp->b_bio1.bio_caller_info1.cluster_head = NULL;
		bp->b_bio1.bio_caller_info2.cluster_tail = NULL;

		/*
		 * From this location in the file, scan forward to see
		 * if there are buffers with adjacent data that need to
		 * be written as well.
		 *
		 * IO *must* be initiated on index 0 at this point
		 * (particularly when called from cluster_awrite()).
		 */
		for (i = 0; i < bytes; (i += blksize), (start_loffset += blksize)) {
			if (i == 0) {
				must_initiate = 1;
			} else {
				/*
				 * Not first buffer.
				 */
				must_initiate = 0;
				tbp = findblk(vp, start_loffset,
					      FINDBLK_NBLOCK);
				/*
				 * Buffer not found or could not be locked
				 * non-blocking.
				 */
				if (tbp == NULL)
					break;

				/*
				 * If it IS in core, but has different
				 * characteristics, then don't cluster
				 * with it.
				 */
				if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
				     B_INVAL | B_DELWRI | B_NEEDCOMMIT))
				    != (B_DELWRI | B_CLUSTEROK |
				     (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
				    (tbp->b_flags & B_LOCKED)
				) {
					BUF_UNLOCK(tbp);
					break;
				}

				/*
				 * Check that the combined cluster
				 * would make sense with regard to pages
				 * and would not be too large.
				 *
				 * WARNING! buf_checkwrite() must be the last
				 *	    check made.  If it returns 0 then
				 *	    we must initiate the I/O.
				 */
				if ((tbp->b_bcount != blksize) ||
				  ((bp->b_bio2.bio_offset + i) !=
				    tbp->b_bio2.bio_offset) ||
				  ((tbp->b_xio.xio_npages + bp->b_xio.xio_npages) >
				    (maxiosize / PAGE_SIZE)) ||
				  (LIST_FIRST(&tbp->b_dep) &&
				   buf_checkwrite(tbp))
				) {
					BUF_UNLOCK(tbp);
					break;
				}
				if (LIST_FIRST(&tbp->b_dep))
					must_initiate = 1;
				/*
				 * Ok, it's passed all the tests,
				 * so remove it from the free list
				 * and mark it busy.  We will use it.
				 */
				bremfree(tbp);
				KKASSERT(tbp->b_cmd == BUF_CMD_DONE);
			}

			/*
			 * If the IO is via the VM then we do some
			 * special VM hackery (yuck).  Since the buffer's
			 * block size may not be page-aligned it is possible
			 * for a page to be shared between two buffers.  We
			 * have to get rid of the duplication when building
			 * the cluster.
			 */
			if (tbp->b_flags & B_VMIO) {
				vm_page_t m;

				/*
				 * Try to avoid deadlocks with the VM system.
				 * However, we cannot abort the I/O if
				 * must_initiate is non-zero.
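				 *
				 * (The PG_BUSY pre-check below is only
				 *  advisory; a page can still become busy
				 *  after we look, in which case the later
				 *  vm_page_busy_wait() will simply block.)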
				 */
				if (must_initiate == 0) {
					for (j = 0;
					     j < tbp->b_xio.xio_npages;
					     ++j) {
						m = tbp->b_xio.xio_pages[j];
						if (m->flags & PG_BUSY) {
							bqrelse(tbp);
							goto finishcluster;
						}
					}
				}

				for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
					m = tbp->b_xio.xio_pages[j];
					vm_page_busy_wait(m, FALSE, "clurpg");
					vm_page_io_start(m);
					vm_page_wakeup(m);
					vm_object_pip_add(m->object, 1);
					if ((bp->b_xio.xio_npages == 0) ||
					    (bp->b_xio.xio_pages[bp->b_xio.xio_npages - 1] != m)) {
						bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
						bp->b_xio.xio_npages++;
					}
				}
			}
			bp->b_bcount += blksize;
			bp->b_bufsize += blksize;

			/*
			 * NOTE: see bwrite/bawrite code for why we no longer
			 *	 undirty tbp here.
			 *
			 *	 bundirty(tbp);	REMOVED
			 */
			tbp->b_flags &= ~B_ERROR;
			tbp->b_cmd = BUF_CMD_WRITE;
			BUF_KERNPROC(tbp);
			cluster_append(&bp->b_bio1, tbp);

			/*
			 * check for latent dependencies to be handled
			 */
			if (LIST_FIRST(&tbp->b_dep) != NULL)
				buf_start(tbp);
		}
	finishcluster:
		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
			    (vm_page_t *)bp->b_xio.xio_pages,
			    bp->b_xio.xio_npages);
		if (bp->b_bufsize > bp->b_kvasize) {
			panic("cluster_wbuild: b_bufsize(%d) "
			      "> b_kvasize(%d)\n",
			      bp->b_bufsize, bp->b_kvasize);
		}
		totalwritten += bp->b_bufsize;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bufsize;
		bp->b_bio1.bio_done = cluster_callback;
		bp->b_cmd = BUF_CMD_WRITE;

		vfs_busy_pages(vp, bp);
		bsetrunningbufspace(bp, bp->b_bufsize);
		BUF_KERNPROC(bp);
		vn_strategy(vp, &bp->b_bio1);

		bytes -= i;
	}
	return totalwritten;
}

/*
 * Collect together all the buffers in a cluster, plus add one
 * additional buffer passed-in.
 *
 * Only pre-existing buffers whose block size matches blksize are collected.
 * (this is primarily because HAMMER1 uses varying block sizes and we don't
 * want to override its choices).
 *
 * This code will not try to collect buffers that it cannot lock, otherwise
 * it might deadlock against SMP-friendly filesystems.
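 *
 * Buffers which cannot be obtained leave a gap; everything collected
 * before the gap is released again (see the gap removal loop below), so
 * the returned list is always a contiguous run ending at last_bp.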
 */
static struct cluster_save *
cluster_collectbufs(cluster_cache_t *cc, struct vnode *vp,
		    struct buf *last_bp, int blksize)
{
	struct cluster_save *buflist;
	struct buf *bp;
	off_t loffset;
	int i, len;
	int j;
	int k;

	len = (int)(cc->v_lastw - cc->v_cstart + blksize) / blksize;
	KKASSERT(len > 0);
	buflist = kmalloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
			  M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **) (buflist + 1);
	for (loffset = cc->v_cstart, i = 0, j = 0;
	     i < len;
	     (loffset += blksize), i++) {
		bp = getcacheblk(vp, loffset,
				 last_bp->b_bcount, GETBLK_SZMATCH |
						    GETBLK_NOWAIT);
		buflist->bs_children[i] = bp;
		if (bp == NULL) {
			j = i + 1;
		} else if (bp->b_bio2.bio_offset == NOOFFSET) {
			VOP_BMAP(bp->b_vp, bp->b_loffset,
				 &bp->b_bio2.bio_offset,
				 NULL, NULL, BUF_CMD_WRITE);
		}
	}

	/*
	 * Get rid of gaps
	 */
	for (k = 0; k < j; ++k) {
		if (buflist->bs_children[k]) {
			bqrelse(buflist->bs_children[k]);
			buflist->bs_children[k] = NULL;
		}
	}
	if (j != 0) {
		if (j != i) {
			bcopy(buflist->bs_children + j,
			      buflist->bs_children + 0,
			      sizeof(buflist->bs_children[0]) * (i - j));
		}
		i -= j;
	}
	buflist->bs_children[i] = bp = last_bp;
	if (bp->b_bio2.bio_offset == NOOFFSET) {
		VOP_BMAP(bp->b_vp, bp->b_loffset, &bp->b_bio2.bio_offset,
			 NULL, NULL, BUF_CMD_WRITE);
	}
	buflist->bs_nchildren = i + 1;
	return (buflist);
}

void
cluster_append(struct bio *bio, struct buf *tbp)
{
	tbp->b_cluster_next = NULL;
	if (bio->bio_caller_info1.cluster_head == NULL) {
		bio->bio_caller_info1.cluster_head = tbp;
		bio->bio_caller_info2.cluster_tail = tbp;
	} else {
		bio->bio_caller_info2.cluster_tail->b_cluster_next = tbp;
		bio->bio_caller_info2.cluster_tail = tbp;
	}
}

static
void
cluster_setram(struct buf *bp)
{
	bp->b_flags |= B_RAM;
	if (bp->b_xio.xio_npages)
		vm_page_flag_set(bp->b_xio.xio_pages[0], PG_RAM);
}

static
void
cluster_clrram(struct buf *bp)
{
	bp->b_flags &= ~B_RAM;
	if (bp->b_xio.xio_npages)
		vm_page_flag_clear(bp->b_xio.xio_pages[0], PG_RAM);
}