/*
 * (MPSAFE)
 *
 * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Implement the swapcache daemon.  When enabled, swap is assumed to be
 * configured on a fast storage device such as an SSD.  Swap is assigned
 * to clean vnode-backed pages in the inactive queue, clustered by object
 * if possible, and written out.  The swap assignment sticks around even
 * after the underlying pages have been recycled.
 *
 * The daemon manages write bandwidth based on sysctl settings to control
 * wear on the SSD.
 *
 * The vnode strategy code will check for the swap assignments and divert
 * reads to the swap device when the data is present in the swapcache.
 *
 * This operates on both regular files and the block device vnodes used by
 * filesystems to manage meta-data.
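 *
 * A typical configuration might enable the cache through the
 * vm.swapcache sysctl tree defined below; for example (values are
 * illustrative only):
 *
 *	sysctl vm.swapcache.data_enable=1
 *	sysctl vm.swapcache.meta_enable=1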
 */

#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>
#include <vm/vm_page2.h>

#define INACTIVE_LIST	(&vm_page_queues[PQ_INACTIVE].pl)

/* the kernel process "swapcached" */
static int vm_swapcached_flush(vm_page_t m, int isblkdev);
static int vm_swapcache_test(vm_page_t m);
static void vm_swapcache_writing(vm_page_t marker);
static void vm_swapcache_cleaning(vm_object_t marker);
struct thread *swapcached_thread;

SYSCTL_NODE(_vm, OID_AUTO, swapcache, CTLFLAG_RW, NULL, NULL);

int vm_swapcache_read_enable;
int vm_swapcache_inactive_heuristic;
static int vm_swapcache_sleep;
static int vm_swapcache_maxlaunder = 256;
static int vm_swapcache_data_enable = 0;
static int vm_swapcache_meta_enable = 0;
static int vm_swapcache_maxswappct = 75;
static int vm_swapcache_hysteresis;
static int vm_swapcache_use_chflags = 1;	/* require chflags cache */
static int64_t vm_swapcache_minburst = 10000000LL;	/* 10MB */
static int64_t vm_swapcache_curburst = 4000000000LL;	/* 4G after boot */
static int64_t vm_swapcache_maxburst = 2000000000LL;	/* 2G nominal max */
static int64_t vm_swapcache_accrate = 100000LL;		/* 100K/s */
static int64_t vm_swapcache_write_count;
static int64_t vm_swapcache_maxfilesize;

SYSCTL_INT(_vm_swapcache, OID_AUTO, maxlaunder,
	CTLFLAG_RW, &vm_swapcache_maxlaunder, 0, "");

SYSCTL_INT(_vm_swapcache, OID_AUTO, data_enable,
	CTLFLAG_RW, &vm_swapcache_data_enable, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, meta_enable,
	CTLFLAG_RW, &vm_swapcache_meta_enable, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, read_enable,
	CTLFLAG_RW, &vm_swapcache_read_enable, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, maxswappct,
	CTLFLAG_RW, &vm_swapcache_maxswappct, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, hysteresis,
	CTLFLAG_RW, &vm_swapcache_hysteresis, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, use_chflags,
	CTLFLAG_RW, &vm_swapcache_use_chflags, 0, "");

SYSCTL_QUAD(_vm_swapcache, OID_AUTO, minburst,
	CTLFLAG_RW, &vm_swapcache_minburst, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, curburst,
	CTLFLAG_RW, &vm_swapcache_curburst, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxburst,
	CTLFLAG_RW, &vm_swapcache_maxburst, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxfilesize,
	CTLFLAG_RW, &vm_swapcache_maxfilesize, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, accrate,
	CTLFLAG_RW, &vm_swapcache_accrate, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, write_count,
	CTLFLAG_RW, &vm_swapcache_write_count, 0, "");

#define SWAPMAX(adj)	\
	((int64_t)vm_swap_max * (vm_swapcache_maxswappct + (adj)) / 100)

/*
 * vm_swapcached is the high level pageout daemon.
 *
 * No requirements.
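 *
 * The daemon alternates between two states using the SWAPMAX()
 * hysteresis defined above: it writes pages to the swapcache until
 * swap use exceeds SWAPMAX(0) (75% of swap by default), then cleans
 * out stale swap assignments until use drops below SWAPMAX(-5) (70%).
 *
 * Write bandwidth is effectively a token bucket: each ~10hz poll
 * credits vm_swapcache_accrate / 10 bytes to vm_swapcache_curburst
 * (capped at vm_swapcache_maxburst), and every flushed page debits
 * PAGE_SIZE bytes, so sustained writing converges on accrate while
 * still permitting an initial burst.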
 */
static void
vm_swapcached_thread(void)
{
	enum { SWAPC_WRITING, SWAPC_CLEANING } state = SWAPC_WRITING;
	enum { SWAPB_BURSTING, SWAPB_RECOVERING } burst = SWAPB_BURSTING;
	struct vm_page page_marker;
	struct vm_object object_marker;

	/*
	 * Thread setup
	 */
	curthread->td_flags |= TDF_SYSTHREAD;

	lwkt_gettoken(&vm_token);
	crit_enter();

	/*
	 * Initialize our marker for the inactive scan (SWAPC_WRITING)
	 */
	bzero(&page_marker, sizeof(page_marker));
	page_marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
	page_marker.queue = PQ_INACTIVE;
	page_marker.wire_count = 1;
	TAILQ_INSERT_HEAD(INACTIVE_LIST, &page_marker, pageq);
	vm_swapcache_hysteresis = vmstats.v_inactive_target / 2;
	vm_swapcache_inactive_heuristic = -vm_swapcache_hysteresis;

	/*
	 * Initialize our marker for the vm_object scan (SWAPC_CLEANING)
	 */
	bzero(&object_marker, sizeof(object_marker));
	object_marker.type = OBJT_MARKER;
	lwkt_gettoken(&vmobj_token);
	TAILQ_INSERT_HEAD(&vm_object_list, &object_marker, object_list);
	lwkt_reltoken(&vmobj_token);

	for (;;) {
		/*
		 * Check every 5 seconds when not enabled or if no swap
		 * is present.
		 */
		if ((vm_swapcache_data_enable == 0 &&
		     vm_swapcache_meta_enable == 0) ||
		    vm_swap_max == 0) {
			tsleep(&vm_swapcache_sleep, 0, "csleep", hz * 5);
			continue;
		}

		/*
		 * Polling rate when enabled is approximately 10 hz.
		 */
		tsleep(&vm_swapcache_sleep, 0, "csleep", hz / 10);

		/*
		 * State hysteresis.  Generate write activity up to 75% of
		 * swap, then clean out swap assignments down to 70%, then
		 * repeat.
		 */
		if (state == SWAPC_WRITING) {
			if (vm_swap_cache_use > SWAPMAX(0))
				state = SWAPC_CLEANING;
		} else {
			if (vm_swap_cache_use < SWAPMAX(-5))
				state = SWAPC_WRITING;
		}

		/*
		 * We are allowed to continue accumulating burst value
		 * in either state.  Allow the user to set curburst > maxburst
		 * for the initial load-in.
		 */
		if (vm_swapcache_curburst < vm_swapcache_maxburst) {
			vm_swapcache_curburst += vm_swapcache_accrate / 10;
			if (vm_swapcache_curburst > vm_swapcache_maxburst)
				vm_swapcache_curburst = vm_swapcache_maxburst;
		}

		/*
		 * We don't want to nickel-and-dime the scan as that will
		 * create unnecessary fragmentation.  The minimum burst
		 * is one second's worth of accumulation.
		 */
		if (state == SWAPC_WRITING) {
			if (vm_swapcache_curburst >= vm_swapcache_accrate) {
				if (burst == SWAPB_BURSTING) {
					vm_swapcache_writing(&page_marker);
					if (vm_swapcache_curburst <= 0)
						burst = SWAPB_RECOVERING;
				} else if (vm_swapcache_curburst >
					   vm_swapcache_minburst) {
					vm_swapcache_writing(&page_marker);
					burst = SWAPB_BURSTING;
				}
			}
		} else {
			vm_swapcache_cleaning(&object_marker);
		}
	}

	/*
	 * Cleanup (NOT REACHED)
	 */
	TAILQ_REMOVE(INACTIVE_LIST, &page_marker, pageq);
	crit_exit();
	lwkt_reltoken(&vm_token);

	lwkt_gettoken(&vmobj_token);
	TAILQ_REMOVE(&vm_object_list, &object_marker, object_list);
	lwkt_reltoken(&vmobj_token);
}

static struct kproc_desc swpc_kp = {
	"swapcached",
	vm_swapcached_thread,
	&swapcached_thread
};
SYSINIT(swapcached, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &swpc_kp)

/*
 * The caller must hold vm_token.
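 *
 * Scan the inactive queue, starting just past the marker, and initiate
 * swapcache I/O on suitable clean vnode-backed pages via
 * vm_swapcached_flush(), advancing the marker as we go.  With the
 * default use_chflags setting only VSWAPCACHE-marked vnodes (marked
 * via chflags(2)) are eligible for data caching.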
 */
static void
vm_swapcache_writing(vm_page_t marker)
{
	vm_object_t object;
	struct vnode *vp;
	vm_page_t m;
	int count;
	int isblkdev;

	/*
	 * Deal with an overflow of the heuristic counter, or with the
	 * user manually changing the hysteresis.
	 *
	 * Try to avoid small incremental pageouts by waiting for enough
	 * pages to build up in the inactive queue so we can hopefully get
	 * a good burst in.  This heuristic is bumped by the VM system and
	 * reset when our scan hits the end of the queue.
	 */
	if (vm_swapcache_inactive_heuristic < -vm_swapcache_hysteresis)
		vm_swapcache_inactive_heuristic = -vm_swapcache_hysteresis;
	if (vm_swapcache_inactive_heuristic < 0)
		return;

	/*
	 * Scan the inactive queue from our marker to locate
	 * suitable pages to push to the swap cache.
	 *
	 * We are looking for clean vnode-backed pages.
	 *
	 * NOTE: PG_SWAPPED pages in particular are not part of
	 *	 our count because once the cache stabilizes we
	 *	 can end up with a very high datarate of VM pages
	 *	 cycling from it.
	 */
	m = marker;
	count = vm_swapcache_maxlaunder;

	while ((m = TAILQ_NEXT(m, pageq)) != NULL && count--) {
		if (m->flags & (PG_MARKER | PG_SWAPPED)) {
			++count;
			continue;
		}
		if (vm_swapcache_curburst < 0)
			break;
		if (vm_swapcache_test(m))
			continue;
		object = m->object;
		vp = object->handle;
		if (vp == NULL)
			continue;

		switch(vp->v_type) {
		case VREG:
			/*
			 * If data_enable is 0 do not try to swapcache data.
			 * If use_chflags is set then only swapcache data for
			 * VSWAPCACHE marked vnodes, otherwise any vnode.
			 */
			if (vm_swapcache_data_enable == 0 ||
			    ((vp->v_flag & VSWAPCACHE) == 0 &&
			     vm_swapcache_use_chflags)) {
				continue;
			}
			if (vm_swapcache_maxfilesize &&
			    object->size >
			    (vm_swapcache_maxfilesize >> PAGE_SHIFT)) {
				continue;
			}
			isblkdev = 0;
			break;
		case VCHR:
			/*
			 * The PG_NOTMETA flag only applies to pages
			 * associated with block devices.
			 */
			if (m->flags & PG_NOTMETA)
				continue;
			if (vm_swapcache_meta_enable == 0)
				continue;
			isblkdev = 1;
			break;
		default:
			continue;
		}

		/*
		 * Ok, move the marker.  The page is soft-busied by
		 * vm_swapcached_flush() below.
		 */
		TAILQ_REMOVE(INACTIVE_LIST, marker, pageq);
		TAILQ_INSERT_AFTER(INACTIVE_LIST, m, marker, pageq);

		/*
		 * Assign swap and initiate I/O.
		 *
		 * (adjust for the --count which also occurs in the loop)
		 */
		count -= vm_swapcached_flush(m, isblkdev) - 1;

		/*
		 * Setup for next loop using marker.
		 */
		m = marker;
	}

	/*
	 * Cleanup marker position.  If we hit the end of the
	 * list the marker is placed at the tail.  Newly deactivated
	 * pages will be placed after it.
	 *
	 * Earlier inactive pages that were dirty and become clean
	 * are typically moved to the end of PQ_INACTIVE by virtue
	 * of vfs_vmio_release() when they become unwired from the
	 * buffer cache.
	 */
	TAILQ_REMOVE(INACTIVE_LIST, marker, pageq);
	if (m) {
		TAILQ_INSERT_BEFORE(m, marker, pageq);
	} else {
		TAILQ_INSERT_TAIL(INACTIVE_LIST, marker, pageq);
		vm_swapcache_inactive_heuristic = -vm_swapcache_hysteresis;
	}
}

/*
 * Flush the specified page using the swap_pager.
 *
 * Try to collect surrounding pages, including pages which may
 * have already been assigned swap.
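 * Collection scans backwards and then forwards from the base page
 * within the same SWAP_META_PAGES-aligned window, stopping at the
 * first unsuitable page in each direction (see the two loops below).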
 * Try to cluster within a contiguous aligned SWAP_META_PAGES
 * (typ 16 x PAGE_SIZE) block to match what swap_pager_putpages()
 * can do.
 *
 * We also want to try to match against the buffer cache blocksize
 * but we don't really know what it is here.  Since the buffer cache
 * wires and unwires pages in groups the fact that we skip wired pages
 * should be sufficient.
 *
 * Returns a count of pages we might have flushed (minimum 1).
 *
 * The caller must hold vm_token.
 */
static
int
vm_swapcached_flush(vm_page_t m, int isblkdev)
{
	vm_object_t object;
	vm_page_t marray[SWAP_META_PAGES];
	vm_pindex_t basei;
	int rtvals[SWAP_META_PAGES];
	int x;
	int i;
	int j;
	int count;

	vm_page_io_start(m);
	vm_page_protect(m, VM_PROT_READ);
	object = m->object;

	/*
	 * Try to cluster around (m), keeping in mind that the swap pager
	 * can only do SWAP_META_PAGES worth of contiguous write.
	 */
	x = (int)m->pindex & SWAP_META_MASK;
	marray[x] = m;
	basei = m->pindex;

	for (i = x - 1; i >= 0; --i) {
		m = vm_page_lookup(object, basei - x + i);
		if (m == NULL)
			break;
		if (vm_swapcache_test(m))
			break;
		if (isblkdev && (m->flags & PG_NOTMETA))
			break;
		vm_page_io_start(m);
		vm_page_protect(m, VM_PROT_READ);
		if (m->queue - m->pc == PQ_CACHE) {
			vm_page_unqueue_nowakeup(m);
			vm_page_deactivate(m);
		}
		marray[i] = m;
	}
	++i;

	for (j = x + 1; j < SWAP_META_PAGES; ++j) {
		m = vm_page_lookup(object, basei - x + j);
		if (m == NULL)
			break;
		if (vm_swapcache_test(m))
			break;
		if (isblkdev && (m->flags & PG_NOTMETA))
			break;
		vm_page_io_start(m);
		vm_page_protect(m, VM_PROT_READ);
		if (m->queue - m->pc == PQ_CACHE) {
			vm_page_unqueue_nowakeup(m);
			vm_page_deactivate(m);
		}
		marray[j] = m;
	}

	count = j - i;
	vm_object_pip_add(object, count);
	swap_pager_putpages(object, marray + i, count, FALSE, rtvals + i);
	vm_swapcache_write_count += count * PAGE_SIZE;
	vm_swapcache_curburst -= count * PAGE_SIZE;

	while (i < j) {
		if (rtvals[i] != VM_PAGER_PEND) {
			vm_page_io_finish(marray[i]);
			vm_object_pip_wakeup(object);
		}
		++i;
	}
	return(count);
}

/*
 * Test whether a VM page is suitable for writing to the swapcache.
 * Does not test m->queue, PG_MARKER, or PG_SWAPPED.
 *
 * Returns 0 on success, 1 on failure.
 *
 * The caller must hold vm_token.
 */
static int
vm_swapcache_test(vm_page_t m)
{
	vm_object_t object;

	if (m->flags & (PG_BUSY | PG_UNMANAGED))
		return(1);
	if (m->busy || m->hold_count || m->wire_count)
		return(1);
	if (m->valid != VM_PAGE_BITS_ALL)
		return(1);
	if (m->dirty & m->valid)
		return(1);
	if ((object = m->object) == NULL)
		return(1);
	if (object->type != OBJT_VNODE ||
	    (object->flags & OBJ_DEAD)) {
		return(1);
	}
	vm_page_test_dirty(m);
	if (m->dirty & m->valid)
		return(1);
	return(0);
}

/*
 * Cleaning pass
 *
 * The caller must hold vm_token.
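 *
 * Walk vm_object_list looking for live vnode objects that still have
 * swblocks assigned, and incrementally free their swap space via
 * swap_pager_condfree(), resuming from the object marker on each call.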
 */
static
void
vm_swapcache_cleaning(vm_object_t marker)
{
	vm_object_t object;
	struct vnode *vp;
	int count;
	int n;

	object = marker;
	count = vm_swapcache_maxlaunder;

	/*
	 * Look for vnode objects
	 */
	lwkt_gettoken(&vm_token);
	lwkt_gettoken(&vmobj_token);

	while ((object = TAILQ_NEXT(object, object_list)) != NULL && count--) {
		if (object->type != OBJT_VNODE)
			continue;
		if ((object->flags & OBJ_DEAD) || object->swblock_count == 0)
			continue;
		if ((vp = object->handle) == NULL)
			continue;
		if (vp->v_type != VREG && vp->v_type != VCHR)
			continue;

		/*
		 * Adjust iterator.
		 */
		if (marker->backing_object != object)
			marker->size = 0;

		/*
		 * Move the marker so we can work on the VM object
		 */
		TAILQ_REMOVE(&vm_object_list, marker, object_list);
		TAILQ_INSERT_AFTER(&vm_object_list, object,
				   marker, object_list);

		/*
		 * Look for swblocks starting at our iterator.
		 *
		 * The swap_pager_condfree() function attempts to free
		 * swap space starting at the specified index.  The index
		 * will be updated on return.  The function will return
		 * a scan factor (NOT the number of blocks freed).
		 *
		 * If it must cut its scan of the object short due to an
		 * excessive number of swblocks, or is able to free the
		 * requested number of blocks, it will return n >= count
		 * and we break and pick it back up on a future attempt.
		 */
		n = swap_pager_condfree(object, &marker->size, count);
		count -= n;
		if (count < 0)
			break;

		/*
		 * Setup for loop.
		 */
		marker->size = 0;
		object = marker;
	}

	/*
	 * Adjust marker so we continue the scan from where we left off.
	 * When we reach the end we start back at the beginning.
	 */
	TAILQ_REMOVE(&vm_object_list, marker, object_list);
	if (object)
		TAILQ_INSERT_BEFORE(object, marker, object_list);
	else
		TAILQ_INSERT_HEAD(&vm_object_list, marker, object_list);
	marker->backing_object = object;

	lwkt_reltoken(&vmobj_token);
	lwkt_reltoken(&vm_token);
}