/*
 * (MPSAFE)
 *
 * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Implement the swapcache daemon.  When enabled swap is assumed to be
 * configured on a fast storage device such as an SSD.  Swap is assigned
 * to clean vnode-backed pages in the inactive queue, clustered by object
 * if possible, and written out.  The swap assignment sticks around even
 * after the underlying pages have been recycled.
 *
 * The daemon manages write bandwidth based on sysctl settings to control
 * wear on the SSD.
 *
 * The vnode strategy code will check for the swap assignments and divert
 * reads to the swap device when the data is present in the swapcache.
 *
 * This operates on both regular files and the block device vnodes used by
 * filesystems to manage meta-data.
 */
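/*
 * Illustrative enablement sequence (an added sketch mirroring the
 * swapcache(8) manual page, not part of the original source): an
 * administrator typically turns the daemon on via the sysctls defined
 * below and marks the data to be cached, e.g.:
 *
 *	sysctl vm.swapcache.meta_enable=1
 *	sysctl vm.swapcache.data_enable=1
 *	sysctl vm.swapcache.read_enable=1
 *	chflags cache /usr/somedir	# needed while use_chflags is set
 */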
#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>
#include <sys/eventhandler.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <vm/vm_page2.h>

static int vm_swapcached_flush(vm_page_t m, int isblkdev);
static int vm_swapcache_test(vm_page_t m);
static int vm_swapcache_writing_heuristic(void);
static int vm_swapcache_writing(vm_page_t marker, int count, int scount);
static void vm_swapcache_cleaning(vm_object_t marker);
static void vm_swapcache_movemarker(vm_object_t marker, vm_object_t object);

/* the kernel thread for the "swapcached" process */
struct thread *swapcached_thread;

SYSCTL_NODE(_vm, OID_AUTO, swapcache, CTLFLAG_RW, NULL, NULL);

int vm_swapcache_read_enable;
int vm_swapcache_inactive_heuristic;
static int vm_swapcache_sleep;
static int vm_swapcache_maxscan = PQ_L2_SIZE * 8;
static int vm_swapcache_maxlaunder = PQ_L2_SIZE * 4;
static int vm_swapcache_data_enable = 0;
static int vm_swapcache_meta_enable = 0;
static int vm_swapcache_maxswappct = 75;
static int vm_swapcache_hysteresis;
static int vm_swapcache_min_hysteresis;
int vm_swapcache_use_chflags = 1;	/* require chflags cache */
static int64_t vm_swapcache_minburst = 10000000LL;	/* 10MB */
static int64_t vm_swapcache_curburst = 4000000000LL;	/* 4G after boot */
static int64_t vm_swapcache_maxburst = 2000000000LL;	/* 2G nominal max */
static int64_t vm_swapcache_accrate = 100000LL;		/* 100K/s */
static int64_t vm_swapcache_write_count;
static int64_t vm_swapcache_maxfilesize;
static int64_t vm_swapcache_cleanperobj = 16*1024*1024;

SYSCTL_INT(_vm_swapcache, OID_AUTO, maxlaunder,
	CTLFLAG_RW, &vm_swapcache_maxlaunder, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, maxscan,
	CTLFLAG_RW, &vm_swapcache_maxscan, 0, "");

SYSCTL_INT(_vm_swapcache, OID_AUTO, data_enable,
	CTLFLAG_RW, &vm_swapcache_data_enable, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, meta_enable,
	CTLFLAG_RW, &vm_swapcache_meta_enable, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, read_enable,
	CTLFLAG_RW, &vm_swapcache_read_enable, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, maxswappct,
	CTLFLAG_RW, &vm_swapcache_maxswappct, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, hysteresis,
	CTLFLAG_RD, &vm_swapcache_hysteresis, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, min_hysteresis,
	CTLFLAG_RW, &vm_swapcache_min_hysteresis, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, use_chflags,
	CTLFLAG_RW, &vm_swapcache_use_chflags, 0, "");

SYSCTL_QUAD(_vm_swapcache, OID_AUTO, minburst,
	CTLFLAG_RW, &vm_swapcache_minburst, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, curburst,
	CTLFLAG_RW, &vm_swapcache_curburst, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxburst,
	CTLFLAG_RW, &vm_swapcache_maxburst, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxfilesize,
	CTLFLAG_RW, &vm_swapcache_maxfilesize, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, accrate,
	CTLFLAG_RW, &vm_swapcache_accrate, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, write_count,
	CTLFLAG_RW, &vm_swapcache_write_count, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, cleanperobj,
	CTLFLAG_RW, &vm_swapcache_cleanperobj, 0, "");

#define SWAPMAX(adj)	\
	((int64_t)vm_swap_max * (vm_swapcache_maxswappct + (adj)) / 100)
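/*
 * Worked example of SWAPMAX() (illustrative numbers, not from the
 * original source): with vm_swap_max = 1000000 swap blocks and the
 * default maxswappct of 75, SWAPMAX(0) is 750000 and SWAPMAX(-10) is
 * 650000.  The state machine below writes until swapcache use exceeds
 * 75% of swap and then cleans until use drops below 65%.
 */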

/*
 * When shutting down the machine we want to stop swapcache operation
 * immediately so swap is not accessed after devices have been shuttered.
 */
static void
shutdown_swapcache(void *arg __unused)
{
	vm_swapcache_read_enable = 0;
	vm_swapcache_data_enable = 0;
	vm_swapcache_meta_enable = 0;
	wakeup(&vm_swapcache_sleep);	/* shortcut 5-second wait */
}

/*
 * vm_swapcached is the high level pageout daemon.
 *
 * No requirements.
 */
static void
vm_swapcached_thread(void)
{
	enum { SWAPC_WRITING, SWAPC_CLEANING } state = SWAPC_WRITING;
	enum { SWAPB_BURSTING, SWAPB_RECOVERING } burst = SWAPB_BURSTING;
	static struct vm_page page_marker[PQ_L2_SIZE];
	static struct vm_object object_marker;
	int q;

	/*
	 * Thread setup
	 */
	curthread->td_flags |= TDF_SYSTHREAD;
	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc,
			      swapcached_thread, SHUTDOWN_PRI_FIRST);
	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_swapcache,
			      NULL, SHUTDOWN_PRI_SECOND);

	/*
	 * Initialize our marker for the inactive scan (SWAPC_WRITING)
	 */
	bzero(&page_marker, sizeof(page_marker));
	for (q = 0; q < PQ_L2_SIZE; ++q) {
		page_marker[q].flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
		page_marker[q].queue = PQ_INACTIVE + q;
		page_marker[q].pc = q;
		page_marker[q].wire_count = 1;
		vm_page_queues_spin_lock(PQ_INACTIVE + q);
		TAILQ_INSERT_HEAD(
			&vm_page_queues[PQ_INACTIVE + q].pl,
			&page_marker[q], pageq);
		vm_page_queues_spin_unlock(PQ_INACTIVE + q);
	}

	vm_swapcache_min_hysteresis = 1024;
	vm_swapcache_hysteresis = vm_swapcache_min_hysteresis;
	vm_swapcache_inactive_heuristic = -vm_swapcache_hysteresis;

	/*
	 * Initialize our marker for the vm_object scan (SWAPC_CLEANING)
	 */
	bzero(&object_marker, sizeof(object_marker));
	object_marker.type = OBJT_MARKER;
	lwkt_gettoken(&vmobj_token);
	TAILQ_INSERT_HEAD(&vm_object_list, &object_marker, object_list);
	lwkt_reltoken(&vmobj_token);

	for (;;) {
		int reached_end;
		int scount;
		int count;

		/*
		 * Handle shutdown
		 */
		kproc_suspend_loop();

		/*
		 * Check every 5 seconds when not enabled or if no swap
		 * is present.
		 */
		if ((vm_swapcache_data_enable == 0 &&
		     vm_swapcache_meta_enable == 0) ||
		    vm_swap_max == 0) {
			tsleep(&vm_swapcache_sleep, 0, "csleep", hz * 5);
			continue;
		}

		/*
		 * Polling rate when enabled is approximately 10 hz.
		 */
		tsleep(&vm_swapcache_sleep, 0, "csleep", hz / 10);

		/*
		 * State hysteresis.  Generate write activity up to 75% of
		 * swap, then clean out swap assignments down to 65%, then
		 * repeat.
		 */
		if (state == SWAPC_WRITING) {
			if (vm_swap_cache_use > SWAPMAX(0))
				state = SWAPC_CLEANING;
		} else {
			if (vm_swap_cache_use < SWAPMAX(-10))
				state = SWAPC_WRITING;
		}

		/*
		 * We are allowed to continue accumulating burst value
		 * in either state.  Allow the user to set curburst > maxburst
		 * for the initial load-in.
		 */
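		/*
		 * Illustrative arithmetic (an added note, not from the
		 * original source): at the default accrate of 100000
		 * bytes/sec, each ~10hz poll below adds accrate / 10 =
		 * 10000 bytes to curburst, while each page flushed later
		 * deducts PAGE_SIZE bytes, so once the initial curburst
		 * allowance is consumed the daemon averages roughly
		 * 100KB/sec of write bandwidth.
		 */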
		if (vm_swapcache_curburst < vm_swapcache_maxburst) {
			vm_swapcache_curburst += vm_swapcache_accrate / 10;
			if (vm_swapcache_curburst > vm_swapcache_maxburst)
				vm_swapcache_curburst = vm_swapcache_maxburst;
		}

		/*
		 * We don't want to nickel-and-dime the scan as that will
		 * create unnecessary fragmentation.  The minimum burst
		 * is one second's worth of accumulation.
		 */
		if (state != SWAPC_WRITING) {
			vm_swapcache_cleaning(&object_marker);
			continue;
		}
		if (vm_swapcache_curburst < vm_swapcache_accrate)
			continue;

		reached_end = 0;
		count = vm_swapcache_maxlaunder / PQ_L2_SIZE + 2;
		scount = vm_swapcache_maxscan / PQ_L2_SIZE + 2;

		if (burst == SWAPB_BURSTING) {
			if (vm_swapcache_writing_heuristic()) {
				for (q = 0; q < PQ_L2_SIZE; ++q) {
					reached_end +=
						vm_swapcache_writing(
							&page_marker[q],
							count,
							scount);
				}
			}
			if (vm_swapcache_curburst <= 0)
				burst = SWAPB_RECOVERING;
		} else if (vm_swapcache_curburst > vm_swapcache_minburst) {
			if (vm_swapcache_writing_heuristic()) {
				for (q = 0; q < PQ_L2_SIZE; ++q) {
					reached_end +=
						vm_swapcache_writing(
							&page_marker[q],
							count,
							scount);
				}
			}
			burst = SWAPB_BURSTING;
		}
		if (reached_end == PQ_L2_SIZE) {
			vm_swapcache_inactive_heuristic =
				-vm_swapcache_hysteresis;
		}
	}

	/*
	 * Cleanup (NOT REACHED)
	 */
	for (q = 0; q < PQ_L2_SIZE; ++q) {
		vm_page_queues_spin_lock(PQ_INACTIVE + q);
		TAILQ_REMOVE(
			&vm_page_queues[PQ_INACTIVE + q].pl,
			&page_marker[q], pageq);
		vm_page_queues_spin_unlock(PQ_INACTIVE + q);
	}

	lwkt_gettoken(&vmobj_token);
	TAILQ_REMOVE(&vm_object_list, &object_marker, object_list);
	lwkt_reltoken(&vmobj_token);
}

static struct kproc_desc swpc_kp = {
	"swapcached",
	vm_swapcached_thread,
	&swapcached_thread
};
SYSINIT(swapcached, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &swpc_kp);

/*
 * Deal with an overflow of the heuristic counter or if the user
 * manually changes the hysteresis.
 *
 * Try to avoid small incremental pageouts by waiting for enough
 * pages to build up in the inactive queue to hopefully get a good
 * burst in.  This heuristic is bumped by the VM system and reset
 * when our scan hits the end of the queue.
 *
 * Return TRUE if we need to take a writing pass.
 */
static int
vm_swapcache_writing_heuristic(void)
{
	int hyst;

	hyst = vmstats.v_inactive_count / 4;
	if (hyst < vm_swapcache_min_hysteresis)
		hyst = vm_swapcache_min_hysteresis;
	cpu_ccfence();
	vm_swapcache_hysteresis = hyst;

	if (vm_swapcache_inactive_heuristic < -hyst)
		vm_swapcache_inactive_heuristic = -hyst;

	return (vm_swapcache_inactive_heuristic >= 0);
}
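/*
 * Illustrative example for the heuristic above (hypothetical numbers):
 * with vmstats.v_inactive_count at 40000 pages, hyst becomes 10000.
 * After a full scan resets vm_swapcache_inactive_heuristic to -10000,
 * roughly 10000 more pages must enter the inactive queue (the VM system
 * bumps the counter) before the heuristic goes non-negative and another
 * writing pass is taken.
 */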
/*
 * Take a writing pass on one of the inactive queues, return non-zero if
 * we hit the end of the queue.
 */
static int
vm_swapcache_writing(vm_page_t marker, int count, int scount)
{
	vm_object_t object;
	struct vnode *vp;
	vm_page_t m;
	int isblkdev;

	/*
	 * Scan the inactive queue from our marker to locate
	 * suitable pages to push to the swap cache.
	 *
	 * We are looking for clean vnode-backed pages.
	 */
	vm_page_queues_spin_lock(marker->queue);
	while ((m = TAILQ_NEXT(marker, pageq)) != NULL &&
	       count > 0 && scount-- > 0) {
		KKASSERT(m->queue == marker->queue);

		if (vm_swapcache_curburst < 0)
			break;
		TAILQ_REMOVE(
			&vm_page_queues[marker->queue].pl, marker, pageq);
		TAILQ_INSERT_AFTER(
			&vm_page_queues[marker->queue].pl, m, marker, pageq);

		/*
		 * Ignore markers and ignore pages that already have a swap
		 * assignment.
		 */
		if (m->flags & (PG_MARKER | PG_SWAPPED))
			continue;
		if (vm_page_busy_try(m, TRUE))
			continue;
		vm_page_queues_spin_unlock(marker->queue);

		if ((object = m->object) == NULL) {
			vm_page_wakeup(m);
			vm_page_queues_spin_lock(marker->queue);
			continue;
		}
		vm_object_hold(object);
		if (m->object != object) {
			vm_object_drop(object);
			vm_page_wakeup(m);
			vm_page_queues_spin_lock(marker->queue);
			continue;
		}
		if (vm_swapcache_test(m)) {
			vm_object_drop(object);
			vm_page_wakeup(m);
			vm_page_queues_spin_lock(marker->queue);
			continue;
		}

		vp = object->handle;
		if (vp == NULL) {
			vm_object_drop(object);
			vm_page_wakeup(m);
			vm_page_queues_spin_lock(marker->queue);
			continue;
		}

		switch(vp->v_type) {
		case VREG:
			/*
			 * PG_NOTMETA generically means 'don't swapcache this',
			 * and HAMMER will set this for regular data buffers
			 * (and leave it unset for meta-data buffers) as
			 * appropriate when double buffering is enabled.
			 */
			if (m->flags & PG_NOTMETA) {
				vm_object_drop(object);
				vm_page_wakeup(m);
				vm_page_queues_spin_lock(marker->queue);
				continue;
			}

			/*
			 * If data_enable is 0 do not try to swapcache data.
			 * If use_chflags is set then only swapcache data for
			 * VSWAPCACHE marked vnodes, otherwise any vnode.
			 */
			if (vm_swapcache_data_enable == 0 ||
			    ((vp->v_flag & VSWAPCACHE) == 0 &&
			     vm_swapcache_use_chflags)) {
				vm_object_drop(object);
				vm_page_wakeup(m);
				vm_page_queues_spin_lock(marker->queue);
				continue;
			}
			if (vm_swapcache_maxfilesize &&
			    object->size >
			    (vm_swapcache_maxfilesize >> PAGE_SHIFT)) {
				vm_object_drop(object);
				vm_page_wakeup(m);
				vm_page_queues_spin_lock(marker->queue);
				continue;
			}
			isblkdev = 0;
			break;
		case VCHR:
			/*
			 * PG_NOTMETA generically means 'don't swapcache this',
			 * and HAMMER will set this for regular data buffers
			 * (and leave it unset for meta-data buffers) as
			 * appropriate when double buffering is enabled.
			 */
			if (m->flags & PG_NOTMETA) {
				vm_object_drop(object);
				vm_page_wakeup(m);
				vm_page_queues_spin_lock(marker->queue);
				continue;
			}
			if (vm_swapcache_meta_enable == 0) {
				vm_object_drop(object);
				vm_page_wakeup(m);
				vm_page_queues_spin_lock(marker->queue);
				continue;
			}
			isblkdev = 1;
			break;
		default:
			vm_object_drop(object);
			vm_page_wakeup(m);
			vm_page_queues_spin_lock(marker->queue);
			continue;
		}

		/*
		 * Assign swap and initiate I/O.
		 *
		 * (adjust for the --count which also occurs in the loop)
		 */
		count -= vm_swapcached_flush(m, isblkdev);

		/*
		 * Setup for next loop using marker.
		 */
		vm_object_drop(object);
		vm_page_queues_spin_lock(marker->queue);
	}

	/*
	 * The marker could wind up at the end, which is ok.  If we hit the
	 * end of the list adjust the heuristic.
	 *
	 * Earlier inactive pages that were dirty and become clean
	 * are typically moved to the end of PQ_INACTIVE by virtue
	 * of vfs_vmio_release() when they become unwired from the
	 * buffer cache.
	 */
	vm_page_queues_spin_unlock(marker->queue);

	/*
	 * m invalid but can be used to test for NULL
	 */
	return (m == NULL);
}

/*
 * Flush the specified page using the swap_pager.  The page
 * must be busied by the caller and its disposition will become
 * the responsibility of this function.
 *
 * Try to collect surrounding pages, including pages which may
 * have already been assigned swap.  Try to cluster within a
 * contiguous aligned SWAP_META_PAGES (typ 16 x PAGE_SIZE) block
 * to match what swap_pager_putpages() can do.
 *
 * We also want to try to match against the buffer cache blocksize
 * but we don't really know what it is here.  Since the buffer cache
 * wires and unwires pages in groups the fact that we skip wired pages
 * should be sufficient.
 *
 * Returns a count of pages we might have flushed (minimum 1)
 */
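/*
 * Worked example of the clustering arithmetic below (an added
 * illustration, assuming SWAP_META_PAGES is 16 so SWAP_META_MASK is 15):
 * for a page at pindex 37, x = 37 & 15 = 5 and basei = 37.  The backward
 * loop probes pindex 36 down to 32 and the forward loop probes 38 up to
 * 47, so the cluster never crosses the aligned block [32, 48).
 */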
static
int
vm_swapcached_flush(vm_page_t m, int isblkdev)
{
	vm_object_t object;
	vm_page_t marray[SWAP_META_PAGES];
	vm_pindex_t basei;
	int rtvals[SWAP_META_PAGES];
	int x;
	int i;
	int j;
	int count;
	int error;

	vm_page_io_start(m);
	vm_page_protect(m, VM_PROT_READ);
	object = m->object;
	vm_object_hold(object);

	/*
	 * Try to cluster around (m), keeping in mind that the swap pager
	 * can only do SWAP_META_PAGES worth of contiguous write.
	 */
	x = (int)m->pindex & SWAP_META_MASK;
	marray[x] = m;
	basei = m->pindex;
	vm_page_wakeup(m);

	for (i = x - 1; i >= 0; --i) {
		m = vm_page_lookup_busy_try(object, basei - x + i,
					    TRUE, &error);
		if (error || m == NULL)
			break;
		if (vm_swapcache_test(m)) {
			vm_page_wakeup(m);
			break;
		}
		if (isblkdev && (m->flags & PG_NOTMETA)) {
			vm_page_wakeup(m);
			break;
		}
		vm_page_io_start(m);
		vm_page_protect(m, VM_PROT_READ);
		if (m->queue - m->pc == PQ_CACHE) {
			vm_page_unqueue_nowakeup(m);
			vm_page_deactivate(m);
		}
		marray[i] = m;
		vm_page_wakeup(m);
	}
	++i;

	for (j = x + 1; j < SWAP_META_PAGES; ++j) {
		m = vm_page_lookup_busy_try(object, basei - x + j,
					    TRUE, &error);
		if (error || m == NULL)
			break;
		if (vm_swapcache_test(m)) {
			vm_page_wakeup(m);
			break;
		}
		if (isblkdev && (m->flags & PG_NOTMETA)) {
			vm_page_wakeup(m);
			break;
		}
		vm_page_io_start(m);
		vm_page_protect(m, VM_PROT_READ);
		if (m->queue - m->pc == PQ_CACHE) {
			vm_page_unqueue_nowakeup(m);
			vm_page_deactivate(m);
		}
		marray[j] = m;
		vm_page_wakeup(m);
	}

	count = j - i;
	vm_object_pip_add(object, count);
	swap_pager_putpages(object, marray + i, count, FALSE, rtvals + i);
	vm_swapcache_write_count += count * PAGE_SIZE;
	vm_swapcache_curburst -= count * PAGE_SIZE;
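	/*
	 * Descriptive note (added): pages whose rtvals entry is
	 * VM_PAGER_PEND complete asynchronously and the swap pager is
	 * then responsible for the io_finish/pip_wakeup; we only finish
	 * the pages the pager disposed of synchronously.
	 */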
	while (i < j) {
		if (rtvals[i] != VM_PAGER_PEND) {
			vm_page_busy_wait(marray[i], FALSE, "swppgfd");
			vm_page_io_finish(marray[i]);
			vm_page_wakeup(marray[i]);
			vm_object_pip_wakeup(object);
		}
		++i;
	}
	vm_object_drop(object);
	return(count);
}

/*
 * Test whether a VM page is suitable for writing to the swapcache.
 * Does not test m->queue, PG_MARKER, or PG_SWAPPED.
 *
 * Returns 0 on success, 1 on failure
 */
static int
vm_swapcache_test(vm_page_t m)
{
	vm_object_t object;

	if (m->flags & PG_UNMANAGED)
		return(1);
	if (m->hold_count || m->wire_count)
		return(1);
	if (m->valid != VM_PAGE_BITS_ALL)
		return(1);
	if (m->dirty & m->valid)
		return(1);
	if ((object = m->object) == NULL)
		return(1);
	if (object->type != OBJT_VNODE ||
	    (object->flags & OBJ_DEAD)) {
		return(1);
	}
	vm_page_test_dirty(m);
	if (m->dirty & m->valid)
		return(1);
	return(0);
}

/*
 * Cleaning pass.
 *
 * We clean whole objects up to 16MB (vm_swapcache_cleanperobj).
 */
static
void
vm_swapcache_cleaning(vm_object_t marker)
{
	vm_object_t object;
	struct vnode *vp;
	int count;
	int scount;
	int n;

	count = vm_swapcache_maxlaunder;
	scount = vm_swapcache_maxscan;

	/*
	 * Look for vnode objects
	 */
	lwkt_gettoken(&vmobj_token);

	while ((object = TAILQ_NEXT(marker, object_list)) != NULL) {
		/*
		 * We have to skip markers.  We cannot hold/drop marker
		 * objects!
		 */
		if (object->type == OBJT_MARKER) {
			vm_swapcache_movemarker(marker, object);
			continue;
		}

		/*
		 * Safety, or in case there are millions of VM objects
		 * without swapcache backing.
		 */
		if (--scount <= 0)
			break;

		/*
		 * We must hold the object before potentially yielding.
		 */
		vm_object_hold(object);
		lwkt_yield();

		/*
		 * Only operate on live VNODE objects that are either
		 * VREG or VCHR (VCHR for meta-data).
		 */
		if ((object->type != OBJT_VNODE) ||
		    ((object->flags & OBJ_DEAD) ||
		     object->swblock_count == 0) ||
		    ((vp = object->handle) == NULL) ||
		    (vp->v_type != VREG && vp->v_type != VCHR)) {
			vm_object_drop(object);
			/* object may be invalid now */
			vm_swapcache_movemarker(marker, object);
			continue;
		}

		/*
		 * Reset the object pindex stored in the marker if the
		 * working object has changed.
		 */
		if (marker->backing_object != object) {
			marker->size = 0;
			marker->backing_object_offset = 0;
			marker->backing_object = object;
		}

		/*
		 * Look for swblocks starting at our iterator.
		 *
		 * The swap_pager_condfree() function attempts to free
		 * swap space starting at the specified index.  The index
		 * will be updated on return.  The function will return
		 * a scan factor (NOT the number of blocks freed).
		 *
		 * If it must cut its scan of the object short due to an
		 * excessive number of swblocks, or is able to free the
		 * requested number of blocks, it will return n >= count
		 * and we break and pick it back up on a future attempt.
		 *
		 * Scan the object linearly and try to batch large sets of
		 * blocks that are likely to clean out entire swap radix
		 * tree leafs.
		 */
		lwkt_token_swap();
		lwkt_reltoken(&vmobj_token);
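		/*
		 * Illustrative (added note): the rounding below batches
		 * the request up to a SWAP_META_PAGES multiple; e.g. with
		 * SWAP_META_PAGES at 16, a residual count of 100 asks
		 * condfree for (100 + 15) & ~15 = 112 blocks.
		 */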
		n = swap_pager_condfree(object, &marker->size,
					(count + SWAP_META_MASK) &
					~SWAP_META_MASK);

		vm_object_drop(object);		/* object may be invalid now */
		lwkt_gettoken(&vmobj_token);

		/*
		 * If we have exhausted the object or depleted our per-pass
		 * page limit then move us to the next object.  Note that
		 * the current object may no longer be on the vm_object_list.
		 */
		if (n <= 0 ||
		    marker->backing_object_offset > vm_swapcache_cleanperobj) {
			vm_swapcache_movemarker(marker, object);
		}

		/*
		 * If we have exhausted our max-launder stop for now.
		 */
		count -= n;
		marker->backing_object_offset += n * PAGE_SIZE;
		if (count < 0)
			break;
	}

	/*
	 * If we wound up at the end of the list this will move the
	 * marker back to the beginning.
	 */
	if (object == NULL)
		vm_swapcache_movemarker(marker, NULL);

	lwkt_reltoken(&vmobj_token);
}

/*
 * Move the marker past the current object.  Object can be stale, but we
 * still need it to determine if the marker has to be moved.  If the object
 * is still the 'current object' (object after the marker), we hop-scotch
 * the marker past it.
 */
static void
vm_swapcache_movemarker(vm_object_t marker, vm_object_t object)
{
	if (TAILQ_NEXT(marker, object_list) == object) {
		TAILQ_REMOVE(&vm_object_list, marker, object_list);
		if (object) {
			TAILQ_INSERT_AFTER(&vm_object_list, object,
					   marker, object_list);
		} else {
			TAILQ_INSERT_HEAD(&vm_object_list,
					  marker, object_list);
		}
	}
}