/*
 * (MPSAFE)
 *
 * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Implement the swapcache daemon.  When enabled, swap is assumed to be
 * configured on a fast storage device such as an SSD.  Swap is assigned
 * to clean vnode-backed pages in the inactive queue, clustered by object
 * if possible, and written out.  The swap assignment sticks around even
 * after the underlying pages have been recycled.
 *
 * The daemon manages write bandwidth based on sysctl settings to control
 * wear on the SSD.
 *
 * The vnode strategy code will check for the swap assignments and divert
 * reads to the swap device when the data is present in the swapcache.
 *
 * This operates on both regular files and the block device vnodes used by
 * filesystems to manage meta-data.
 */
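
/*
 * Illustrative only (not part of this file's interfaces): the daemon is
 * controlled entirely through the vm.swapcache sysctl tree declared below.
 * A typical way to turn it on might be:
 *
 *      sysctl vm.swapcache.data_enable=1     # cache regular file data
 *      sysctl vm.swapcache.meta_enable=1     # cache filesystem meta-data
 *      sysctl vm.swapcache.read_enable=1     # divert reads to the cache
 *      sysctl vm.swapcache.maxswappct=75     # use at most 75% of swap
 *
 * Exact values depend on the SSD and workload.  With use_chflags left at
 * its default of 1, only vnodes flagged for caching (VSWAPCACHE) have
 * their data cached; see the VREG case in vm_swapcache_writing() and the
 * swapcache(8) manual page.
 */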

#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>
#include <sys/eventhandler.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <vm/vm_page2.h>

/* the kernel process "swapcached" */
static int vm_swapcached_flush(vm_page_t m, int isblkdev);
static int vm_swapcache_test(vm_page_t m);
static int vm_swapcache_writing_heuristic(void);
static int vm_swapcache_writing(vm_page_t marker, int count, int scount);
static void vm_swapcache_cleaning(vm_object_t marker, int *swindexp);
static void vm_swapcache_movemarker(vm_object_t marker, int swindex,
                                    vm_object_t object);
struct thread *swapcached_thread;

SYSCTL_NODE(_vm, OID_AUTO, swapcache, CTLFLAG_RW, NULL, NULL);

int vm_swapcache_read_enable;
int vm_swapcache_inactive_heuristic;
static int vm_swapcache_sleep;
static int vm_swapcache_maxscan = PQ_L2_SIZE * 8;
static int vm_swapcache_maxlaunder = PQ_L2_SIZE * 4;
static int vm_swapcache_data_enable = 0;
static int vm_swapcache_meta_enable = 0;
static int vm_swapcache_maxswappct = 75;
static int vm_swapcache_hysteresis;
static int vm_swapcache_min_hysteresis;
int vm_swapcache_use_chflags = 1;       /* require chflags cache */
static int64_t vm_swapcache_minburst = 10000000LL;      /* 10MB */
static int64_t vm_swapcache_curburst = 4000000000LL;    /* 4G after boot */
static int64_t vm_swapcache_maxburst = 2000000000LL;    /* 2G nominal max */
static int64_t vm_swapcache_accrate = 100000LL;         /* 100K/s */
static int64_t vm_swapcache_write_count;
static int64_t vm_swapcache_maxfilesize;
static int64_t vm_swapcache_cleanperobj = 16*1024*1024;

SYSCTL_INT(_vm_swapcache, OID_AUTO, maxlaunder,
        CTLFLAG_RW, &vm_swapcache_maxlaunder, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, maxscan,
        CTLFLAG_RW, &vm_swapcache_maxscan, 0, "");

SYSCTL_INT(_vm_swapcache, OID_AUTO, data_enable,
        CTLFLAG_RW, &vm_swapcache_data_enable, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, meta_enable,
        CTLFLAG_RW, &vm_swapcache_meta_enable, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, read_enable,
        CTLFLAG_RW, &vm_swapcache_read_enable, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, maxswappct,
        CTLFLAG_RW, &vm_swapcache_maxswappct, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, hysteresis,
        CTLFLAG_RD, &vm_swapcache_hysteresis, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, min_hysteresis,
        CTLFLAG_RW, &vm_swapcache_min_hysteresis, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, use_chflags,
        CTLFLAG_RW, &vm_swapcache_use_chflags, 0, "");

SYSCTL_QUAD(_vm_swapcache, OID_AUTO, minburst,
        CTLFLAG_RW, &vm_swapcache_minburst, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, curburst,
        CTLFLAG_RW, &vm_swapcache_curburst, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxburst,
        CTLFLAG_RW, &vm_swapcache_maxburst, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxfilesize,
        CTLFLAG_RW, &vm_swapcache_maxfilesize, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, accrate,
        CTLFLAG_RW, &vm_swapcache_accrate, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, write_count,
        CTLFLAG_RW, &vm_swapcache_write_count, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, cleanperobj,
        CTLFLAG_RW, &vm_swapcache_cleanperobj, 0, "");

#define SWAPMAX(adj)    \
        ((int64_t)vm_swap_max * (vm_swapcache_maxswappct + (adj)) / 100)
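
/*
 * For illustration (not a definition): with the default maxswappct of 75,
 * SWAPMAX(0) is 75% of vm_swap_max and SWAPMAX(-10) is 65%.  The main loop
 * below writes into the swapcache until swap-cache use exceeds SWAPMAX(0),
 * then cleans assignments out until use drops below SWAPMAX(-10), giving a
 * 10% hysteresis band.
 */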

/*
 * When shutting down the machine we want to stop swapcache operation
 * immediately so swap is not accessed after devices have been shuttered.
 */
static void
shutdown_swapcache(void *arg __unused)
{
        vm_swapcache_read_enable = 0;
        vm_swapcache_data_enable = 0;
        vm_swapcache_meta_enable = 0;
        wakeup(&vm_swapcache_sleep);    /* shortcut 5-second wait */
}

/*
 * vm_swapcached is the high level swapcache daemon.
 *
 * No requirements.
 */
static void
vm_swapcached_thread(void)
{
        enum { SWAPC_WRITING, SWAPC_CLEANING } state = SWAPC_WRITING;
        enum { SWAPB_BURSTING, SWAPB_RECOVERING } burst = SWAPB_BURSTING;
        static struct vm_page page_marker[PQ_L2_SIZE];
        static struct vm_object swmarker;
        static int swindex;
        int q;

        /*
         * Thread setup
         */
        curthread->td_flags |= TDF_SYSTHREAD;
        EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc,
                              swapcached_thread, SHUTDOWN_PRI_FIRST);
        EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_swapcache,
                              NULL, SHUTDOWN_PRI_SECOND);

        /*
         * Initialize our marker for the inactive scan (SWAPC_WRITING)
         */
        bzero(&page_marker, sizeof(page_marker));
        for (q = 0; q < PQ_L2_SIZE; ++q) {
                page_marker[q].flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
                page_marker[q].queue = PQ_INACTIVE + q;
                page_marker[q].pc = q;
                page_marker[q].wire_count = 1;
                vm_page_queues_spin_lock(PQ_INACTIVE + q);
                TAILQ_INSERT_HEAD(
                        &vm_page_queues[PQ_INACTIVE + q].pl,
                        &page_marker[q], pageq);
                vm_page_queues_spin_unlock(PQ_INACTIVE + q);
        }

        vm_swapcache_min_hysteresis = 1024;
        vm_swapcache_hysteresis = vm_swapcache_min_hysteresis;
        vm_swapcache_inactive_heuristic = -vm_swapcache_hysteresis;

        /*
         * Initialize our marker for the vm_object scan (SWAPC_CLEANING)
         */
        bzero(&swmarker, sizeof(swmarker));
        swmarker.type = OBJT_MARKER;
        swindex = 0;
        lwkt_gettoken(&vmobj_tokens[swindex]);
        TAILQ_INSERT_HEAD(&vm_object_lists[swindex],
                          &swmarker, object_list);
        lwkt_reltoken(&vmobj_tokens[swindex]);

        for (;;) {
                int reached_end;
                int scount;
                int count;

                /*
                 * Handle shutdown
                 */
                kproc_suspend_loop();

                /*
                 * Check every 5 seconds when not enabled or if no swap
                 * is present.
                 */
                if ((vm_swapcache_data_enable == 0 &&
                     vm_swapcache_meta_enable == 0) ||
                    vm_swap_max == 0) {
                        tsleep(&vm_swapcache_sleep, 0, "csleep", hz * 5);
                        continue;
                }

                /*
                 * Polling rate when enabled is approximately 10 hz.
                 */
                tsleep(&vm_swapcache_sleep, 0, "csleep", hz / 10);

                /*
                 * State hysteresis.  Generate write activity up to 75% of
                 * swap, then clean out swap assignments down to 65%, then
                 * repeat.
                 */
                if (state == SWAPC_WRITING) {
                        if (vm_swap_cache_use > SWAPMAX(0))
                                state = SWAPC_CLEANING;
                } else {
                        if (vm_swap_cache_use < SWAPMAX(-10))
                                state = SWAPC_WRITING;
                }

                /*
                 * We are allowed to continue accumulating burst value
                 * in either state.  Allow the user to set curburst > maxburst
                 * for the initial load-in.
                 */
                if (vm_swapcache_curburst < vm_swapcache_maxburst) {
                        vm_swapcache_curburst += vm_swapcache_accrate / 10;
                        if (vm_swapcache_curburst > vm_swapcache_maxburst)
                                vm_swapcache_curburst = vm_swapcache_maxburst;
                }
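
                /*
                 * Worked example with the defaults, for orientation only:
                 * accrate is 100000 bytes/sec and the loop polls at ~10hz,
                 * so each pass above adds ~10000 bytes of write budget to
                 * curburst.  vm_swapcached_flush() later debits
                 * count * PAGE_SIZE per cluster written, so sustained
                 * writing converges on roughly the accrate bandwidth.
                 */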

                /*
                 * We don't want to nickel-and-dime the scan as that will
                 * create unnecessary fragmentation.  The minimum burst
                 * is one second's worth of accumulation.
                 */
                if (state != SWAPC_WRITING) {
                        vm_swapcache_cleaning(&swmarker, &swindex);
                        continue;
                }
                if (vm_swapcache_curburst < vm_swapcache_accrate)
                        continue;

                reached_end = 0;
                count = vm_swapcache_maxlaunder / PQ_L2_SIZE + 2;
                scount = vm_swapcache_maxscan / PQ_L2_SIZE + 2;

                if (burst == SWAPB_BURSTING) {
                        if (vm_swapcache_writing_heuristic()) {
                                for (q = 0; q < PQ_L2_SIZE; ++q) {
                                        reached_end +=
                                            vm_swapcache_writing(
                                                &page_marker[q],
                                                count,
                                                scount);
                                }
                        }
                        if (vm_swapcache_curburst <= 0)
                                burst = SWAPB_RECOVERING;
                } else if (vm_swapcache_curburst > vm_swapcache_minburst) {
                        if (vm_swapcache_writing_heuristic()) {
                                for (q = 0; q < PQ_L2_SIZE; ++q) {
                                        reached_end +=
                                            vm_swapcache_writing(
                                                &page_marker[q],
                                                count,
                                                scount);
                                }
                        }
                        burst = SWAPB_BURSTING;
                }
                if (reached_end == PQ_L2_SIZE) {
                        vm_swapcache_inactive_heuristic =
                                -vm_swapcache_hysteresis;
                }
        }

        /*
         * Cleanup (NOT REACHED)
         */
        for (q = 0; q < PQ_L2_SIZE; ++q) {
                vm_page_queues_spin_lock(PQ_INACTIVE + q);
                TAILQ_REMOVE(
                        &vm_page_queues[PQ_INACTIVE + q].pl,
                        &page_marker[q], pageq);
                vm_page_queues_spin_unlock(PQ_INACTIVE + q);
        }

        lwkt_gettoken(&vmobj_tokens[swindex]);
        TAILQ_REMOVE(&vm_object_lists[swindex], &swmarker, object_list);
        lwkt_reltoken(&vmobj_tokens[swindex]);
}

static struct kproc_desc swpc_kp = {
        "swapcached",
        vm_swapcached_thread,
        &swapcached_thread
};
SYSINIT(swapcached, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &swpc_kp)

/*
 * Deal with an overflow of the heuristic counter or if the user
 * manually changes the hysteresis.
 *
 * Try to avoid small incremental pageouts by waiting for enough
 * pages to build up in the inactive queue to hopefully get a good
 * burst in.  This heuristic is bumped by the VM system and reset
 * when our scan hits the end of the queue.
 *
 * Return TRUE if we need to take a writing pass.
 */
static int
vm_swapcache_writing_heuristic(void)
{
        int hyst;

        hyst = vmstats.v_inactive_count / 4;
        if (hyst < vm_swapcache_min_hysteresis)
                hyst = vm_swapcache_min_hysteresis;
        cpu_ccfence();
        vm_swapcache_hysteresis = hyst;

        if (vm_swapcache_inactive_heuristic < -hyst)
                vm_swapcache_inactive_heuristic = -hyst;

        return (vm_swapcache_inactive_heuristic >= 0);
}
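
/*
 * Illustrative numbers only: with v_inactive_count at 8192 the hysteresis
 * becomes 2048 (or min_hysteresis if that is larger).  After a writing pass
 * reaches the end of every inactive queue the main loop resets
 * vm_swapcache_inactive_heuristic to -2048, so roughly 2048 more pages must
 * enter the inactive queues (the VM system bumps the counter) before this
 * function returns TRUE and another burst is attempted.
 */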

/*
 * Take a writing pass on one of the inactive queues, return non-zero if
 * we hit the end of the queue.
 */
static int
vm_swapcache_writing(vm_page_t marker, int count, int scount)
{
        vm_object_t object;
        struct vnode *vp;
        vm_page_t m;
        int isblkdev;

        /*
         * Scan the inactive queue from our marker to locate
         * suitable pages to push to the swap cache.
         *
         * We are looking for clean vnode-backed pages.
         */
        vm_page_queues_spin_lock(marker->queue);
        while ((m = TAILQ_NEXT(marker, pageq)) != NULL &&
               count > 0 && scount-- > 0) {
                KKASSERT(m->queue == marker->queue);

                if (vm_swapcache_curburst < 0)
                        break;
                TAILQ_REMOVE(
                        &vm_page_queues[marker->queue].pl, marker, pageq);
                TAILQ_INSERT_AFTER(
                        &vm_page_queues[marker->queue].pl, m, marker, pageq);

                /*
                 * Ignore markers and ignore pages that already have a swap
                 * assignment.
                 */
                if (m->flags & (PG_MARKER | PG_SWAPPED))
                        continue;
                if (vm_page_busy_try(m, TRUE))
                        continue;
                vm_page_queues_spin_unlock(marker->queue);

                if ((object = m->object) == NULL) {
                        vm_page_wakeup(m);
                        vm_page_queues_spin_lock(marker->queue);
                        continue;
                }
                vm_object_hold(object);
                if (m->object != object) {
                        vm_object_drop(object);
                        vm_page_wakeup(m);
                        vm_page_queues_spin_lock(marker->queue);
                        continue;
                }
                if (vm_swapcache_test(m)) {
                        vm_object_drop(object);
                        vm_page_wakeup(m);
                        vm_page_queues_spin_lock(marker->queue);
                        continue;
                }

                vp = object->handle;
                if (vp == NULL) {
                        vm_object_drop(object);
                        vm_page_wakeup(m);
                        vm_page_queues_spin_lock(marker->queue);
                        continue;
                }

                switch(vp->v_type) {
                case VREG:
                        /*
                         * PG_NOTMETA generically means 'don't swapcache this',
                         * and HAMMER will set this for regular data buffers
                         * (and leave it unset for meta-data buffers) as
                         * appropriate when double buffering is enabled.
                         */
                        if (m->flags & PG_NOTMETA) {
                                vm_object_drop(object);
                                vm_page_wakeup(m);
                                vm_page_queues_spin_lock(marker->queue);
                                continue;
                        }

                        /*
                         * If data_enable is 0 do not try to swapcache data.
                         * If use_chflags is set then only swapcache data for
                         * VSWAPCACHE marked vnodes, otherwise any vnode.
                         */
                        if (vm_swapcache_data_enable == 0 ||
                            ((vp->v_flag & VSWAPCACHE) == 0 &&
                             vm_swapcache_use_chflags)) {
                                vm_object_drop(object);
                                vm_page_wakeup(m);
                                vm_page_queues_spin_lock(marker->queue);
                                continue;
                        }
                        if (vm_swapcache_maxfilesize &&
                            object->size >
                            (vm_swapcache_maxfilesize >> PAGE_SHIFT)) {
                                vm_object_drop(object);
                                vm_page_wakeup(m);
                                vm_page_queues_spin_lock(marker->queue);
                                continue;
                        }
                        isblkdev = 0;
                        break;
                case VCHR:
                        /*
                         * PG_NOTMETA generically means 'don't swapcache this',
                         * and HAMMER will set this for regular data buffers
                         * (and leave it unset for meta-data buffers) as
                         * appropriate when double buffering is enabled.
                         */
                        if (m->flags & PG_NOTMETA) {
                                vm_object_drop(object);
                                vm_page_wakeup(m);
                                vm_page_queues_spin_lock(marker->queue);
                                continue;
                        }
                        if (vm_swapcache_meta_enable == 0) {
                                vm_object_drop(object);
                                vm_page_wakeup(m);
                                vm_page_queues_spin_lock(marker->queue);
                                continue;
                        }
                        isblkdev = 1;
                        break;
                default:
                        vm_object_drop(object);
                        vm_page_wakeup(m);
                        vm_page_queues_spin_lock(marker->queue);
                        continue;
                }

                /*
                 * Assign swap and initiate I/O.
                 *
                 * (adjust for the --count which also occurs in the loop)
                 */
                count -= vm_swapcached_flush(m, isblkdev);

                /*
                 * Setup for next loop using marker.
                 */
                vm_object_drop(object);
                vm_page_queues_spin_lock(marker->queue);
        }

        /*
         * The marker could wind up at the end, which is ok.  If we hit the
         * end of the list adjust the heuristic.
         *
         * Earlier inactive pages that were dirty and become clean
         * are typically moved to the end of PQ_INACTIVE by virtue
         * of vfs_vmio_release() when they become unwired from the
         * buffer cache.
         */
        vm_page_queues_spin_unlock(marker->queue);

        /*
         * m invalid but can be used to test for NULL
         */
        return (m == NULL);
}

/*
 * Flush the specified page using the swap_pager.  The page
 * must be busied by the caller and its disposition will become
 * the responsibility of this function.
 *
 * Try to collect surrounding pages, including pages which may
 * have already been assigned swap.  Try to cluster within a
 * contiguous aligned SWAP_META_PAGES (typ 16 x PAGE_SIZE) block
 * to match what swap_pager_putpages() can do.
 *
 * We also want to try to match against the buffer cache blocksize
 * but we don't really know what it is here.  Since the buffer cache
 * wires and unwires pages in groups the fact that we skip wired pages
 * should be sufficient.
 *
 * Returns a count of pages we might have flushed (minimum 1)
 */
static
int
vm_swapcached_flush(vm_page_t m, int isblkdev)
{
        vm_object_t object;
        vm_page_t marray[SWAP_META_PAGES];
        vm_pindex_t basei;
        int rtvals[SWAP_META_PAGES];
        int x;
        int i;
        int j;
        int count;
        int error;

        vm_page_io_start(m);
        vm_page_protect(m, VM_PROT_READ);
        object = m->object;
        vm_object_hold(object);

        /*
         * Try to cluster around (m), keeping in mind that the swap pager
         * can only do SWAP_META_PAGES worth of contiguous write.
         */
        x = (int)m->pindex & SWAP_META_MASK;
        marray[x] = m;
        basei = m->pindex;
        vm_page_wakeup(m);

        for (i = x - 1; i >= 0; --i) {
                m = vm_page_lookup_busy_try(object, basei - x + i,
                                            TRUE, &error);
                if (error || m == NULL)
                        break;
                if (vm_swapcache_test(m)) {
                        vm_page_wakeup(m);
                        break;
                }
                if (isblkdev && (m->flags & PG_NOTMETA)) {
                        vm_page_wakeup(m);
                        break;
                }
                vm_page_io_start(m);
                vm_page_protect(m, VM_PROT_READ);
                if (m->queue - m->pc == PQ_CACHE) {
                        vm_page_unqueue_nowakeup(m);
                        vm_page_deactivate(m);
                }
                marray[i] = m;
                vm_page_wakeup(m);
        }
        ++i;

        for (j = x + 1; j < SWAP_META_PAGES; ++j) {
                m = vm_page_lookup_busy_try(object, basei - x + j,
                                            TRUE, &error);
                if (error || m == NULL)
                        break;
                if (vm_swapcache_test(m)) {
                        vm_page_wakeup(m);
                        break;
                }
                if (isblkdev && (m->flags & PG_NOTMETA)) {
                        vm_page_wakeup(m);
                        break;
                }
                vm_page_io_start(m);
                vm_page_protect(m, VM_PROT_READ);
                if (m->queue - m->pc == PQ_CACHE) {
                        vm_page_unqueue_nowakeup(m);
                        vm_page_deactivate(m);
                }
                marray[j] = m;
                vm_page_wakeup(m);
        }

        count = j - i;
        vm_object_pip_add(object, count);
        swap_pager_putpages(object, marray + i, count, FALSE, rtvals + i);
        vm_swapcache_write_count += count * PAGE_SIZE;
        vm_swapcache_curburst -= count * PAGE_SIZE;

        while (i < j) {
                if (rtvals[i] != VM_PAGER_PEND) {
                        vm_page_busy_wait(marray[i], FALSE, "swppgfd");
                        vm_page_io_finish(marray[i]);
                        vm_page_wakeup(marray[i]);
                        vm_object_pip_wakeup(object);
                }
                ++i;
        }
        vm_object_drop(object);
        return(count);
}
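
/*
 * Worked example of the clustering above, for orientation only and assuming
 * SWAP_META_PAGES is 16 as the comment notes: for a page at pindex 37,
 * x = 37 & 15 = 5 and marray[5] = m.  The downward scan probes pindexes
 * 36..32 and the upward scan probes 38..47, so the cluster never crosses
 * the aligned 16-page block [32, 47] that a single swap_pager_putpages()
 * call can handle.
 */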

/*
 * Test whether a VM page is suitable for writing to the swapcache.
 * Does not test m->queue, PG_MARKER, or PG_SWAPPED.
 *
 * Returns 0 on success, 1 on failure
 */
static int
vm_swapcache_test(vm_page_t m)
{
        vm_object_t object;

        if (m->flags & PG_UNMANAGED)
                return(1);
        if (m->hold_count || m->wire_count)
                return(1);
        if (m->valid != VM_PAGE_BITS_ALL)
                return(1);
        if (m->dirty & m->valid)
                return(1);
        if ((object = m->object) == NULL)
                return(1);
        if (object->type != OBJT_VNODE ||
            (object->flags & OBJ_DEAD)) {
                return(1);
        }
        vm_page_test_dirty(m);
        if (m->dirty & m->valid)
                return(1);
        return(0);
}

/*
 * Cleaning pass.
 *
 * We clean whole objects up to 16MB
 */
static
void
vm_swapcache_cleaning(vm_object_t marker, int *swindexp)
{
        vm_object_t object;
        struct vnode *vp;
        int count;
        int scount;
        int n;

        count = vm_swapcache_maxlaunder;
        scount = vm_swapcache_maxscan;

        /*
         * Look for vnode objects
         */
        lwkt_gettoken(&vmobj_tokens[*swindexp]);

outerloop:
        while ((object = TAILQ_NEXT(marker, object_list)) != NULL) {
                /*
                 * We have to skip markers.  We cannot hold/drop marker
                 * objects!
                 */
                if (object->type == OBJT_MARKER) {
                        vm_swapcache_movemarker(marker, *swindexp, object);
                        continue;
                }

                /*
                 * Safety, or in case there are millions of VM objects
                 * without swapcache backing.
                 */
                if (--scount <= 0)
                        goto breakout;

                /*
                 * We must hold the object before potentially yielding.
                 */
                vm_object_hold(object);
                lwkt_yield();

                /*
                 * Only operate on live VNODE objects that are either
                 * VREG or VCHR (VCHR for meta-data).
                 */
                if ((object->type != OBJT_VNODE) ||
                    ((object->flags & OBJ_DEAD) ||
                     object->swblock_count == 0) ||
                    ((vp = object->handle) == NULL) ||
                    (vp->v_type != VREG && vp->v_type != VCHR)) {
                        vm_object_drop(object);
                        /* object may be invalid now */
                        vm_swapcache_movemarker(marker, *swindexp, object);
                        continue;
                }

                /*
                 * Reset the object pindex stored in the marker if the
                 * working object has changed.
                 */
                if (marker->backing_object != object) {
                        marker->size = 0;
                        marker->backing_object_offset = 0;
                        marker->backing_object = object;
                }

                /*
                 * Look for swblocks starting at our iterator.
                 *
                 * The swap_pager_condfree() function attempts to free
                 * swap space starting at the specified index.  The index
                 * will be updated on return.  The function will return
                 * a scan factor (NOT the number of blocks freed).
                 *
                 * If it must cut its scan of the object short due to an
                 * excessive number of swblocks, or is able to free the
                 * requested number of blocks, it will return n >= count
                 * and we break and pick it back up on a future attempt.
                 *
                 * Scan the object linearly and try to batch large sets of
                 * blocks that are likely to clean out entire swap radix
                 * tree leaves.
                 */
                lwkt_token_swap();
                lwkt_reltoken(&vmobj_tokens[*swindexp]);

                n = swap_pager_condfree(object, &marker->size,
                                (count + SWAP_META_MASK) & ~SWAP_META_MASK);

                vm_object_drop(object);         /* object may be invalid now */
                lwkt_gettoken(&vmobj_tokens[*swindexp]);

                /*
                 * If we have exhausted the object or depleted our per-pass
                 * page limit then move us to the next object.  Note that
                 * the current object may no longer be on the vm_object_list.
                 */
                if (n <= 0 ||
                    marker->backing_object_offset > vm_swapcache_cleanperobj) {
                        vm_swapcache_movemarker(marker, *swindexp, object);
                }

                /*
                 * If we have exhausted our max-launder stop for now.
                 */
                count -= n;
                marker->backing_object_offset += n * PAGE_SIZE;
                if (count < 0)
                        goto breakout;
        }

        /*
         * Iterate vm_object_lists[] hash table
         */
        TAILQ_REMOVE(&vm_object_lists[*swindexp], marker, object_list);
        lwkt_reltoken(&vmobj_tokens[*swindexp]);
        if (++*swindexp >= VMOBJ_HSIZE)
                *swindexp = 0;
        lwkt_gettoken(&vmobj_tokens[*swindexp]);
        TAILQ_INSERT_HEAD(&vm_object_lists[*swindexp], marker, object_list);

        if (*swindexp != 0)
                goto outerloop;

breakout:
        lwkt_reltoken(&vmobj_tokens[*swindexp]);
}

/*
 * Move the marker past the current object.  Object can be stale, but we
 * still need it to determine if the marker has to be moved.  If the object
 * is still the 'current object' (object after the marker), we hop-scotch
 * the marker past it.
 */
static void
vm_swapcache_movemarker(vm_object_t marker, int swindex, vm_object_t object)
{
        if (TAILQ_NEXT(marker, object_list) == object) {
                TAILQ_REMOVE(&vm_object_lists[swindex], marker, object_list);
                TAILQ_INSERT_AFTER(&vm_object_lists[swindex], object,
                                   marker, object_list);
        }
}