/*
 * (MPSAFE)
 *
 * Copyright (c) 2010,2019 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Implement the swapcache daemon.  When enabled swap is assumed to be
 * configured on a fast storage device such as an SSD.  Swap is assigned
 * to clean vnode-backed pages in the inactive queue, clustered by object
 * if possible, and written out.  The swap assignment sticks around even
 * after the underlying pages have been recycled.
 *
 * The daemon manages write bandwidth based on sysctl settings to control
 * wear on the SSD.
 *
 * The vnode strategy code will check for the swap assignments and divert
 * reads to the swap device when the data is present in the swapcache.
 *
 * This operates on both regular files and the block device vnodes used by
 * filesystems to manage meta-data.
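 *
 * Typical use is to enable one or both caches from userland and let the
 * rate sysctls below limit wear on the SSD, e.g. (illustrative values,
 * not a recommendation):
 *
 *	sysctl vm.swapcache.data_enable=1
 *	sysctl vm.swapcache.meta_enable=1
 *	sysctl vm.swapcache.accrate=100000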
 */

#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>
#include <sys/eventhandler.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>

#include <sys/spinlock2.h>
#include <vm/vm_page2.h>

struct swmarker {
	struct vm_object dummy_obj;
	struct vm_object *save_obj;
	vm_ooffset_t save_off;
};

typedef struct swmarker swmarker_t;

/* the kernel process "swapcached" */
static int vm_swapcached_flush(vm_page_t m, int isblkdev);
static int vm_swapcache_test(vm_page_t m);
static int vm_swapcache_writing_heuristic(void);
static int vm_swapcache_writing(vm_page_t marker, int count, int scount);
static void vm_swapcache_cleaning(swmarker_t *marker,
			struct vm_object_hash **swindexp);
static void vm_swapcache_movemarker(swmarker_t *marker,
			struct vm_object_hash *swindex, vm_object_t object);
struct thread *swapcached_thread;

SYSCTL_NODE(_vm, OID_AUTO, swapcache, CTLFLAG_RW, NULL, NULL);

int vm_swapcache_read_enable;
static long vm_swapcache_wtrigger;
static int vm_swapcache_sleep;
static int vm_swapcache_maxscan = PQ_L2_SIZE * 8;
static int vm_swapcache_maxlaunder = PQ_L2_SIZE * 4;
static int vm_swapcache_data_enable = 0;
static int vm_swapcache_meta_enable = 0;
static int vm_swapcache_maxswappct = 75;
static int vm_swapcache_hysteresis;
static int vm_swapcache_min_hysteresis;
int vm_swapcache_use_chflags = 0;	/* require chflags cache */
static int64_t vm_swapcache_minburst = 10000000LL;	/* 10MB */
static int64_t vm_swapcache_curburst = 4000000000LL;	/* 4G after boot */
static int64_t vm_swapcache_maxburst = 2000000000LL;	/* 2G nominal max */
static int64_t vm_swapcache_accrate = 100000LL;		/* 100K/s */
static int64_t vm_swapcache_write_count;
static int64_t vm_swapcache_maxfilesize;
static int64_t vm_swapcache_cleanperobj = 16*1024*1024;

SYSCTL_INT(_vm_swapcache, OID_AUTO, maxlaunder,
	CTLFLAG_RW, &vm_swapcache_maxlaunder, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, maxscan,
	CTLFLAG_RW, &vm_swapcache_maxscan, 0, "");

SYSCTL_INT(_vm_swapcache, OID_AUTO, data_enable,
	CTLFLAG_RW, &vm_swapcache_data_enable, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, meta_enable,
	CTLFLAG_RW, &vm_swapcache_meta_enable, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, read_enable,
	CTLFLAG_RW, &vm_swapcache_read_enable, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, maxswappct,
	CTLFLAG_RW, &vm_swapcache_maxswappct, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, hysteresis,
	CTLFLAG_RD, &vm_swapcache_hysteresis, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, min_hysteresis,
	CTLFLAG_RW, &vm_swapcache_min_hysteresis, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, use_chflags,
	CTLFLAG_RW, &vm_swapcache_use_chflags, 0, "");

SYSCTL_QUAD(_vm_swapcache, OID_AUTO, minburst,
	CTLFLAG_RW, &vm_swapcache_minburst, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, curburst,
	CTLFLAG_RW, &vm_swapcache_curburst, 0, "");
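
/*
 * Burst accounting: the main loop below credits curburst with roughly
 * accrate/10 bytes every ~1/10 second tick (capped at maxburst) and
 * vm_swapcached_flush() debits it by the bytes actually written.  Thus
 * accrate sets the long-term average write rate, maxburst bounds
 * short-term bursts, and minburst keeps the daemon from resuming with
 * tiny, fragmenting writes while recovering.
 */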
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxburst,
	CTLFLAG_RW, &vm_swapcache_maxburst, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxfilesize,
	CTLFLAG_RW, &vm_swapcache_maxfilesize, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, accrate,
	CTLFLAG_RW, &vm_swapcache_accrate, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, write_count,
	CTLFLAG_RW, &vm_swapcache_write_count, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, cleanperobj,
	CTLFLAG_RW, &vm_swapcache_cleanperobj, 0, "");

#define SWAPMAX(adj)	\
	((int64_t)vm_swap_max * (vm_swapcache_maxswappct + (adj)) / 100)

/*
 * When shutting down the machine we want to stop swapcache operation
 * immediately so swap is not accessed after devices have been shuttered.
 */
static void
shutdown_swapcache(void *arg __unused)
{
	vm_swapcache_read_enable = 0;
	vm_swapcache_data_enable = 0;
	vm_swapcache_meta_enable = 0;
	wakeup(&vm_swapcache_sleep); /* shortcut 5-second wait */
}

/*
 * vm_swapcached is the high level swapcache daemon.
 *
 * No requirements.
 */
static void
vm_swapcached_thread(void)
{
	enum { SWAPC_WRITING, SWAPC_CLEANING } state = SWAPC_WRITING;
	enum { SWAPB_BURSTING, SWAPB_RECOVERING } burst = SWAPB_BURSTING;
	static struct vm_page page_marker[PQ_L2_SIZE];
	static swmarker_t swmarker;
	static struct vm_object_hash *swindex;
	int q;

	/*
	 * Thread setup
	 */
	curthread->td_flags |= TDF_SYSTHREAD;
	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc,
			      swapcached_thread, SHUTDOWN_PRI_FIRST);
	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_swapcache,
			      NULL, SHUTDOWN_PRI_SECOND);

	/*
	 * Initialize our marker for the inactive scan (SWAPC_WRITING)
	 */
	bzero(&page_marker, sizeof(page_marker));
	for (q = 0; q < PQ_L2_SIZE; ++q) {
		page_marker[q].flags = PG_FICTITIOUS | PG_MARKER;
		page_marker[q].busy_count = PBUSY_LOCKED;
		page_marker[q].queue = PQ_INACTIVE + q;
		page_marker[q].pc = q;
		page_marker[q].wire_count = 1;
		vm_page_queues_spin_lock(PQ_INACTIVE + q);
		TAILQ_INSERT_HEAD(
			&vm_page_queues[PQ_INACTIVE + q].pl,
			&page_marker[q], pageq);
		vm_page_queues_spin_unlock(PQ_INACTIVE + q);
	}

	vm_swapcache_min_hysteresis = 1024;
	vm_swapcache_hysteresis = vm_swapcache_min_hysteresis;
	vm_swapcache_wtrigger = -vm_swapcache_hysteresis;

	/*
	 * Initialize our marker for the vm_object scan (SWAPC_CLEANING)
	 */
	bzero(&swmarker, sizeof(swmarker));
	swmarker.dummy_obj.type = OBJT_MARKER;
	swindex = &vm_object_hash[0];
	lwkt_gettoken(&swindex->token);
	TAILQ_INSERT_HEAD(&swindex->list, &swmarker.dummy_obj, object_entry);
	lwkt_reltoken(&swindex->token);

	for (;;) {
		int reached_end;
		int scount;
		int count;

		/*
		 * Handle shutdown
		 */
		kproc_suspend_loop();

		/*
		 * Check every 5 seconds when not enabled or if no swap
		 * is present.
		 */
		if ((vm_swapcache_data_enable == 0 &&
		     vm_swapcache_meta_enable == 0 &&
		     vm_swap_cache_use <= SWAPMAX(0)) ||
		    vm_swap_max == 0) {
			tsleep(&vm_swapcache_sleep, 0, "csleep", hz * 5);
			continue;
		}

		/*
		 * Polling rate when enabled is approximately 10 hz.
		 */
		tsleep(&vm_swapcache_sleep, 0, "csleep", hz / 10);

		/*
		 * State hysteresis.  Generate write activity up to 75% of
		 * swap, then clean out swap assignments down to 65%, then
		 * repeat.
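		 *
		 * With the default maxswappct of 75, SWAPMAX(0) evaluates
		 * to 75% of vm_swap_max and SWAPMAX(-10) to 65%.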
		 */
		if (state == SWAPC_WRITING) {
			if (vm_swap_cache_use > SWAPMAX(0))
				state = SWAPC_CLEANING;
		} else {
			if (vm_swap_cache_use < SWAPMAX(-10))
				state = SWAPC_WRITING;
		}

		/*
		 * We are allowed to continue accumulating burst value
		 * in either state.  Allow the user to set curburst > maxburst
		 * for the initial load-in.
		 */
		if (vm_swapcache_curburst < vm_swapcache_maxburst) {
			vm_swapcache_curburst += vm_swapcache_accrate / 10;
			if (vm_swapcache_curburst > vm_swapcache_maxburst)
				vm_swapcache_curburst = vm_swapcache_maxburst;
		}

		/*
		 * We don't want to nickel-and-dime the scan as that will
		 * create unnecessary fragmentation.  The minimum burst
		 * is one second's worth of accumulation.
		 */
		if (state != SWAPC_WRITING) {
			vm_swapcache_cleaning(&swmarker, &swindex);
			continue;
		}
		if (vm_swapcache_curburst < vm_swapcache_accrate)
			continue;

		reached_end = 0;
		count = vm_swapcache_maxlaunder / PQ_L2_SIZE + 2;
		scount = vm_swapcache_maxscan / PQ_L2_SIZE + 2;

		if (burst == SWAPB_BURSTING) {
			if (vm_swapcache_writing_heuristic()) {
				for (q = 0; q < PQ_L2_SIZE; ++q) {
					reached_end +=
						vm_swapcache_writing(
							&page_marker[q],
							count,
							scount);
				}
			}
			if (vm_swapcache_curburst <= 0)
				burst = SWAPB_RECOVERING;
		} else if (vm_swapcache_curburst > vm_swapcache_minburst) {
			if (vm_swapcache_writing_heuristic()) {
				for (q = 0; q < PQ_L2_SIZE; ++q) {
					reached_end +=
						vm_swapcache_writing(
							&page_marker[q],
							count,
							scount);
				}
			}
			burst = SWAPB_BURSTING;
		}
		if (reached_end == PQ_L2_SIZE) {
			vm_swapcache_wtrigger = -vm_swapcache_hysteresis;
		}
	}

	/*
	 * Cleanup (NOT REACHED)
	 */
	for (q = 0; q < PQ_L2_SIZE; ++q) {
		vm_page_queues_spin_lock(PQ_INACTIVE + q);
		TAILQ_REMOVE(
			&vm_page_queues[PQ_INACTIVE + q].pl,
			&page_marker[q], pageq);
		vm_page_queues_spin_unlock(PQ_INACTIVE + q);
	}

	lwkt_gettoken(&swindex->token);
	TAILQ_REMOVE(&swindex->list, &swmarker.dummy_obj, object_entry);
	lwkt_reltoken(&swindex->token);
}

static struct kproc_desc swpc_kp = {
	"swapcached",
	vm_swapcached_thread,
	&swapcached_thread
};
SYSINIT(swapcached, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &swpc_kp);

/*
 * Deal with an overflow of the heuristic counter or if the user
 * manually changes the hysteresis.
 *
 * Try to avoid small incremental pageouts by waiting for enough
 * pages to build up in the inactive queue to hopefully get a good
 * burst in.  This heuristic is bumped by the VM system and reset
 * when our scan hits the end of the queue.
 *
 * Return TRUE if we need to take a writing pass.
 */
static int
vm_swapcache_writing_heuristic(void)
{
	int hyst;
	int q;
	long adds;

	hyst = vmstats.v_inactive_count / 4;
	if (hyst < vm_swapcache_min_hysteresis)
		hyst = vm_swapcache_min_hysteresis;
	cpu_ccfence();
	vm_swapcache_hysteresis = hyst;

	adds = 0;
	for (q = PQ_INACTIVE; q < PQ_INACTIVE + PQ_L2_SIZE; ++q) {
		adds += atomic_swap_long(&vm_page_queues[q].adds, 0);
	}
	vm_swapcache_wtrigger += adds;
	if (vm_swapcache_wtrigger < -hyst)
		vm_swapcache_wtrigger = -hyst;
	return (vm_swapcache_wtrigger >= 0);
}

/*
 * Take a writing pass on one of the inactive queues, return non-zero if
 * we hit the end of the queue.
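 *
 * The caller (vm_swapcached_thread) only resets vm_swapcache_wtrigger
 * back to -hysteresis once all PQ_L2_SIZE queue scans report hitting
 * the end.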
 */
static int
vm_swapcache_writing(vm_page_t marker, int count, int scount)
{
	vm_object_t object;
	struct vnode *vp;
	vm_page_t m;
	int isblkdev;

	/*
	 * Scan the inactive queue from our marker to locate
	 * suitable pages to push to the swap cache.
	 *
	 * We are looking for clean vnode-backed pages.
	 */
	vm_page_queues_spin_lock(marker->queue);
	while ((m = TAILQ_NEXT(marker, pageq)) != NULL &&
	       count > 0 && scount-- > 0) {
		KKASSERT(m->queue == marker->queue);

		/*
		 * Stop using swap if panicked, dumping, or dumped.
		 * Don't try to write if our curburst has been exhausted.
		 */
		if (panicstr || dumping)
			break;
		if (vm_swapcache_curburst < 0)
			break;

		/*
		 * Move marker
		 */
		TAILQ_REMOVE(
			&vm_page_queues[marker->queue].pl, marker, pageq);
		TAILQ_INSERT_AFTER(
			&vm_page_queues[marker->queue].pl, m, marker, pageq);

		/*
		 * Ignore markers and ignore pages that already have a swap
		 * assignment.
		 */
		if (m->flags & (PG_MARKER | PG_SWAPPED))
			continue;
		if (vm_page_busy_try(m, TRUE))
			continue;
		vm_page_queues_spin_unlock(marker->queue);

		if ((object = m->object) == NULL) {
			vm_page_wakeup(m);
			vm_page_queues_spin_lock(marker->queue);
			continue;
		}
		vm_object_hold(object);
		if (m->object != object) {
			vm_object_drop(object);
			vm_page_wakeup(m);
			vm_page_queues_spin_lock(marker->queue);
			continue;
		}
		if (vm_swapcache_test(m)) {
			vm_object_drop(object);
			vm_page_wakeup(m);
			vm_page_queues_spin_lock(marker->queue);
			continue;
		}

		vp = object->handle;
		if (vp == NULL) {
			vm_object_drop(object);
			vm_page_wakeup(m);
			vm_page_queues_spin_lock(marker->queue);
			continue;
		}

		switch(vp->v_type) {
		case VREG:
			/*
			 * PG_NOTMETA generically means 'don't swapcache this',
			 * and HAMMER will set this for regular data buffers
			 * (and leave it unset for meta-data buffers) as
			 * appropriate when double buffering is enabled.
			 */
			if (m->flags & PG_NOTMETA) {
				vm_object_drop(object);
				vm_page_wakeup(m);
				vm_page_queues_spin_lock(marker->queue);
				continue;
			}

			/*
			 * If data_enable is 0 do not try to swapcache data.
			 * If use_chflags is set then only swapcache data for
			 * VSWAPCACHE marked vnodes, otherwise any vnode.
			 */
			if (vm_swapcache_data_enable == 0 ||
			    ((vp->v_flag & VSWAPCACHE) == 0 &&
			     vm_swapcache_use_chflags)) {
				vm_object_drop(object);
				vm_page_wakeup(m);
				vm_page_queues_spin_lock(marker->queue);
				continue;
			}
			if (vm_swapcache_maxfilesize &&
			    object->size >
			    (vm_swapcache_maxfilesize >> PAGE_SHIFT)) {
				vm_object_drop(object);
				vm_page_wakeup(m);
				vm_page_queues_spin_lock(marker->queue);
				continue;
			}
			isblkdev = 0;
			break;
		case VCHR:
			/*
			 * PG_NOTMETA generically means 'don't swapcache this',
			 * and HAMMER will set this for regular data buffers
			 * (and leave it unset for meta-data buffers) as
			 * appropriate when double buffering is enabled.
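			 *
			 * For VCHR vnodes (filesystem meta-data via the
			 * block device) only meta_enable gates the write;
			 * the maxfilesize and use_chflags checks apply to
			 * the VREG case only.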
			 */
			if (m->flags & PG_NOTMETA) {
				vm_object_drop(object);
				vm_page_wakeup(m);
				vm_page_queues_spin_lock(marker->queue);
				continue;
			}
			if (vm_swapcache_meta_enable == 0) {
				vm_object_drop(object);
				vm_page_wakeup(m);
				vm_page_queues_spin_lock(marker->queue);
				continue;
			}
			isblkdev = 1;
			break;
		default:
			vm_object_drop(object);
			vm_page_wakeup(m);
			vm_page_queues_spin_lock(marker->queue);
			continue;
		}

		/*
		 * Assign swap and initiate I/O.
		 *
		 * (adjust for the --count which also occurs in the loop)
		 */
		count -= vm_swapcached_flush(m, isblkdev);

		/*
		 * Setup for next loop using marker.
		 */
		vm_object_drop(object);
		vm_page_queues_spin_lock(marker->queue);
	}

	/*
	 * The marker could wind up at the end, which is ok.  If we hit the
	 * end of the list adjust the heuristic.
	 *
	 * Earlier inactive pages that were dirty and become clean
	 * are typically moved to the end of PQ_INACTIVE by virtue
	 * of vfs_vmio_release() when they become unwired from the
	 * buffer cache.
	 */
	vm_page_queues_spin_unlock(marker->queue);

	/*
	 * m invalid but can be used to test for NULL
	 */
	return (m == NULL);
}

/*
 * Flush the specified page using the swap_pager.  The page
 * must be busied by the caller and its disposition will become
 * the responsibility of this function.
 *
 * Try to collect surrounding pages, including pages which may
 * have already been assigned swap.  Try to cluster within a
 * contiguous aligned SWAP_META_PAGES (typ 16 x PAGE_SIZE) block
 * to match what swap_pager_putpages() can do.
 *
 * We also want to try to match against the buffer cache blocksize
 * but we don't really know what it is here.  Since the buffer cache
 * wires and unwires pages in groups the fact that we skip wired pages
 * should be sufficient.
 *
 * Returns a count of pages we might have flushed (minimum 1)
 */
static
int
vm_swapcached_flush(vm_page_t m, int isblkdev)
{
	vm_object_t object;
	vm_page_t marray[SWAP_META_PAGES];
	vm_pindex_t basei;
	int rtvals[SWAP_META_PAGES];
	int x;
	int i;
	int j;
	int count;
	int error;

	vm_page_io_start(m);
	vm_page_protect(m, VM_PROT_READ);
	object = m->object;
	vm_object_hold(object);

	/*
	 * Try to cluster around (m), keeping in mind that the swap pager
	 * can only do SWAP_META_PAGES worth of contiguous write.
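	 *
	 * (x) below is m's offset within its naturally aligned
	 * SWAP_META_PAGES cluster, so (basei - x) is the pindex of the
	 * cluster's first page and (basei - x + i) walks the cluster slots.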
	 */
	x = (int)m->pindex & SWAP_META_MASK;
	marray[x] = m;
	basei = m->pindex;
	vm_page_wakeup(m);

	for (i = x - 1; i >= 0; --i) {
		m = vm_page_lookup_busy_try(object, basei - x + i,
					    TRUE, &error);
		if (error || m == NULL)
			break;
		if (vm_swapcache_test(m)) {
			vm_page_wakeup(m);
			break;
		}
		if (isblkdev && (m->flags & PG_NOTMETA)) {
			vm_page_wakeup(m);
			break;
		}
		vm_page_io_start(m);
		vm_page_protect(m, VM_PROT_READ);
		if (m->queue - m->pc == PQ_CACHE) {
			vm_page_unqueue_nowakeup(m);
			vm_page_deactivate(m);
		}
		marray[i] = m;
		vm_page_wakeup(m);
	}
	++i;

	for (j = x + 1; j < SWAP_META_PAGES; ++j) {
		m = vm_page_lookup_busy_try(object, basei - x + j,
					    TRUE, &error);
		if (error || m == NULL)
			break;
		if (vm_swapcache_test(m)) {
			vm_page_wakeup(m);
			break;
		}
		if (isblkdev && (m->flags & PG_NOTMETA)) {
			vm_page_wakeup(m);
			break;
		}
		vm_page_io_start(m);
		vm_page_protect(m, VM_PROT_READ);
		if (m->queue - m->pc == PQ_CACHE) {
			vm_page_unqueue_nowakeup(m);
			vm_page_deactivate(m);
		}
		marray[j] = m;
		vm_page_wakeup(m);
	}

	count = j - i;
	vm_object_pip_add(object, count);
	swap_pager_putpages(object, marray + i, count, FALSE, rtvals + i);
	vm_swapcache_write_count += count * PAGE_SIZE;
	vm_swapcache_curburst -= count * PAGE_SIZE;

	while (i < j) {
		if (rtvals[i] != VM_PAGER_PEND) {
			vm_page_busy_wait(marray[i], FALSE, "swppgfd");
			vm_page_io_finish(marray[i]);
			vm_page_wakeup(marray[i]);
			vm_object_pip_wakeup(object);
		}
		++i;
	}
	vm_object_drop(object);
	return(count);
}

/*
 * Test whether a VM page is suitable for writing to the swapcache.
 * Does not test m->queue, PG_MARKER, or PG_SWAPPED.
 *
 * Returns 0 on success, 1 on failure
 */
static int
vm_swapcache_test(vm_page_t m)
{
	vm_object_t object;

	if (m->flags & (PG_UNQUEUED | PG_FICTITIOUS))
		return(1);
	if (m->hold_count || m->wire_count)
		return(1);
	if (m->valid != VM_PAGE_BITS_ALL)
		return(1);
	if (m->dirty & m->valid)
		return(1);
	if ((object = m->object) == NULL)
		return(1);
	if (object->type != OBJT_VNODE ||
	    (object->flags & OBJ_DEAD)) {
		return(1);
	}
	vm_page_test_dirty(m);
	if (m->dirty & m->valid)
		return(1);
	return(0);
}

/*
 * Cleaning pass.
 *
 * We clean whole objects, up to vm_swapcache_cleanperobj (default 16MB)
 * of swap per object per pass.
 */
static
void
vm_swapcache_cleaning(swmarker_t *marker, struct vm_object_hash **swindexp)
{
	vm_object_t object;
	struct vnode *vp;
	int count;
	int scount;
	int n;
	int didmove;

	count = vm_swapcache_maxlaunder;
	scount = vm_swapcache_maxscan;

	/*
	 * Look for vnode objects
	 */
	lwkt_gettoken(&(*swindexp)->token);

	didmove = 0;
outerloop:
	while ((object = TAILQ_NEXT(&marker->dummy_obj,
				    object_entry)) != NULL) {
		/*
		 * We have to skip markers.  We cannot hold/drop marker
		 * objects!
		 */
		if (object->type == OBJT_MARKER) {
			vm_swapcache_movemarker(marker, *swindexp, object);
			didmove = 1;
			continue;
		}

		/*
		 * Safety, or in case there are millions of VM objects
		 * without swapcache backing.
		 */
		if (--scount <= 0)
			goto breakout;

		/*
		 * We must hold the object before potentially yielding.
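		 *
		 * The hold also keeps the object valid while we drop the
		 * hash token around the swap_pager_condfree() call below.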
		 */
		vm_object_hold(object);
		lwkt_yield();

		/*
		 * Only operate on live VNODE objects that are either
		 * VREG or VCHR (VCHR for meta-data).
		 */
		if ((object->type != OBJT_VNODE) ||
		    ((object->flags & OBJ_DEAD) ||
		     object->swblock_count == 0) ||
		    ((vp = object->handle) == NULL) ||
		    (vp->v_type != VREG && vp->v_type != VCHR)) {
			vm_object_drop(object);
			/* object may be invalid now */
			vm_swapcache_movemarker(marker, *swindexp, object);
			didmove = 1;
			continue;
		}

		/*
		 * Reset the object pindex stored in the marker if the
		 * working object has changed.
		 */
		if (marker->save_obj != object || didmove) {
			marker->dummy_obj.size = 0;
			marker->save_off = 0;
			marker->save_obj = object;
			didmove = 0;
		}

		/*
		 * Look for swblocks starting at our iterator.
		 *
		 * The swap_pager_condfree() function attempts to free
		 * swap space starting at the specified index.  The index
		 * will be updated on return.  The function will return
		 * a scan factor (NOT the number of blocks freed).
		 *
		 * If it must cut its scan of the object short due to an
		 * excessive number of swblocks, or is able to free the
		 * requested number of blocks, it will return n >= count
		 * and we break and pick it back up on a future attempt.
		 *
		 * Scan the object linearly and try to batch large sets of
		 * blocks that are likely to clean out entire swap radix
		 * tree leafs.
		 */
		lwkt_token_swap();
		lwkt_reltoken(&(*swindexp)->token);

		n = swap_pager_condfree(object, &marker->dummy_obj.size,
					(count + SWAP_META_MASK) &
					~SWAP_META_MASK);

		vm_object_drop(object);		/* object may be invalid now */
		lwkt_gettoken(&(*swindexp)->token);

		/*
		 * If we have exhausted the object or hit our per-pass
		 * page limit then move us to the next object.  Note that
		 * the current object may no longer be on the hash list.
		 */
		if (n <= 0 ||
		    marker->save_off > vm_swapcache_cleanperobj) {
			vm_swapcache_movemarker(marker, *swindexp, object);
			didmove = 1;
		}

		/*
		 * If we have exhausted our max-launder stop for now.
		 */
		count -= n;
		marker->save_off += n * PAGE_SIZE;
		if (count < 0)
			goto breakout;
	}

	/*
	 * Iterate vm_object_hash[] hash table
	 */
	TAILQ_REMOVE(&(*swindexp)->list, &marker->dummy_obj, object_entry);
	lwkt_reltoken(&(*swindexp)->token);
	if (++*swindexp >= &vm_object_hash[VMOBJ_HSIZE])
		*swindexp = &vm_object_hash[0];
	lwkt_gettoken(&(*swindexp)->token);
	TAILQ_INSERT_HEAD(&(*swindexp)->list, &marker->dummy_obj, object_entry);

	if (*swindexp != &vm_object_hash[0])
		goto outerloop;

breakout:
	lwkt_reltoken(&(*swindexp)->token);
}

/*
 * Move the marker past the current object.  Object can be stale, but we
 * still need it to determine if the marker has to be moved.  If the object
 * is still the 'current object' (object after the marker), we hop-scotch
 * the marker past it.
 */
static void
vm_swapcache_movemarker(swmarker_t *marker, struct vm_object_hash *swindex,
			vm_object_t object)
{
	if (TAILQ_NEXT(&marker->dummy_obj, object_entry) == object) {
		TAILQ_REMOVE(&swindex->list, &marker->dummy_obj, object_entry);
		TAILQ_INSERT_AFTER(&swindex->list, object,
				   &marker->dummy_obj, object_entry);
	}
}