xref: /dragonfly/sys/vm/vm_swapcache.c (revision f9993810)
1 /*
2  * (MPSAFE)
3  *
4  * Copyright (c) 2010,2019 The DragonFly Project.  All rights reserved.
5  *
6  * This code is derived from software contributed to The DragonFly Project
7  * by Matthew Dillon <dillon@backplane.com>
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 
37 /*
38  * Implement the swapcache daemon.  When enabled swap is assumed to be
39  * configured on a fast storage device such as a SSD.  Swap is assigned
40  * to clean vnode-backed pages in the inactive queue, clustered by object
41  * if possible, and written out.  The swap assignment sticks around even
42  * after the underlying pages have been recycled.
43  *
44  * The daemon manages write bandwidth based on sysctl settings to control
45  * wear on the SSD.
46  *
47  * The vnode strategy code will check for the swap assignments and divert
48  * reads to the swap device when the data is present in the swapcache.
49  *
50  * This operates on both regular files and the block device vnodes used by
51  * filesystems to manage meta-data.
52  */
53 
54 #include <sys/param.h>
55 #include <sys/systm.h>
56 #include <sys/kernel.h>
57 #include <sys/proc.h>
58 #include <sys/kthread.h>
59 #include <sys/resourcevar.h>
60 #include <sys/signalvar.h>
61 #include <sys/vnode.h>
62 #include <sys/vmmeter.h>
63 #include <sys/sysctl.h>
64 #include <sys/eventhandler.h>
65 
66 #include <vm/vm.h>
67 #include <vm/vm_param.h>
68 #include <sys/lock.h>
69 #include <vm/vm_object.h>
70 #include <vm/vm_page.h>
71 #include <vm/vm_map.h>
72 #include <vm/vm_pageout.h>
73 #include <vm/vm_pager.h>
74 #include <vm/swap_pager.h>
75 #include <vm/vm_extern.h>
76 
77 #include <sys/spinlock2.h>
78 #include <vm/vm_page2.h>
79 
80 struct swmarker {
81 	struct vm_object dummy_obj;
82 	struct vm_object *save_obj;
83 	vm_ooffset_t save_off;
84 };
85 
86 typedef struct swmarker swmarker_t;
87 
88 /* the kernel process "vm_pageout"*/
89 static int vm_swapcached_flush (vm_page_t m, int isblkdev);
90 static int vm_swapcache_test(vm_page_t m);
91 static int vm_swapcache_writing_heuristic(void);
92 static int vm_swapcache_writing(vm_page_t marker, int count, int scount);
93 static void vm_swapcache_cleaning(swmarker_t *marker,
94 			struct vm_object_hash **swindexp);
95 static void vm_swapcache_movemarker(swmarker_t *marker,
96 			struct vm_object_hash *swindex, vm_object_t object);
97 struct thread *swapcached_thread;
98 
99 SYSCTL_NODE(_vm, OID_AUTO, swapcache, CTLFLAG_RW, NULL, NULL);
100 
101 int vm_swapcache_read_enable;
102 static long vm_swapcache_wtrigger;
103 static int vm_swapcache_sleep;
104 static int vm_swapcache_maxscan = PQ_L2_SIZE * 8;
105 static int vm_swapcache_maxlaunder = PQ_L2_SIZE * 4;
106 static int vm_swapcache_data_enable = 0;
107 static int vm_swapcache_meta_enable = 0;
108 static int vm_swapcache_maxswappct = 75;
109 static int vm_swapcache_hysteresis;
110 static int vm_swapcache_min_hysteresis;
111 int vm_swapcache_use_chflags = 0;	/* require chflags cache */
112 static int64_t vm_swapcache_minburst = 10000000LL;	/* 10MB */
113 static int64_t vm_swapcache_curburst = 4000000000LL;	/* 4G after boot */
114 static int64_t vm_swapcache_maxburst = 2000000000LL;	/* 2G nominal max */
115 static int64_t vm_swapcache_accrate = 100000LL;		/* 100K/s */
116 static int64_t vm_swapcache_write_count;
117 static int64_t vm_swapcache_maxfilesize;
118 static int64_t vm_swapcache_cleanperobj = 16*1024*1024;
119 
120 SYSCTL_INT(_vm_swapcache, OID_AUTO, maxlaunder,
121 	CTLFLAG_RW, &vm_swapcache_maxlaunder, 0, "");
122 SYSCTL_INT(_vm_swapcache, OID_AUTO, maxscan,
123 	CTLFLAG_RW, &vm_swapcache_maxscan, 0, "");
124 
125 SYSCTL_INT(_vm_swapcache, OID_AUTO, data_enable,
126 	CTLFLAG_RW, &vm_swapcache_data_enable, 0, "");
127 SYSCTL_INT(_vm_swapcache, OID_AUTO, meta_enable,
128 	CTLFLAG_RW, &vm_swapcache_meta_enable, 0, "");
129 SYSCTL_INT(_vm_swapcache, OID_AUTO, read_enable,
130 	CTLFLAG_RW, &vm_swapcache_read_enable, 0, "");
131 SYSCTL_INT(_vm_swapcache, OID_AUTO, maxswappct,
132 	CTLFLAG_RW, &vm_swapcache_maxswappct, 0, "");
133 SYSCTL_INT(_vm_swapcache, OID_AUTO, hysteresis,
134 	CTLFLAG_RD, &vm_swapcache_hysteresis, 0, "");
135 SYSCTL_INT(_vm_swapcache, OID_AUTO, min_hysteresis,
136 	CTLFLAG_RW, &vm_swapcache_min_hysteresis, 0, "");
137 SYSCTL_INT(_vm_swapcache, OID_AUTO, use_chflags,
138 	CTLFLAG_RW, &vm_swapcache_use_chflags, 0, "");
139 
140 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, minburst,
141 	CTLFLAG_RW, &vm_swapcache_minburst, 0, "");
142 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, curburst,
143 	CTLFLAG_RW, &vm_swapcache_curburst, 0, "");
144 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxburst,
145 	CTLFLAG_RW, &vm_swapcache_maxburst, 0, "");
146 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxfilesize,
147 	CTLFLAG_RW, &vm_swapcache_maxfilesize, 0, "");
148 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, accrate,
149 	CTLFLAG_RW, &vm_swapcache_accrate, 0, "");
150 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, write_count,
151 	CTLFLAG_RW, &vm_swapcache_write_count, 0, "");
152 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, cleanperobj,
153 	CTLFLAG_RW, &vm_swapcache_cleanperobj, 0, "");
154 
155 #define SWAPMAX(adj)	\
156 	((int64_t)vm_swap_max * (vm_swapcache_maxswappct + (adj)) / 100)
157 
158 /*
159  * When shutting down the machine we want to stop swapcache operation
160  * immediately so swap is not accessed after devices have been shuttered.
161  */
162 static void
163 shutdown_swapcache(void *arg __unused)
164 {
165 	vm_swapcache_read_enable = 0;
166 	vm_swapcache_data_enable = 0;
167 	vm_swapcache_meta_enable = 0;
168 	wakeup(&vm_swapcache_sleep);	/* shortcut 5-second wait */
169 }
170 
171 /*
172  * vm_swapcached is the high level pageout daemon.
173  *
174  * No requirements.
175  */
176 static void
177 vm_swapcached_thread(void)
178 {
179 	enum { SWAPC_WRITING, SWAPC_CLEANING } state = SWAPC_WRITING;
180 	enum { SWAPB_BURSTING, SWAPB_RECOVERING } burst = SWAPB_BURSTING;
181 	static struct vm_page page_marker[PQ_L2_SIZE];
182 	static swmarker_t swmarker;
183 	static struct vm_object_hash *swindex;
184 	int q;
185 
186 	/*
187 	 * Thread setup
188 	 */
189 	curthread->td_flags |= TDF_SYSTHREAD;
190 	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc,
191 			      swapcached_thread, SHUTDOWN_PRI_FIRST);
192 	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_swapcache,
193 			      NULL, SHUTDOWN_PRI_SECOND);
194 
195 	/*
196 	 * Initialize our marker for the inactive scan (SWAPC_WRITING)
197 	 */
198 	bzero(&page_marker, sizeof(page_marker));
199 	for (q = 0; q < PQ_L2_SIZE; ++q) {
200 		page_marker[q].flags = PG_FICTITIOUS | PG_MARKER;
201 		page_marker[q].busy_count = PBUSY_LOCKED;
202 		page_marker[q].queue = PQ_INACTIVE + q;
203 		page_marker[q].pc = q;
204 		page_marker[q].wire_count = 1;
205 		vm_page_queues_spin_lock(PQ_INACTIVE + q);
206 		TAILQ_INSERT_HEAD(
207 			&vm_page_queues[PQ_INACTIVE + q].pl,
208 			&page_marker[q], pageq);
209 		vm_page_queues_spin_unlock(PQ_INACTIVE + q);
210 	}
211 
212 	vm_swapcache_min_hysteresis = 1024;
213 	vm_swapcache_hysteresis = vm_swapcache_min_hysteresis;
214 	vm_swapcache_wtrigger = -vm_swapcache_hysteresis;
215 
216 	/*
217 	 * Initialize our marker for the vm_object scan (SWAPC_CLEANING)
218 	 */
219 	bzero(&swmarker, sizeof(swmarker));
220 	swmarker.dummy_obj.type = OBJT_MARKER;
221 	swindex = &vm_object_hash[0];
222 	lwkt_gettoken(&swindex->token);
223 	TAILQ_INSERT_HEAD(&swindex->list, &swmarker.dummy_obj, object_entry);
224 	lwkt_reltoken(&swindex->token);
225 
226 	for (;;) {
227 		int reached_end;
228 		int scount;
229 		int count;
230 
231 		/*
232 		 * Handle shutdown
233 		 */
234 		kproc_suspend_loop();
235 
236 		/*
237 		 * Check every 5 seconds when not enabled or if no swap
238 		 * is present.
239 		 */
240 		if ((vm_swapcache_data_enable == 0 &&
241 		     vm_swapcache_meta_enable == 0 &&
242 		     vm_swap_cache_use <= SWAPMAX(0)) ||
243 		    vm_swap_max == 0) {
244 			tsleep(&vm_swapcache_sleep, 0, "csleep", hz * 5);
245 			continue;
246 		}
247 
248 		/*
249 		 * Polling rate when enabled is approximately 10 hz.
250 		 */
251 		tsleep(&vm_swapcache_sleep, 0, "csleep", hz / 10);
252 
253 		/*
254 		 * State hysteresis.  Generate write activity up to 75% of
255 		 * swap, then clean out swap assignments down to 70%, then
256 		 * repeat.
257 		 */
258 		if (state == SWAPC_WRITING) {
259 			if (vm_swap_cache_use > SWAPMAX(0))
260 				state = SWAPC_CLEANING;
261 		} else {
262 			if (vm_swap_cache_use < SWAPMAX(-10))
263 				state = SWAPC_WRITING;
264 		}
265 
266 		/*
267 		 * We are allowed to continue accumulating burst value
268 		 * in either state.  Allow the user to set curburst > maxburst
269 		 * for the initial load-in.
270 		 */
271 		if (vm_swapcache_curburst < vm_swapcache_maxburst) {
272 			vm_swapcache_curburst += vm_swapcache_accrate / 10;
273 			if (vm_swapcache_curburst > vm_swapcache_maxburst)
274 				vm_swapcache_curburst = vm_swapcache_maxburst;
275 		}
276 
277 		/*
278 		 * We don't want to nickle-and-dime the scan as that will
279 		 * create unnecessary fragmentation.  The minimum burst
280 		 * is one-seconds worth of accumulation.
281 		 */
282 		if (state != SWAPC_WRITING) {
283 			vm_swapcache_cleaning(&swmarker, &swindex);
284 			continue;
285 		}
286 		if (vm_swapcache_curburst < vm_swapcache_accrate)
287 			continue;
288 
289 		reached_end = 0;
290 		count = vm_swapcache_maxlaunder / PQ_L2_SIZE + 2;
291 		scount = vm_swapcache_maxscan / PQ_L2_SIZE + 2;
292 
293 		if (burst == SWAPB_BURSTING) {
294 			if (vm_swapcache_writing_heuristic()) {
295 				for (q = 0; q < PQ_L2_SIZE; ++q) {
296 					reached_end +=
297 						vm_swapcache_writing(
298 							&page_marker[q],
299 							count,
300 							scount);
301 				}
302 			}
303 			if (vm_swapcache_curburst <= 0)
304 				burst = SWAPB_RECOVERING;
305 		} else if (vm_swapcache_curburst > vm_swapcache_minburst) {
306 			if (vm_swapcache_writing_heuristic()) {
307 				for (q = 0; q < PQ_L2_SIZE; ++q) {
308 					reached_end +=
309 						vm_swapcache_writing(
310 							&page_marker[q],
311 							count,
312 							scount);
313 				}
314 			}
315 			burst = SWAPB_BURSTING;
316 		}
317 		if (reached_end == PQ_L2_SIZE) {
318 			vm_swapcache_wtrigger = -vm_swapcache_hysteresis;
319 		}
320 	}
321 
322 	/*
323 	 * Cleanup (NOT REACHED)
324 	 */
325 	for (q = 0; q < PQ_L2_SIZE; ++q) {
326 		vm_page_queues_spin_lock(PQ_INACTIVE + q);
327 		TAILQ_REMOVE(
328 			&vm_page_queues[PQ_INACTIVE + q].pl,
329 			&page_marker[q], pageq);
330 		vm_page_queues_spin_unlock(PQ_INACTIVE + q);
331 	}
332 
333 	lwkt_gettoken(&swindex->token);
334 	TAILQ_REMOVE(&swindex->list, &swmarker.dummy_obj, object_entry);
335 	lwkt_reltoken(&swindex->token);
336 }
337 
338 static struct kproc_desc swpc_kp = {
339 	"swapcached",
340 	vm_swapcached_thread,
341 	&swapcached_thread
342 };
343 SYSINIT(swapcached, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &swpc_kp);
344 
345 /*
346  * Deal with an overflow of the heuristic counter or if the user
347  * manually changes the hysteresis.
348  *
349  * Try to avoid small incremental pageouts by waiting for enough
350  * pages to buildup in the inactive queue to hopefully get a good
351  * burst in.  This heuristic is bumped by the VM system and reset
352  * when our scan hits the end of the queue.
353  *
354  * Return TRUE if we need to take a writing pass.
355  */
356 static int
357 vm_swapcache_writing_heuristic(void)
358 {
359 	int hyst;
360 	int q;
361 	long adds;
362 
363 	hyst = vmstats.v_inactive_count / 4;
364 	if (hyst < vm_swapcache_min_hysteresis)
365 		hyst = vm_swapcache_min_hysteresis;
366 	cpu_ccfence();
367 	vm_swapcache_hysteresis = hyst;
368 
369 	adds = 0;
370 	for (q = PQ_INACTIVE; q < PQ_INACTIVE + PQ_L2_SIZE; ++q) {
371 		adds += atomic_swap_long(&vm_page_queues[q].adds, 0);
372 	}
373 	vm_swapcache_wtrigger += adds;
374 	if (vm_swapcache_wtrigger < -hyst)
375 		vm_swapcache_wtrigger = -hyst;
376 	return (vm_swapcache_wtrigger >= 0);
377 }
378 
379 /*
380  * Take a writing pass on one of the inactive queues, return non-zero if
381  * we hit the end of the queue.
382  */
383 static int
384 vm_swapcache_writing(vm_page_t marker, int count, int scount)
385 {
386 	vm_object_t object;
387 	struct vnode *vp;
388 	vm_page_t m;
389 	int isblkdev;
390 
391 	/*
392 	 * Scan the inactive queue from our marker to locate
393 	 * suitable pages to push to the swap cache.
394 	 *
395 	 * We are looking for clean vnode-backed pages.
396 	 */
397 	vm_page_queues_spin_lock(marker->queue);
398 	while ((m = TAILQ_NEXT(marker, pageq)) != NULL &&
399 	       count > 0 && scount-- > 0) {
400 		KKASSERT(m->queue == marker->queue);
401 
402 		/*
403 		 * Stop using swap if paniced, dumping, or dumped.
404 		 * Don't try to write if our curburst has been exhausted.
405 		 */
406 		if (panicstr || dumping)
407 			break;
408 		if (vm_swapcache_curburst < 0)
409 			break;
410 
411 		/*
412 		 * Move marker
413 		 */
414 		TAILQ_REMOVE(
415 			&vm_page_queues[marker->queue].pl, marker, pageq);
416 		TAILQ_INSERT_AFTER(
417 			&vm_page_queues[marker->queue].pl, m, marker, pageq);
418 
419 		/*
420 		 * Ignore markers and ignore pages that already have a swap
421 		 * assignment.
422 		 */
423 		if (m->flags & (PG_MARKER | PG_SWAPPED))
424 			continue;
425 		if (vm_page_busy_try(m, TRUE))
426 			continue;
427 		vm_page_queues_spin_unlock(marker->queue);
428 
429 		if ((object = m->object) == NULL) {
430 			vm_page_wakeup(m);
431 			vm_page_queues_spin_lock(marker->queue);
432 			continue;
433 		}
434 		vm_object_hold(object);
435 		if (m->object != object) {
436 			vm_object_drop(object);
437 			vm_page_wakeup(m);
438 			vm_page_queues_spin_lock(marker->queue);
439 			continue;
440 		}
441 		if (vm_swapcache_test(m)) {
442 			vm_object_drop(object);
443 			vm_page_wakeup(m);
444 			vm_page_queues_spin_lock(marker->queue);
445 			continue;
446 		}
447 
448 		vp = object->handle;
449 		if (vp == NULL) {
450 			vm_object_drop(object);
451 			vm_page_wakeup(m);
452 			vm_page_queues_spin_lock(marker->queue);
453 			continue;
454 		}
455 
456 		switch(vp->v_type) {
457 		case VREG:
458 			/*
459 			 * PG_NOTMETA generically means 'don't swapcache this',
460 			 * and HAMMER will set this for regular data buffers
461 			 * (and leave it unset for meta-data buffers) as
462 			 * appropriate when double buffering is enabled.
463 			 */
464 			if (m->flags & PG_NOTMETA) {
465 				vm_object_drop(object);
466 				vm_page_wakeup(m);
467 				vm_page_queues_spin_lock(marker->queue);
468 				continue;
469 			}
470 
471 			/*
472 			 * If data_enable is 0 do not try to swapcache data.
473 			 * If use_chflags is set then only swapcache data for
474 			 * VSWAPCACHE marked vnodes, otherwise any vnode.
475 			 */
476 			if (vm_swapcache_data_enable == 0 ||
477 			    ((vp->v_flag & VSWAPCACHE) == 0 &&
478 			     vm_swapcache_use_chflags)) {
479 				vm_object_drop(object);
480 				vm_page_wakeup(m);
481 				vm_page_queues_spin_lock(marker->queue);
482 				continue;
483 			}
484 			if (vm_swapcache_maxfilesize &&
485 			    object->size >
486 			    (vm_swapcache_maxfilesize >> PAGE_SHIFT)) {
487 				vm_object_drop(object);
488 				vm_page_wakeup(m);
489 				vm_page_queues_spin_lock(marker->queue);
490 				continue;
491 			}
492 			isblkdev = 0;
493 			break;
494 		case VCHR:
495 			/*
496 			 * PG_NOTMETA generically means 'don't swapcache this',
497 			 * and HAMMER will set this for regular data buffers
498 			 * (and leave it unset for meta-data buffers) as
499 			 * appropriate when double buffering is enabled.
500 			 */
501 			if (m->flags & PG_NOTMETA) {
502 				vm_object_drop(object);
503 				vm_page_wakeup(m);
504 				vm_page_queues_spin_lock(marker->queue);
505 				continue;
506 			}
507 			if (vm_swapcache_meta_enable == 0) {
508 				vm_object_drop(object);
509 				vm_page_wakeup(m);
510 				vm_page_queues_spin_lock(marker->queue);
511 				continue;
512 			}
513 			isblkdev = 1;
514 			break;
515 		default:
516 			vm_object_drop(object);
517 			vm_page_wakeup(m);
518 			vm_page_queues_spin_lock(marker->queue);
519 			continue;
520 		}
521 
522 
523 		/*
524 		 * Assign swap and initiate I/O.
525 		 *
526 		 * (adjust for the --count which also occurs in the loop)
527 		 */
528 		count -= vm_swapcached_flush(m, isblkdev);
529 
530 		/*
531 		 * Setup for next loop using marker.
532 		 */
533 		vm_object_drop(object);
534 		vm_page_queues_spin_lock(marker->queue);
535 	}
536 
537 	/*
538 	 * The marker could wind up at the end, which is ok.  If we hit the
539 	 * end of the list adjust the heuristic.
540 	 *
541 	 * Earlier inactive pages that were dirty and become clean
542 	 * are typically moved to the end of PQ_INACTIVE by virtue
543 	 * of vfs_vmio_release() when they become unwired from the
544 	 * buffer cache.
545 	 */
546 	vm_page_queues_spin_unlock(marker->queue);
547 
548 	/*
549 	 * m invalid but can be used to test for NULL
550 	 */
551 	return (m == NULL);
552 }
553 
554 /*
555  * Flush the specified page using the swap_pager.  The page
556  * must be busied by the caller and its disposition will become
557  * the responsibility of this function.
558  *
559  * Try to collect surrounding pages, including pages which may
560  * have already been assigned swap.  Try to cluster within a
561  * contiguous aligned SMAP_META_PAGES (typ 16 x PAGE_SIZE) block
562  * to match what swap_pager_putpages() can do.
563  *
564  * We also want to try to match against the buffer cache blocksize
565  * but we don't really know what it is here.  Since the buffer cache
566  * wires and unwires pages in groups the fact that we skip wired pages
567  * should be sufficient.
568  *
569  * Returns a count of pages we might have flushed (minimum 1)
570  */
571 static
572 int
573 vm_swapcached_flush(vm_page_t m, int isblkdev)
574 {
575 	vm_object_t object;
576 	vm_page_t marray[SWAP_META_PAGES];
577 	vm_pindex_t basei;
578 	int rtvals[SWAP_META_PAGES];
579 	int x;
580 	int i;
581 	int j;
582 	int count;
583 	int error;
584 
585 	vm_page_io_start(m);
586 	vm_page_protect(m, VM_PROT_READ);
587 	object = m->object;
588 	vm_object_hold(object);
589 
590 	/*
591 	 * Try to cluster around (m), keeping in mind that the swap pager
592 	 * can only do SMAP_META_PAGES worth of continguous write.
593 	 */
594 	x = (int)m->pindex & SWAP_META_MASK;
595 	marray[x] = m;
596 	basei = m->pindex;
597 	vm_page_wakeup(m);
598 
599 	for (i = x - 1; i >= 0; --i) {
600 		m = vm_page_lookup_busy_try(object, basei - x + i,
601 					    TRUE, &error);
602 		if (error || m == NULL)
603 			break;
604 		if (vm_swapcache_test(m)) {
605 			vm_page_wakeup(m);
606 			break;
607 		}
608 		if (isblkdev && (m->flags & PG_NOTMETA)) {
609 			vm_page_wakeup(m);
610 			break;
611 		}
612 		vm_page_io_start(m);
613 		vm_page_protect(m, VM_PROT_READ);
614 		if (m->queue - m->pc == PQ_CACHE) {
615 			vm_page_unqueue_nowakeup(m);
616 			vm_page_deactivate(m);
617 		}
618 		marray[i] = m;
619 		vm_page_wakeup(m);
620 	}
621 	++i;
622 
623 	for (j = x + 1; j < SWAP_META_PAGES; ++j) {
624 		m = vm_page_lookup_busy_try(object, basei - x + j,
625 					    TRUE, &error);
626 		if (error || m == NULL)
627 			break;
628 		if (vm_swapcache_test(m)) {
629 			vm_page_wakeup(m);
630 			break;
631 		}
632 		if (isblkdev && (m->flags & PG_NOTMETA)) {
633 			vm_page_wakeup(m);
634 			break;
635 		}
636 		vm_page_io_start(m);
637 		vm_page_protect(m, VM_PROT_READ);
638 		if (m->queue - m->pc == PQ_CACHE) {
639 			vm_page_unqueue_nowakeup(m);
640 			vm_page_deactivate(m);
641 		}
642 		marray[j] = m;
643 		vm_page_wakeup(m);
644 	}
645 
646 	count = j - i;
647 	vm_object_pip_add(object, count);
648 	swap_pager_putpages(object, marray + i, count, FALSE, rtvals + i);
649 	vm_swapcache_write_count += count * PAGE_SIZE;
650 	vm_swapcache_curburst -= count * PAGE_SIZE;
651 
652 	while (i < j) {
653 		if (rtvals[i] != VM_PAGER_PEND) {
654 			vm_page_busy_wait(marray[i], FALSE, "swppgfd");
655 			vm_page_io_finish(marray[i]);
656 			vm_page_wakeup(marray[i]);
657 			vm_object_pip_wakeup(object);
658 		}
659 		++i;
660 	}
661 	vm_object_drop(object);
662 	return(count);
663 }
664 
665 /*
666  * Test whether a VM page is suitable for writing to the swapcache.
667  * Does not test m->queue, PG_MARKER, or PG_SWAPPED.
668  *
669  * Returns 0 on success, 1 on failure
670  */
671 static int
672 vm_swapcache_test(vm_page_t m)
673 {
674 	vm_object_t object;
675 
676 	if (m->flags & (PG_UNQUEUED | PG_FICTITIOUS))
677 		return(1);
678 	if (m->hold_count || m->wire_count)
679 		return(1);
680 	if (m->valid != VM_PAGE_BITS_ALL)
681 		return(1);
682 	if (m->dirty & m->valid)
683 		return(1);
684 	if ((object = m->object) == NULL)
685 		return(1);
686 	if (object->type != OBJT_VNODE ||
687 	    (object->flags & OBJ_DEAD)) {
688 		return(1);
689 	}
690 	vm_page_test_dirty(m);
691 	if (m->dirty & m->valid)
692 		return(1);
693 	return(0);
694 }
695 
696 /*
697  * Cleaning pass.
698  *
699  * We clean whole objects up to 16MB
700  */
701 static
702 void
703 vm_swapcache_cleaning(swmarker_t *marker, struct vm_object_hash **swindexp)
704 {
705 	vm_object_t object;
706 	struct vnode *vp;
707 	int count;
708 	int scount;
709 	int n;
710 	int didmove;
711 
712 	count = vm_swapcache_maxlaunder;
713 	scount = vm_swapcache_maxscan;
714 
715 	/*
716 	 * Look for vnode objects
717 	 */
718 	lwkt_gettoken(&(*swindexp)->token);
719 
720 	didmove = 0;
721 outerloop:
722 	while ((object = TAILQ_NEXT(&marker->dummy_obj,
723 				    object_entry)) != NULL) {
724 		/*
725 		 * We have to skip markers.  We cannot hold/drop marker
726 		 * objects!
727 		 */
728 		if (object->type == OBJT_MARKER) {
729 			vm_swapcache_movemarker(marker, *swindexp, object);
730 			didmove = 1;
731 			continue;
732 		}
733 
734 		/*
735 		 * Safety, or in case there are millions of VM objects
736 		 * without swapcache backing.
737 		 */
738 		if (--scount <= 0)
739 			goto breakout;
740 
741 		/*
742 		 * We must hold the object before potentially yielding.
743 		 */
744 		vm_object_hold(object);
745 		lwkt_yield();
746 
747 		/*
748 		 * Only operate on live VNODE objects that are either
749 		 * VREG or VCHR (VCHR for meta-data).
750 		 */
751 		if ((object->type != OBJT_VNODE) ||
752 		    ((object->flags & OBJ_DEAD) ||
753 		     object->swblock_count == 0) ||
754 		    ((vp = object->handle) == NULL) ||
755 		    (vp->v_type != VREG && vp->v_type != VCHR)) {
756 			vm_object_drop(object);
757 			/* object may be invalid now */
758 			vm_swapcache_movemarker(marker, *swindexp, object);
759 			didmove = 1;
760 			continue;
761 		}
762 
763 		/*
764 		 * Reset the object pindex stored in the marker if the
765 		 * working object has changed.
766 		 */
767 		if (marker->save_obj != object || didmove) {
768 			marker->dummy_obj.size = 0;
769 			marker->save_off = 0;
770 			marker->save_obj = object;
771 			didmove = 0;
772 		}
773 
774 		/*
775 		 * Look for swblocks starting at our iterator.
776 		 *
777 		 * The swap_pager_condfree() function attempts to free
778 		 * swap space starting at the specified index.  The index
779 		 * will be updated on return.  The function will return
780 		 * a scan factor (NOT the number of blocks freed).
781 		 *
782 		 * If it must cut its scan of the object short due to an
783 		 * excessive number of swblocks, or is able to free the
784 		 * requested number of blocks, it will return n >= count
785 		 * and we break and pick it back up on a future attempt.
786 		 *
787 		 * Scan the object linearly and try to batch large sets of
788 		 * blocks that are likely to clean out entire swap radix
789 		 * tree leafs.
790 		 */
791 		lwkt_token_swap();
792 		lwkt_reltoken(&(*swindexp)->token);
793 
794 		n = swap_pager_condfree(object, &marker->dummy_obj.size,
795 				    (count + SWAP_META_MASK) & ~SWAP_META_MASK);
796 
797 		vm_object_drop(object);		/* object may be invalid now */
798 		lwkt_gettoken(&(*swindexp)->token);
799 
800 		/*
801 		 * If we have exhausted the object or deleted our per-pass
802 		 * page limit then move us to the next object.  Note that
803 		 * the current object may no longer be on the vm_object_entry.
804 		 */
805 		if (n <= 0 ||
806 		    marker->save_off > vm_swapcache_cleanperobj) {
807 			vm_swapcache_movemarker(marker, *swindexp, object);
808 			didmove = 1;
809 		}
810 
811 		/*
812 		 * If we have exhausted our max-launder stop for now.
813 		 */
814 		count -= n;
815 		marker->save_off += n * PAGE_SIZE;
816 		if (count < 0)
817 			goto breakout;
818 	}
819 
820 	/*
821 	 * Iterate vm_object_hash[] hash table
822 	 */
823 	TAILQ_REMOVE(&(*swindexp)->list, &marker->dummy_obj, object_entry);
824 	lwkt_reltoken(&(*swindexp)->token);
825 	if (++*swindexp >= &vm_object_hash[VMOBJ_HSIZE])
826 		*swindexp = &vm_object_hash[0];
827 	lwkt_gettoken(&(*swindexp)->token);
828 	TAILQ_INSERT_HEAD(&(*swindexp)->list, &marker->dummy_obj, object_entry);
829 
830 	if (*swindexp != &vm_object_hash[0])
831 		goto outerloop;
832 
833 breakout:
834 	lwkt_reltoken(&(*swindexp)->token);
835 }
836 
837 /*
838  * Move the marker past the current object.  Object can be stale, but we
839  * still need it to determine if the marker has to be moved.  If the object
840  * is still the 'current object' (object after the marker), we hop-scotch
841  * the marker past it.
842  */
843 static void
844 vm_swapcache_movemarker(swmarker_t *marker, struct vm_object_hash *swindex,
845 			vm_object_t object)
846 {
847 	if (TAILQ_NEXT(&marker->dummy_obj, object_entry) == object) {
848 		TAILQ_REMOVE(&swindex->list, &marker->dummy_obj, object_entry);
849 		TAILQ_INSERT_AFTER(&swindex->list, object,
850 				   &marker->dummy_obj, object_entry);
851 	}
852 }
853