xref: /dragonfly/sys/vm/swap_pager.c (revision 3170ffd7)
1 /*
2  * (MPSAFE)
3  *
4  * Copyright (c) 1998-2010 The DragonFly Project.  All rights reserved.
5  *
6  * This code is derived from software contributed to The DragonFly Project
7  * by Matthew Dillon <dillon@backplane.com>
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  * Copyright (c) 1994 John S. Dyson
37  * Copyright (c) 1990 University of Utah.
38  * Copyright (c) 1991, 1993
39  *	The Regents of the University of California.  All rights reserved.
40  *
41  * This code is derived from software contributed to Berkeley by
42  * the Systems Programming Group of the University of Utah Computer
43  * Science Department.
44  *
45  * Redistribution and use in source and binary forms, with or without
46  * modification, are permitted provided that the following conditions
47  * are met:
48  * 1. Redistributions of source code must retain the above copyright
49  *    notice, this list of conditions and the following disclaimer.
50  * 2. Redistributions in binary form must reproduce the above copyright
51  *    notice, this list of conditions and the following disclaimer in the
52  *    documentation and/or other materials provided with the distribution.
53  * 4. Neither the name of the University nor the names of its contributors
54  *    may be used to endorse or promote products derived from this software
55  *    without specific prior written permission.
56  *
57  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
58  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
59  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
60  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
61  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
62  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
63  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
64  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
65  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
66  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
67  * SUCH DAMAGE.
68  *
69  *				New Swap System
70  *				Matthew Dillon
71  *
72  * Radix Bitmap 'blists'.
73  *
74  *	- The new swapper uses the new radix bitmap code.  This should scale
75  *	  to arbitrarily small or arbitrarily large swap spaces and an almost
76  *	  arbitrary degree of fragmentation.
77  *
78  * Features:
79  *
80  *	- on the fly reallocation of swap during putpages.  The new system
81  *	  does not try to keep previously allocated swap blocks for dirty
82  *	  pages.
83  *
84  *	- on the fly deallocation of swap
85  *
86  *	- No more garbage collection required.  Unnecessarily allocated swap
87  *	  blocks only exist for dirty vm_page_t's now and these are already
88  *	  cycled (in a high-load system) by the pager.  We also do on-the-fly
89  *	  removal of invalidated swap blocks when a page is destroyed
90  *	  or renamed.
91  *
92  * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
93  * @(#)swap_pager.c	8.9 (Berkeley) 3/21/94
94  * $FreeBSD: src/sys/vm/swap_pager.c,v 1.130.2.12 2002/08/31 21:15:55 dillon Exp $
95  */
96 
97 #include <sys/param.h>
98 #include <sys/systm.h>
99 #include <sys/conf.h>
100 #include <sys/kernel.h>
101 #include <sys/proc.h>
102 #include <sys/buf.h>
103 #include <sys/vnode.h>
104 #include <sys/malloc.h>
105 #include <sys/vmmeter.h>
106 #include <sys/sysctl.h>
107 #include <sys/blist.h>
108 #include <sys/lock.h>
109 #include <sys/thread2.h>
110 
111 #include "opt_swap.h"
112 #include <vm/vm.h>
113 #include <vm/vm_object.h>
114 #include <vm/vm_page.h>
115 #include <vm/vm_pager.h>
116 #include <vm/vm_pageout.h>
117 #include <vm/swap_pager.h>
118 #include <vm/vm_extern.h>
119 #include <vm/vm_zone.h>
120 #include <vm/vnode_pager.h>
121 
122 #include <sys/buf2.h>
123 #include <vm/vm_page2.h>
124 
125 #ifndef MAX_PAGEOUT_CLUSTER
126 #define MAX_PAGEOUT_CLUSTER	SWB_NPAGES
127 #endif
128 
129 #define SWM_FREE	0x02	/* free, period			*/
130 #define SWM_POP		0x04	/* pop out			*/
131 
132 #define SWBIO_READ	0x01
133 #define SWBIO_WRITE	0x02
134 #define SWBIO_SYNC	0x04
135 
136 struct swfreeinfo {
137 	vm_object_t	object;
138 	vm_pindex_t	basei;
139 	vm_pindex_t	begi;
140 	vm_pindex_t	endi;	/* inclusive */
141 };
142 
143 struct swswapoffinfo {
144 	vm_object_t	object;
145 	int		devidx;
146 };
147 
148 /*
149  * vm_swap_size is in page-sized chunks now.  It was DEV_BSIZE'd chunks
150  * in the old system.
151  */
152 
153 int swap_pager_full;		/* swap space exhaustion (task killing) */
154 int vm_swap_cache_use;
155 int vm_swap_anon_use;
156 
157 static int swap_pager_almost_full; /* swap space exhaustion (w/ hysteresis)*/
158 static int nsw_rcount;		/* free read buffers			*/
159 static int nsw_wcount_sync;	/* limit write buffers / synchronous	*/
160 static int nsw_wcount_async;	/* limit write buffers / asynchronous	*/
161 static int nsw_wcount_async_max;/* assigned maximum			*/
162 static int nsw_cluster_max;	/* maximum VOP I/O allowed		*/
163 
164 struct blist *swapblist;
165 static int swap_async_max = 4;	/* maximum in-progress async I/O's	*/
166 static int swap_burst_read = 0;	/* allow burst reading */
167 static swblk_t swapiterator;	/* linearize allocations */
168 
169 /* from vm_swap.c */
170 extern struct vnode *swapdev_vp;
171 extern struct swdevt *swdevt;
172 extern int nswdev;
173 
174 #define BLK2DEVIDX(blk) (nswdev > 1 ? blk / dmmax % nswdev : 0)
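/*
 * Worked example (values are illustrative only): swap is interleaved
 * across the configured devices in dmmax-page stripes.  If dmmax were
 * 32 and nswdev were 2, swap block 100 would map to stripe 100/32 = 3,
 * and 3 % 2 = 1, i.e. the second swap device.  With a single device
 * the macro always yields index 0.
 */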
175 
176 SYSCTL_INT(_vm, OID_AUTO, swap_async_max,
177         CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops");
178 SYSCTL_INT(_vm, OID_AUTO, swap_burst_read,
179         CTLFLAG_RW, &swap_burst_read, 0, "Allow burst reads for pageins");
180 
181 SYSCTL_INT(_vm, OID_AUTO, swap_cache_use,
182         CTLFLAG_RD, &vm_swap_cache_use, 0, "");
183 SYSCTL_INT(_vm, OID_AUTO, swap_anon_use,
184         CTLFLAG_RD, &vm_swap_anon_use, 0, "");
185 SYSCTL_INT(_vm, OID_AUTO, swap_size,
186         CTLFLAG_RD, &vm_swap_size, 0, "");
187 
188 vm_zone_t		swap_zone;
189 
190 /*
191  * Red-Black tree for swblock entries
192  *
193  * The caller must hold vm_token
194  */
195 RB_GENERATE2(swblock_rb_tree, swblock, swb_entry, rb_swblock_compare,
196 	     vm_pindex_t, swb_index);
197 
198 int
199 rb_swblock_compare(struct swblock *swb1, struct swblock *swb2)
200 {
201 	if (swb1->swb_index < swb2->swb_index)
202 		return(-1);
203 	if (swb1->swb_index > swb2->swb_index)
204 		return(1);
205 	return(0);
206 }
207 
208 static
209 int
210 rb_swblock_scancmp(struct swblock *swb, void *data)
211 {
212 	struct swfreeinfo *info = data;
213 
214 	if (swb->swb_index < info->basei)
215 		return(-1);
216 	if (swb->swb_index > info->endi)
217 		return(1);
218 	return(0);
219 }
220 
221 static
222 int
223 rb_swblock_condcmp(struct swblock *swb, void *data)
224 {
225 	struct swfreeinfo *info = data;
226 
227 	if (swb->swb_index < info->basei)
228 		return(-1);
229 	return(0);
230 }
231 
232 /*
233  * pagerops for OBJT_SWAP - "swap pager".  Some ops are also global procedure
234  * calls hooked from other parts of the VM system and do not appear here.
235  * (see vm/swap_pager.h).
236  */
237 
238 static void	swap_pager_dealloc (vm_object_t object);
239 static int	swap_pager_getpage (vm_object_t, vm_page_t *, int);
240 static void	swap_chain_iodone(struct bio *biox);
241 
242 struct pagerops swappagerops = {
243 	swap_pager_dealloc,	/* deallocate an OBJT_SWAP object	*/
244 	swap_pager_getpage,	/* pagein				*/
245 	swap_pager_putpages,	/* pageout				*/
246 	swap_pager_haspage	/* get backing store status for page	*/
247 };
248 
249 /*
250  * dmmax is in page-sized chunks with the new swap system.  It was
251  * dev-bsized chunks in the old.  dmmax is always a power of 2.
252  *
253  * swap_*() routines are externally accessible.  swp_*() routines are
254  * internal.
255  */
256 
257 int dmmax;
258 static int dmmax_mask;
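/*
 * Illustrative note: dmmax_mask (~(dmmax - 1)) identifies the stripe a
 * swap block belongs to.  Two blocks a and b lie in the same dmmax-page
 * stripe exactly when ((a ^ b) & dmmax_mask) == 0; e.g. if dmmax were 32,
 * blocks 33 and 40 share a stripe while blocks 31 and 32 do not.  The
 * strategy and putpages paths use this test to avoid building an I/O
 * that would cross a stripe (and hence possibly a device) boundary.
 */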
259 int nswap_lowat = 128;		/* in pages, swap_pager_almost_full warn */
260 int nswap_hiwat = 512;		/* in pages, swap_pager_almost_full warn */
261 
262 static __inline void	swp_sizecheck (void);
263 static void	swp_pager_async_iodone (struct bio *bio);
264 
265 /*
266  * Swap bitmap functions
267  */
268 
269 static __inline void	swp_pager_freeswapspace(vm_object_t object,
270 						swblk_t blk, int npages);
271 static __inline swblk_t	swp_pager_getswapspace(vm_object_t object, int npages);
272 
273 /*
274  * Metadata functions
275  */
276 
277 static void swp_pager_meta_convert(vm_object_t);
278 static void swp_pager_meta_build(vm_object_t, vm_pindex_t, swblk_t);
279 static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t);
280 static void swp_pager_meta_free_all(vm_object_t);
281 static swblk_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int);
282 
283 /*
284  * SWP_SIZECHECK() -	update swap_pager_full indication
285  *
286  *	update the swap_pager_almost_full indication and warn when we are
287  *	about to run out of swap space, using lowat/hiwat hysteresis.
288  *
289  *	Clear swap_pager_full ( task killing ) indication when lowat is met.
290  *
291  * No restrictions on call
292  * This routine may not block.
293  * SMP races are ok.
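 *
 * Illustrative example with the defaults above (nswap_lowat = 128,
 * nswap_hiwat = 512): the "out of swap space" warning latches once
 * fewer than 128 pages of swap remain free and is only cleared again
 * after free swap climbs back above 512 pages, so the console is not
 * spammed while free swap hovers around the low-water mark.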
294  */
295 static __inline void
296 swp_sizecheck(void)
297 {
298 	if (vm_swap_size < nswap_lowat) {
299 		if (swap_pager_almost_full == 0) {
300 			kprintf("swap_pager: out of swap space\n");
301 			swap_pager_almost_full = 1;
302 		}
303 	} else {
304 		swap_pager_full = 0;
305 		if (vm_swap_size > nswap_hiwat)
306 			swap_pager_almost_full = 0;
307 	}
308 }
309 
310 /*
311  * SWAP_PAGER_INIT() -	initialize the swap pager!
312  *
313  *	Expected to be started from system init.  NOTE:  This code is run
314  *	before much else so be careful what you depend on.  Most of the VM
315  *	system has yet to be initialized at this point.
316  *
317  * Called from the low level boot code only.
318  */
319 static void
320 swap_pager_init(void *arg __unused)
321 {
322 	/*
323 	 * Device Stripe, in PAGE_SIZE'd blocks
324 	 */
325 	dmmax = SWB_NPAGES * 2;
326 	dmmax_mask = ~(dmmax - 1);
327 }
328 SYSINIT(vm_mem, SI_BOOT1_VM, SI_ORDER_THIRD, swap_pager_init, NULL)
329 
330 /*
331  * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process
332  *
333  *	Expected to be started from pageout process once, prior to entering
334  *	its main loop.
335  *
336  * Called from the low level boot code only.
337  */
338 void
339 swap_pager_swap_init(void)
340 {
341 	int n, n2;
342 
343 	/*
344 	 * Number of in-transit swap bp operations.  Don't
345 	 * exhaust the pbufs completely.  Make sure we
346 	 * initialize workable values (0 will work for hysteresis
347 	 * but it isn't very efficient).
348 	 *
349 	 * The nsw_cluster_max is constrained by the number of pages an XIO
350 	 * holds, i.e., (MAXPHYS/PAGE_SIZE) and our locally defined
351 	 * MAX_PAGEOUT_CLUSTER.   Also be aware that swap ops are
352 	 * constrained by the swap device interleave stripe size.
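	 * For illustration only: with a 128KB MAXPHYS and 4KB pages that
	 * would cap a single clustered swap I/O at 32 pages; the actual
	 * values depend on the platform and kernel configuration.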
353 	 *
354 	 * Currently we hardwire nsw_wcount_async to 4.  This limit is
355 	 * designed to prevent other I/O from having high latencies due to
356 	 * our pageout I/O.  The value 4 works well for one or two active swap
357 	 * devices but is probably a little low if you have more.  Even so,
358 	 * a higher value would probably generate only a limited improvement
359 	 * with three or four active swap devices since the system does not
360 	 * typically have to pageout at extreme bandwidths.   We will want
361 	 * at least 2 per swap device, and 4 is a pretty good value if you
362 	 * have one NFS swap device due to the command/ack latency over NFS.
363 	 * So it all works out pretty well.
364 	 */
365 
366 	nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER);
367 
368 	nsw_rcount = (nswbuf + 1) / 2;
369 	nsw_wcount_sync = (nswbuf + 3) / 4;
370 	nsw_wcount_async = 4;
371 	nsw_wcount_async_max = nsw_wcount_async;
372 
373 	/*
374 	 * The zone is dynamically allocated so generally size it to
375 	 * maxswzone (32MB to 512MB of KVM).  Set a minimum size based
376 	 * on physical memory of around 8x (each swblock can hold 16 pages).
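	 * (That is, n = v_page_count / 2 swblock entries, each describing
	 * up to 16 pages of swap, lets the zone track roughly 8x physical
	 * memory worth of swap; maxswzone can only raise that figure.)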
377 	 *
378 	 * With the advent of SSDs (vs HDs) the practical (swap:memory) ratio
379 	 * has increased dramatically.
380 	 */
381 	n = vmstats.v_page_count / 2;
382 	if (maxswzone && n < maxswzone / sizeof(struct swblock))
383 		n = maxswzone / sizeof(struct swblock);
384 	n2 = n;
385 
386 	do {
387 		swap_zone = zinit(
388 			"SWAPMETA",
389 			sizeof(struct swblock),
390 			n,
391 			ZONE_INTERRUPT,
392 			1);
393 		if (swap_zone != NULL)
394 			break;
395 		/*
396 		 * if the allocation failed, try a zone two thirds the
397 		 * size of the previous attempt.
398 		 */
399 		n -= ((n + 2) / 3);
400 	} while (n > 0);
401 
402 	if (swap_zone == NULL)
403 		panic("swap_pager_swap_init: swap_zone == NULL");
404 	if (n2 != n)
405 		kprintf("Swap zone entries reduced from %d to %d.\n", n2, n);
406 }
407 
408 /*
409  * SWAP_PAGER_ALLOC() -	allocate a new OBJT_SWAP VM object and instantiate
410  *			its metadata structures.
411  *
412  *	This routine is called from the mmap and fork code to create a new
413  *	OBJT_SWAP object.  We do this by creating an OBJT_DEFAULT object
414  *	and then converting it with swp_pager_meta_convert().
415  *
416  *	We only support unnamed objects.
417  *
418  * No restrictions.
419  */
420 vm_object_t
421 swap_pager_alloc(void *handle, off_t size, vm_prot_t prot, off_t offset)
422 {
423 	vm_object_t object;
424 
425 	KKASSERT(handle == NULL);
426 	object = vm_object_allocate_hold(OBJT_DEFAULT,
427 					 OFF_TO_IDX(offset + PAGE_MASK + size));
428 	swp_pager_meta_convert(object);
429 	vm_object_drop(object);
430 
431 	return (object);
432 }
433 
434 /*
435  * SWAP_PAGER_DEALLOC() -	remove swap metadata from object
436  *
437  *	The swap backing for the object is destroyed.  The code is
438  *	designed such that we can reinstantiate it later, but this
439  *	routine is typically called only when the entire object is
440  *	about to be destroyed.
441  *
442  * The object must be locked or unreferenceable.
443  * No other requirements.
444  */
445 static void
446 swap_pager_dealloc(vm_object_t object)
447 {
448 	vm_object_hold(object);
449 	vm_object_pip_wait(object, "swpdea");
450 
451 	/*
452 	 * Free all remaining metadata.  We only bother to free it from
453 	 * the swap meta data.  We do not attempt to free swapblk's still
454 	 * associated with vm_page_t's for this object.  We do not care
455 	 * if paging is still in progress on some objects.
456 	 */
457 	swp_pager_meta_free_all(object);
458 	vm_object_drop(object);
459 }
460 
461 /************************************************************************
462  *			SWAP PAGER BITMAP ROUTINES			*
463  ************************************************************************/
464 
465 /*
466  * SWP_PAGER_GETSWAPSPACE() -	allocate raw swap space
467  *
468  *	Allocate swap for the requested number of pages.  The starting
469  *	swap block number (a page index) is returned or SWAPBLK_NONE
470  *	if the allocation failed.
471  *
472  *	Also has the side effect of advising that somebody made a mistake
473  *	when they configured swap and didn't configure enough.
474  *
475  * The caller must hold the object.
476  * This routine may not block.
477  */
478 static __inline swblk_t
479 swp_pager_getswapspace(vm_object_t object, int npages)
480 {
481 	swblk_t blk;
482 
483 	lwkt_gettoken(&vm_token);
484 	blk = blist_allocat(swapblist, npages, swapiterator);
485 	if (blk == SWAPBLK_NONE)
486 		blk = blist_allocat(swapblist, npages, 0);
487 	if (blk == SWAPBLK_NONE) {
488 		if (swap_pager_full != 2) {
489 			kprintf("swap_pager_getswapspace: failed alloc=%d\n",
490 				npages);
491 			swap_pager_full = 2;
492 			swap_pager_almost_full = 1;
493 		}
494 	} else {
495 		swapiterator = blk;
496 		swapacctspace(blk, -npages);
497 		if (object->type == OBJT_SWAP)
498 			vm_swap_anon_use += npages;
499 		else
500 			vm_swap_cache_use += npages;
501 		swp_sizecheck();
502 	}
503 	lwkt_reltoken(&vm_token);
504 	return(blk);
505 }
506 
507 /*
508  * SWP_PAGER_FREESWAPSPACE() -	free raw swap space
509  *
510  *	This routine returns the specified swap blocks back to the bitmap.
511  *
512  *	Note:  This routine may not block (it could in the old swap code),
513  *	and through the use of the new blist routines it does not block.
514  *
515  *	We must be called at splvm() to avoid races with bitmap frees from
516  *	vm_page_remove() aka swap_pager_page_removed().
517  *
518  * This routine may not block.
519  */
520 
521 static __inline void
522 swp_pager_freeswapspace(vm_object_t object, swblk_t blk, int npages)
523 {
524 	struct swdevt *sp = &swdevt[BLK2DEVIDX(blk)];
525 
526 	lwkt_gettoken(&vm_token);
527 	sp->sw_nused -= npages;
528 	if (object->type == OBJT_SWAP)
529 		vm_swap_anon_use -= npages;
530 	else
531 		vm_swap_cache_use -= npages;
532 
533 	if (sp->sw_flags & SW_CLOSING) {
534 		lwkt_reltoken(&vm_token);
535 		return;
536 	}
537 
538 	blist_free(swapblist, blk, npages);
539 	vm_swap_size += npages;
540 	swp_sizecheck();
541 	lwkt_reltoken(&vm_token);
542 }
543 
544 /*
545  * SWAP_PAGER_FREESPACE() -	frees swap blocks associated with a page
546  *				range within an object.
547  *
548  *	This is a globally accessible routine.
549  *
550  *	This routine removes swapblk assignments from swap metadata.
551  *
552  *	The external callers of this routine typically have already destroyed
553  *	or renamed vm_page_t's associated with this range in the object so
554  *	we should be ok.
555  *
556  * No requirements.
557  */
558 void
559 swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_pindex_t size)
560 {
561 	vm_object_hold(object);
562 	swp_pager_meta_free(object, start, size);
563 	vm_object_drop(object);
564 }
565 
566 /*
567  * No requirements.
568  */
569 void
570 swap_pager_freespace_all(vm_object_t object)
571 {
572 	vm_object_hold(object);
573 	swp_pager_meta_free_all(object);
574 	vm_object_drop(object);
575 }
576 
577 /*
578  * This function conditionally frees swap cache swap starting at
579  * (*basei) in the object.  (count) swap blocks will be nominally freed.
580  * The actual number of blocks freed can be more or less than the
581  * requested number.
582  *
583  * This function nominally returns the number of blocks freed.  However,
584  * the actual number of blocks freed may be less than the returned value.
585  * If the function is unable to exhaust the object or if it is able to
586  * free (approximately) the requested number of blocks it returns
587  * a value n > count.
588  *
589  * If we exhaust the object we will return a value n <= count.
590  *
591  * The caller must hold the object.
592  *
593  * WARNING!  If count == 0 then -1 can be returned as a degenerate case,
594  *	     callers should always pass a count value > 0.
595  */
596 static int swap_pager_condfree_callback(struct swblock *swap, void *data);
597 
598 int
599 swap_pager_condfree(vm_object_t object, vm_pindex_t *basei, int count)
600 {
601 	struct swfreeinfo info;
602 	int n;
603 	int t;
604 
605 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
606 
607 	info.object = object;
608 	info.basei = *basei;	/* skip up to this page index */
609 	info.begi = count;	/* max swap pages to destroy */
610 	info.endi = count * 8;	/* max swblocks to scan */
611 
612 	swblock_rb_tree_RB_SCAN(&object->swblock_root, rb_swblock_condcmp,
613 				swap_pager_condfree_callback, &info);
614 	*basei = info.basei;
615 
616 	/*
617 	 * Take the higher difference swblocks vs pages
618 	 */
619 	n = count - (int)info.begi;
620 	t = count * 8 - (int)info.endi;
621 	if (n < t)
622 		n = t;
623 	if (n < 1)
624 		n = 1;
625 	return(n);
626 }
627 
628 /*
629  * The idea is to free whole meta-block to avoid fragmenting
630  * the swap space or disk I/O.  We only do this if NO VM pages
631  * are present.
632  *
633  * We do not have to deal with clearing PG_SWAPPED in related VM
634  * pages because there are no related VM pages.
635  *
636  * The caller must hold the object.
637  */
638 static int
639 swap_pager_condfree_callback(struct swblock *swap, void *data)
640 {
641 	struct swfreeinfo *info = data;
642 	vm_object_t object = info->object;
643 	int i;
644 
645 	for (i = 0; i < SWAP_META_PAGES; ++i) {
646 		if (vm_page_lookup(object, swap->swb_index + i))
647 			break;
648 	}
649 	info->basei = swap->swb_index + SWAP_META_PAGES;
650 	if (i == SWAP_META_PAGES) {
651 		info->begi -= swap->swb_count;
652 		swap_pager_freespace(object, swap->swb_index, SWAP_META_PAGES);
653 	}
654 	--info->endi;
655 	if ((int)info->begi < 0 || (int)info->endi < 0)
656 		return(-1);
657 	lwkt_yield();
658 	return(0);
659 }
660 
661 /*
662  * Called by vm_page_alloc() when a new VM page is inserted
663  * into a VM object.  Checks whether swap has been assigned to
664  * the page and sets PG_SWAPPED as necessary.
665  *
666  * No requirements.
667  */
668 void
669 swap_pager_page_inserted(vm_page_t m)
670 {
671 	if (m->object->swblock_count) {
672 		vm_object_hold(m->object);
673 		if (swp_pager_meta_ctl(m->object, m->pindex, 0) != SWAPBLK_NONE)
674 			vm_page_flag_set(m, PG_SWAPPED);
675 		vm_object_drop(m->object);
676 	}
677 }
678 
679 /*
680  * SWAP_PAGER_RESERVE() - reserve swap blocks in object
681  *
682  *	Assigns swap blocks to the specified range within the object.  The
683  *	swap blocks are not zeroed.  Any previous swap assignment is destroyed.
684  *
685  *	Returns 0 on success, -1 on failure.
686  *
687  * The caller is responsible for avoiding races in the specified range.
688  * No other requirements.
689  */
690 int
691 swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size)
692 {
693 	int n = 0;
694 	swblk_t blk = SWAPBLK_NONE;
695 	vm_pindex_t beg = start;	/* save start index */
696 
697 	vm_object_hold(object);
698 
699 	while (size) {
700 		if (n == 0) {
701 			n = BLIST_MAX_ALLOC;
702 			while ((blk = swp_pager_getswapspace(object, n)) ==
703 			       SWAPBLK_NONE)
704 			{
705 				n >>= 1;
706 				if (n == 0) {
707 					swp_pager_meta_free(object, beg,
708 							    start - beg);
709 					vm_object_drop(object);
710 					return(-1);
711 				}
712 			}
713 		}
714 		swp_pager_meta_build(object, start, blk);
715 		--size;
716 		++start;
717 		++blk;
718 		--n;
719 	}
720 	swp_pager_meta_free(object, start, n);
721 	vm_object_drop(object);
722 	return(0);
723 }
724 
725 /*
726  * SWAP_PAGER_COPY() -  copy blocks from source pager to destination pager
727  *			and destroy the source.
728  *
729  *	Copy any valid swapblks from the source to the destination.  In
730  *	cases where both the source and destination have a valid swapblk,
731  *	we keep the destination's.
732  *
733  *	This routine is allowed to block.  It may block allocating metadata
734  *	indirectly through swp_pager_meta_build() or if paging is still in
735  *	progress on the source.
736  *
737  *	XXX vm_page_collapse() kinda expects us not to block because we
738  *	supposedly do not need to allocate memory, but for the moment we
739  *	*may* have to get a little memory from the zone allocator, but
740  *	it is taken from the interrupt memory.  We should be ok.
741  *
742  *	The source object contains no vm_page_t's (which is just as well)
743  *	The source object is of type OBJT_SWAP.
744  *
745  *	The source and destination objects must be held by the caller.
746  */
747 void
748 swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject,
749 		vm_pindex_t base_index, int destroysource)
750 {
751 	vm_pindex_t i;
752 
753 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(srcobject));
754 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(dstobject));
755 
756 	/*
757 	 * transfer source to destination.
758 	 */
759 	for (i = 0; i < dstobject->size; ++i) {
760 		swblk_t dstaddr;
761 
762 		/*
763 		 * Locate (without changing) the swapblk on the destination,
764 		 * unless it is invalid in which case free it silently, or
765 		 * if the destination is a resident page, in which case the
766 		 * source is thrown away.
767 		 */
768 		dstaddr = swp_pager_meta_ctl(dstobject, i, 0);
769 
770 		if (dstaddr == SWAPBLK_NONE) {
771 			/*
772 			 * Destination has no swapblk and is not resident,
773 			 * copy source.
774 			 */
775 			swblk_t srcaddr;
776 
777 			srcaddr = swp_pager_meta_ctl(srcobject,
778 						     base_index + i, SWM_POP);
779 
780 			if (srcaddr != SWAPBLK_NONE)
781 				swp_pager_meta_build(dstobject, i, srcaddr);
782 		} else {
783 			/*
784 			 * Destination has valid swapblk or it is represented
785 			 * by a resident page.  We destroy the sourceblock.
786 			 */
787 			swp_pager_meta_ctl(srcobject, base_index + i, SWM_FREE);
788 		}
789 	}
790 
791 	/*
792 	 * Free left over swap blocks in source.
793 	 *
794  * We have to revert the type to OBJT_DEFAULT so we do not accidentally
795 	 * double-remove the object from the swap queues.
796 	 */
797 	if (destroysource) {
798 		/*
799 		 * Reverting the type is not necessary, the caller is going
800 		 * to destroy srcobject directly, but I'm doing it here
801 		 * for consistency since we've removed the object from its
802 		 * queues.
803 		 */
804 		swp_pager_meta_free_all(srcobject);
805 		if (srcobject->type == OBJT_SWAP)
806 			srcobject->type = OBJT_DEFAULT;
807 	}
808 }
809 
810 /*
811  * SWAP_PAGER_HASPAGE() -	determine if we have good backing store for
812  *				the requested page.
813  *
814  *	We determine whether good backing store exists for the requested
815  *	page and return TRUE if it does, FALSE if it doesn't.
816  *
817  *	If TRUE, we also try to determine how much valid, contiguous backing
818  *	store exists before and after the requested page within a reasonable
819  *	distance.  We do not try to restrict it to the swap device stripe
820  *	(that is handled in getpages/putpages).  It probably isn't worth
821  *	doing here.
822  *
823  * No requirements.
824  */
825 boolean_t
826 swap_pager_haspage(vm_object_t object, vm_pindex_t pindex)
827 {
828 	swblk_t blk0;
829 
830 	/*
831 	 * do we have good backing store at the requested index ?
832 	 */
833 	vm_object_hold(object);
834 	blk0 = swp_pager_meta_ctl(object, pindex, 0);
835 
836 	if (blk0 == SWAPBLK_NONE) {
837 		vm_object_drop(object);
838 		return (FALSE);
839 	}
840 	vm_object_drop(object);
841 	return (TRUE);
842 }
843 
844 /*
845  * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
846  *
847  * This removes any associated swap backing store, whether valid or
848  * not, from the page.  This operates on any VM object, not just OBJT_SWAP
849  * objects.
850  *
851  * This routine is typically called when a page is made dirty, at
852  * which point any associated swap can be freed.  MADV_FREE also
853  * calls us in a special-case situation
854  *
855  * NOTE!!!  If the page is clean and the swap was valid, the caller
856  * should make the page dirty before calling this routine.  This routine
857  * does NOT change the m->dirty status of the page.  Also: MADV_FREE
858  * depends on it.
859  *
860  * The page must be busied or soft-busied.
861  * The caller can hold the object to avoid blocking, else we might block.
862  * No other requirements.
863  */
864 void
865 swap_pager_unswapped(vm_page_t m)
866 {
867 	if (m->flags & PG_SWAPPED) {
868 		vm_object_hold(m->object);
869 		KKASSERT(m->flags & PG_SWAPPED);
870 		swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE);
871 		vm_page_flag_clear(m, PG_SWAPPED);
872 		vm_object_drop(m->object);
873 	}
874 }
875 
876 /*
877  * SWAP_PAGER_STRATEGY() - read, write, free blocks
878  *
879  * This implements a VM OBJECT strategy function using swap backing store.
880  * This can operate on any VM OBJECT type, not necessarily just OBJT_SWAP
881  * types.
882  *
883  * This is intended to be a cacheless interface (i.e. caching occurs at
884  * higher levels), and is also used as a swap-based SSD cache for vnode
885  * and device objects.
886  *
887  * All I/O goes directly to and from the swap device.
888  *
889  * We currently attempt to run I/O synchronously or asynchronously as
890  * the caller requests.  This isn't perfect because we lose error
891  * sequencing when we run multiple ops in parallel to satisfy a request.
892  * But this is swap, so we let it all hang out.
893  *
894  * No requirements.
895  */
896 void
897 swap_pager_strategy(vm_object_t object, struct bio *bio)
898 {
899 	struct buf *bp = bio->bio_buf;
900 	struct bio *nbio;
901 	vm_pindex_t start;
902 	vm_pindex_t biox_blkno = 0;
903 	int count;
904 	char *data;
905 	struct bio *biox;
906 	struct buf *bufx;
907 #if 0
908 	struct bio_track *track;
909 #endif
910 
911 #if 0
912 	/*
913 	 * tracking for swapdev vnode I/Os
914 	 */
915 	if (bp->b_cmd == BUF_CMD_READ)
916 		track = &swapdev_vp->v_track_read;
917 	else
918 		track = &swapdev_vp->v_track_write;
919 #endif
920 
921 	if (bp->b_bcount & PAGE_MASK) {
922 		bp->b_error = EINVAL;
923 		bp->b_flags |= B_ERROR | B_INVAL;
924 		biodone(bio);
925 		kprintf("swap_pager_strategy: bp %p offset %lld size %d, "
926 			"not page bounded\n",
927 			bp, (long long)bio->bio_offset, (int)bp->b_bcount);
928 		return;
929 	}
930 
931 	/*
932 	 * Clear error indication, initialize page index, count, data pointer.
933 	 */
934 	bp->b_error = 0;
935 	bp->b_flags &= ~B_ERROR;
936 	bp->b_resid = bp->b_bcount;
937 
938 	start = (vm_pindex_t)(bio->bio_offset >> PAGE_SHIFT);
939 	count = howmany(bp->b_bcount, PAGE_SIZE);
940 	data = bp->b_data;
941 
942 	/*
943 	 * Deal with BUF_CMD_FREEBLKS
944 	 */
945 	if (bp->b_cmd == BUF_CMD_FREEBLKS) {
946 		/*
947 		 * FREE PAGE(s) - destroy underlying swap that is no longer
948 		 *		  needed.
949 		 */
950 		vm_object_hold(object);
951 		swp_pager_meta_free(object, start, count);
952 		vm_object_drop(object);
953 		bp->b_resid = 0;
954 		biodone(bio);
955 		return;
956 	}
957 
958 	/*
959 	 * We need to be able to create a new cluster of I/O's.  We cannot
960 	 * use the caller fields of the passed bio so push a new one.
961 	 *
962 	 * Because nbio is just a placeholder for the cluster links,
963 	 * we can biodone() the original bio instead of nbio to make
964 	 * things a bit more efficient.
965 	 */
966 	nbio = push_bio(bio);
967 	nbio->bio_offset = bio->bio_offset;
968 	nbio->bio_caller_info1.cluster_head = NULL;
969 	nbio->bio_caller_info2.cluster_tail = NULL;
970 
971 	biox = NULL;
972 	bufx = NULL;
973 
974 	/*
975 	 * Execute read or write
976 	 */
977 	vm_object_hold(object);
978 
979 	while (count > 0) {
980 		swblk_t blk;
981 
982 		/*
983 		 * Obtain block.  If block not found and writing, allocate a
984 		 * new block and build it into the object.
985 		 */
986 		blk = swp_pager_meta_ctl(object, start, 0);
987 		if ((blk == SWAPBLK_NONE) && bp->b_cmd != BUF_CMD_READ) {
988 			blk = swp_pager_getswapspace(object, 1);
989 			if (blk == SWAPBLK_NONE) {
990 				bp->b_error = ENOMEM;
991 				bp->b_flags |= B_ERROR;
992 				break;
993 			}
994 			swp_pager_meta_build(object, start, blk);
995 		}
996 
997 		/*
998 		 * Do we have to flush our current collection?  Yes if:
999 		 *
1000 		 *	- no swap block at this index
1001 		 *	- swap block is not contiguous
1002 		 *	- we cross a physical disk boundary in the
1003 		 *	  stripe.
1004 		 */
1005 		if (
1006 		    biox && (biox_blkno + btoc(bufx->b_bcount) != blk ||
1007 		     ((biox_blkno ^ blk) & dmmax_mask)
1008 		    )
1009 		) {
1010 			if (bp->b_cmd == BUF_CMD_READ) {
1011 				++mycpu->gd_cnt.v_swapin;
1012 				mycpu->gd_cnt.v_swappgsin += btoc(bufx->b_bcount);
1013 			} else {
1014 				++mycpu->gd_cnt.v_swapout;
1015 				mycpu->gd_cnt.v_swappgsout += btoc(bufx->b_bcount);
1016 				bufx->b_dirtyend = bufx->b_bcount;
1017 			}
1018 
1019 			/*
1020 			 * Finished with this buf.
1021 			 */
1022 			KKASSERT(bufx->b_bcount != 0);
1023 			if (bufx->b_cmd != BUF_CMD_READ)
1024 				bufx->b_dirtyend = bufx->b_bcount;
1025 			biox = NULL;
1026 			bufx = NULL;
1027 		}
1028 
1029 		/*
1030 		 * Add new swapblk to biox, instantiating biox if necessary.
1031 		 * Zero-fill reads are able to take a shortcut.
1032 		 */
1033 		if (blk == SWAPBLK_NONE) {
1034 			/*
1035 			 * We can only get here if we are reading.  Since
1036 			 * we are at splvm() we can safely modify b_resid,
1037 			 * even if chain ops are in progress.
1038 			 */
1039 			bzero(data, PAGE_SIZE);
1040 			bp->b_resid -= PAGE_SIZE;
1041 		} else {
1042 			if (biox == NULL) {
1043 				/* XXX chain count > 4, wait to <= 4 */
1044 
1045 				bufx = getpbuf(NULL);
1046 				biox = &bufx->b_bio1;
1047 				cluster_append(nbio, bufx);
1048 				bufx->b_flags |= (bp->b_flags & B_ORDERED);
1049 				bufx->b_cmd = bp->b_cmd;
1050 				biox->bio_done = swap_chain_iodone;
1051 				biox->bio_offset = (off_t)blk << PAGE_SHIFT;
1052 				biox->bio_caller_info1.cluster_parent = nbio;
1053 				biox_blkno = blk;
1054 				bufx->b_bcount = 0;
1055 				bufx->b_data = data;
1056 			}
1057 			bufx->b_bcount += PAGE_SIZE;
1058 		}
1059 		--count;
1060 		++start;
1061 		data += PAGE_SIZE;
1062 	}
1063 
1064 	vm_object_drop(object);
1065 
1066 	/*
1067 	 *  Flush out last buffer
1068 	 */
1069 	if (biox) {
1070 		if (bufx->b_cmd == BUF_CMD_READ) {
1071 			++mycpu->gd_cnt.v_swapin;
1072 			mycpu->gd_cnt.v_swappgsin += btoc(bufx->b_bcount);
1073 		} else {
1074 			++mycpu->gd_cnt.v_swapout;
1075 			mycpu->gd_cnt.v_swappgsout += btoc(bufx->b_bcount);
1076 			bufx->b_dirtyend = bufx->b_bcount;
1077 		}
1078 		KKASSERT(bufx->b_bcount);
1079 		if (bufx->b_cmd != BUF_CMD_READ)
1080 			bufx->b_dirtyend = bufx->b_bcount;
1081 		/* biox, bufx = NULL */
1082 	}
1083 
1084 	/*
1085 	 * Now initiate all the I/O.  Be careful looping on our chain as
1086 	 * I/O's may complete while we are still initiating them.
1087 	 *
1088 	 * If the request is a 100% sparse read no bios will be present
1089 	 * and we just biodone() the buffer.
1090 	 */
1091 	nbio->bio_caller_info2.cluster_tail = NULL;
1092 	bufx = nbio->bio_caller_info1.cluster_head;
1093 
1094 	if (bufx) {
1095 		while (bufx) {
1096 			biox = &bufx->b_bio1;
1097 			BUF_KERNPROC(bufx);
1098 			bufx = bufx->b_cluster_next;
1099 			vn_strategy(swapdev_vp, biox);
1100 		}
1101 	} else {
1102 		biodone(bio);
1103 	}
1104 
1105 	/*
1106 	 * Completion of the cluster will also call biodone_chain(nbio).
1107 	 * We never call biodone(nbio) so we don't have to worry about
1108 	 * setting up a bio_done callback.  It's handled in the sub-IO.
1109 	 */
1110 	/**/
1111 }
1112 
1113 /*
1114  * biodone callback
1115  *
1116  * No requirements.
1117  */
1118 static void
1119 swap_chain_iodone(struct bio *biox)
1120 {
1121 	struct buf **nextp;
1122 	struct buf *bufx;	/* chained sub-buffer */
1123 	struct bio *nbio;	/* parent nbio with chain glue */
1124 	struct buf *bp;		/* original bp associated with nbio */
1125 	int chain_empty;
1126 
1127 	bufx = biox->bio_buf;
1128 	nbio = biox->bio_caller_info1.cluster_parent;
1129 	bp = nbio->bio_buf;
1130 
1131 	/*
1132 	 * Update the original buffer
1133 	 */
1134         KKASSERT(bp != NULL);
1135 	if (bufx->b_flags & B_ERROR) {
1136 		atomic_set_int(&bufx->b_flags, B_ERROR);
1137 		bp->b_error = bufx->b_error;	/* race ok */
1138 	} else if (bufx->b_resid != 0) {
1139 		atomic_set_int(&bufx->b_flags, B_ERROR);
1140 		bp->b_error = EINVAL;		/* race ok */
1141 	} else {
1142 		atomic_subtract_int(&bp->b_resid, bufx->b_bcount);
1143 	}
1144 
1145 	/*
1146 	 * Remove us from the chain.
1147 	 */
1148 	spin_lock(&bp->b_lock.lk_spinlock);
1149 	nextp = &nbio->bio_caller_info1.cluster_head;
1150 	while (*nextp != bufx) {
1151 		KKASSERT(*nextp != NULL);
1152 		nextp = &(*nextp)->b_cluster_next;
1153 	}
1154 	*nextp = bufx->b_cluster_next;
1155 	chain_empty = (nbio->bio_caller_info1.cluster_head == NULL);
1156 	spin_unlock(&bp->b_lock.lk_spinlock);
1157 
1158 	/*
1159 	 * Clean up bufx.  If the chain is now empty we finish out
1160 	 * the parent.  Note that we may be racing other completions
1161 	 * so we must use the chain_empty status from above.
1162 	 */
1163 	if (chain_empty) {
1164 		if (bp->b_resid != 0 && !(bp->b_flags & B_ERROR)) {
1165 			atomic_set_int(&bp->b_flags, B_ERROR);
1166 			bp->b_error = EINVAL;
1167 		}
1168 		biodone_chain(nbio);
1169         }
1170         relpbuf(bufx, NULL);
1171 }
1172 
1173 /*
1174  * SWAP_PAGER_GETPAGES() - bring page in from swap
1175  *
1176  * The requested page may have to be brought in from swap.  Calculate the
1177  * swap block and bring in additional pages if possible.  All pages must
1178  * have contiguous swap block assignments and reside in the same object.
1179  *
1180  * The caller has a single vm_object_pip_add() reference prior to
1181  * calling us and we should return with the same.
1182  *
1183  * The caller has BUSY'd the page.  We should return with (*mpp) left busy,
1184  * and any additional pages unbusied.
1185  *
1186  * If the caller encounters a PG_RAM page it will pass it to us even though
1187  * it may be valid and dirty.  We cannot overwrite the page in this case!
1188  * The case is used to allow us to issue pure read-aheads.
1189  *
1190  * NOTE! XXX This code does not entirely pipeline yet due to the fact that
1191  *       the PG_RAM page is validated at the same time as mreq.  What we
1192  *	 really need to do is issue a separate read-ahead pbuf.
1193  *
1194  * No requirements.
1195  */
1196 static int
1197 swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
1198 {
1199 	struct buf *bp;
1200 	struct bio *bio;
1201 	vm_page_t mreq;
1202 	vm_page_t m;
1203 	vm_offset_t kva;
1204 	swblk_t blk;
1205 	int i;
1206 	int j;
1207 	int raonly;
1208 	int error;
1209 	u_int32_t flags;
1210 	vm_page_t marray[XIO_INTERNAL_PAGES];
1211 
1212 	mreq = *mpp;
1213 
1214 	vm_object_hold(object);
1215 	if (mreq->object != object) {
1216 		panic("swap_pager_getpages: object mismatch %p/%p",
1217 		    object,
1218 		    mreq->object
1219 		);
1220 	}
1221 
1222 	/*
1223 	 * We don't want to overwrite a fully valid page as it might be
1224 	 * dirty.  This case can occur when e.g. vm_fault hits a perfectly
1225 	 * valid page with PG_RAM set.
1226 	 *
1227 	 * In this case we see if the next page is a suitable page-in
1228 	 * candidate and if it is we issue read-ahead.  PG_RAM will be
1229 	 * set on the last page of the read-ahead to continue the pipeline.
1230 	 */
1231 	if (mreq->valid == VM_PAGE_BITS_ALL) {
1232 		if (swap_burst_read == 0 || mreq->pindex + 1 >= object->size) {
1233 			vm_object_drop(object);
1234 			return(VM_PAGER_OK);
1235 		}
1236 		blk = swp_pager_meta_ctl(object, mreq->pindex + 1, 0);
1237 		if (blk == SWAPBLK_NONE) {
1238 			vm_object_drop(object);
1239 			return(VM_PAGER_OK);
1240 		}
1241 		m = vm_page_lookup_busy_try(object, mreq->pindex + 1,
1242 					    TRUE, &error);
1243 		if (error) {
1244 			vm_object_drop(object);
1245 			return(VM_PAGER_OK);
1246 		} else if (m == NULL) {
1247 			/*
1248 			 * Use VM_ALLOC_QUICK to avoid blocking on cache
1249 			 * page reuse.
1250 			 */
1251 			m = vm_page_alloc(object, mreq->pindex + 1,
1252 					  VM_ALLOC_QUICK);
1253 			if (m == NULL) {
1254 				vm_object_drop(object);
1255 				return(VM_PAGER_OK);
1256 			}
1257 		} else {
1258 			if (m->valid) {
1259 				vm_page_wakeup(m);
1260 				vm_object_drop(object);
1261 				return(VM_PAGER_OK);
1262 			}
1263 			vm_page_unqueue_nowakeup(m);
1264 		}
1265 		/* page is busy */
1266 		mreq = m;
1267 		raonly = 1;
1268 	} else {
1269 		raonly = 0;
1270 	}
1271 
1272 	/*
1273 	 * Try to block-read contiguous pages from swap if sequential,
1274 	 * otherwise just read one page.  Contiguous pages from swap must
1275 	 * reside within a single device stripe because the I/O cannot be
1276 	 * broken up across multiple stripes.
1277 	 *
1278 	 * Note that blk and iblk can be SWAPBLK_NONE but the loop is
1279 	 * set up such that the case(s) are handled implicitly.
1280 	 */
1281 	blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0);
1282 	marray[0] = mreq;
1283 
1284 	for (i = 1; swap_burst_read &&
1285 		    i < XIO_INTERNAL_PAGES &&
1286 		    mreq->pindex + i < object->size; ++i) {
1287 		swblk_t iblk;
1288 
1289 		iblk = swp_pager_meta_ctl(object, mreq->pindex + i, 0);
1290 		if (iblk != blk + i)
1291 			break;
1292 		if ((blk ^ iblk) & dmmax_mask)
1293 			break;
1294 		m = vm_page_lookup_busy_try(object, mreq->pindex + i,
1295 					    TRUE, &error);
1296 		if (error) {
1297 			break;
1298 		} else if (m == NULL) {
1299 			/*
1300 			 * Use VM_ALLOC_QUICK to avoid blocking on cache
1301 			 * page reuse.
1302 			 */
1303 			m = vm_page_alloc(object, mreq->pindex + i,
1304 					  VM_ALLOC_QUICK);
1305 			if (m == NULL)
1306 				break;
1307 		} else {
1308 			if (m->valid) {
1309 				vm_page_wakeup(m);
1310 				break;
1311 			}
1312 			vm_page_unqueue_nowakeup(m);
1313 		}
1314 		/* page is busy */
1315 		marray[i] = m;
1316 	}
1317 	if (i > 1)
1318 		vm_page_flag_set(marray[i - 1], PG_RAM);
1319 
1320 	/*
1321 	 * If mreq is the requested page and we have nothing to do return
1322 	 * VM_PAGER_FAIL.  If raonly is set mreq is just another read-ahead
1323 	 * page and must be cleaned up.
1324 	 */
1325 	if (blk == SWAPBLK_NONE) {
1326 		KKASSERT(i == 1);
1327 		if (raonly) {
1328 			vnode_pager_freepage(mreq);
1329 			vm_object_drop(object);
1330 			return(VM_PAGER_OK);
1331 		} else {
1332 			vm_object_drop(object);
1333 			return(VM_PAGER_FAIL);
1334 		}
1335 	}
1336 
1337 	/*
1338 	 * map our page(s) into kva for input
1339 	 */
1340 	bp = getpbuf_kva(&nsw_rcount);
1341 	bio = &bp->b_bio1;
1342 	kva = (vm_offset_t) bp->b_kvabase;
1343 	bcopy(marray, bp->b_xio.xio_pages, i * sizeof(vm_page_t));
1344 	pmap_qenter(kva, bp->b_xio.xio_pages, i);
1345 
1346 	bp->b_data = (caddr_t)kva;
1347 	bp->b_bcount = PAGE_SIZE * i;
1348 	bp->b_xio.xio_npages = i;
1349 	bio->bio_done = swp_pager_async_iodone;
1350 	bio->bio_offset = (off_t)blk << PAGE_SHIFT;
1351 	bio->bio_caller_info1.index = SWBIO_READ;
1352 
1353 	/*
1354 	 * Set index.  If raonly set the index beyond the array so all
1355 	 * the pages are treated the same, otherwise the original mreq is
1356 	 * at index 0.
1357 	 */
1358 	if (raonly)
1359 		bio->bio_driver_info = (void *)(intptr_t)i;
1360 	else
1361 		bio->bio_driver_info = (void *)(intptr_t)0;
1362 
1363 	for (j = 0; j < i; ++j)
1364 		vm_page_flag_set(bp->b_xio.xio_pages[j], PG_SWAPINPROG);
1365 
1366 	mycpu->gd_cnt.v_swapin++;
1367 	mycpu->gd_cnt.v_swappgsin += bp->b_xio.xio_npages;
1368 
1369 	/*
1370 	 * We still hold the lock on mreq, and our automatic completion routine
1371 	 * does not remove it.
1372 	 */
1373 	vm_object_pip_add(object, bp->b_xio.xio_npages);
1374 
1375 	/*
1376 	 * perform the I/O.  NOTE!!!  bp cannot be considered valid after
1377 	 * this point because we automatically release it on completion.
1378 	 * Instead, we look at the one page we are interested in which we
1379 	 * still hold a lock on even through the I/O completion.
1380 	 *
1381 	 * The other pages in our m[] array are also released on completion,
1382 	 * so we cannot assume they are valid anymore either.
1383 	 */
1384 	bp->b_cmd = BUF_CMD_READ;
1385 	BUF_KERNPROC(bp);
1386 	vn_strategy(swapdev_vp, bio);
1387 
1388 	/*
1389 	 * Wait for the page we want to complete.  PG_SWAPINPROG is always
1390 	 * cleared on completion.  If an I/O error occurs, SWAPBLK_NONE
1391 	 * is set in the meta-data.
1392 	 *
1393 	 * If this is a read-ahead only we return immediately without
1394 	 * waiting for I/O.
1395 	 */
1396 	if (raonly) {
1397 		vm_object_drop(object);
1398 		return(VM_PAGER_OK);
1399 	}
1400 
1401 	/*
1402 	 * Read-ahead includes originally requested page case.
1403 	 */
1404 	for (;;) {
1405 		flags = mreq->flags;
1406 		cpu_ccfence();
1407 		if ((flags & PG_SWAPINPROG) == 0)
1408 			break;
1409 		tsleep_interlock(mreq, 0);
1410 		if (!atomic_cmpset_int(&mreq->flags, flags,
1411 				       flags | PG_WANTED | PG_REFERENCED)) {
1412 			continue;
1413 		}
1414 		mycpu->gd_cnt.v_intrans++;
1415 		if (tsleep(mreq, PINTERLOCKED, "swread", hz*20)) {
1416 			kprintf(
1417 			    "swap_pager: indefinite wait buffer: "
1418 				" offset: %lld, size: %ld\n",
1419 			    (long long)bio->bio_offset,
1420 			    (long)bp->b_bcount
1421 			);
1422 		}
1423 	}
1424 
1425 	/*
1426 	 * mreq is left busied after completion, but all the other pages
1427 	 * are freed.  If we had an unrecoverable read error the page will
1428 	 * not be valid.
1429 	 */
1430 	vm_object_drop(object);
1431 	if (mreq->valid != VM_PAGE_BITS_ALL)
1432 		return(VM_PAGER_ERROR);
1433 	else
1434 		return(VM_PAGER_OK);
1435 
1436 	/*
1437 	 * A final note: in a low swap situation, we cannot deallocate swap
1438 	 * and mark a page dirty here because the caller is likely to mark
1439 	 * the page clean when we return, causing the page to possibly revert
1440 	 * to all-zero's later.
1441 	 */
1442 }
1443 
1444 /*
1445  *	swap_pager_putpages:
1446  *
1447  *	Assign swap (if necessary) and initiate I/O on the specified pages.
1448  *
1449  *	We support both OBJT_DEFAULT and OBJT_SWAP objects.  DEFAULT objects
1450  *	are automatically converted to SWAP objects.
1451  *
1452  *	In a low memory situation we may block in vn_strategy(), but the new
1453  *	vm_page reservation system coupled with properly written VFS devices
1454  *	should ensure that no low-memory deadlock occurs.  This is an area
1455  *	which needs work.
1456  *
1457  *	The parent has N vm_object_pip_add() references prior to
1458  *	calling us and will remove references for rtvals[] that are
1459  *	not set to VM_PAGER_PEND.  We need to remove the rest on I/O
1460  *	completion.
1461  *
1462  *	The parent has soft-busy'd the pages it passes us and will unbusy
1463  *	those whose rtvals[] entry is not set to VM_PAGER_PEND on return.
1464  *	We need to unbusy the rest on I/O completion.
1465  *
1466  * No requirements.
1467  */
1468 void
1469 swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
1470 		    boolean_t sync, int *rtvals)
1471 {
1472 	int i;
1473 	int n = 0;
1474 
1475 	vm_object_hold(object);
1476 
1477 	if (count && m[0]->object != object) {
1478 		panic("swap_pager_putpages: object mismatch %p/%p",
1479 		    object,
1480 		    m[0]->object
1481 		);
1482 	}
1483 
1484 	/*
1485 	 * Step 1
1486 	 *
1487 	 * Turn object into OBJT_SWAP
1488 	 * check for bogus sysops
1489 	 * force sync if not pageout process
1490 	 */
1491 	if (object->type == OBJT_DEFAULT) {
1492 		if (object->type == OBJT_DEFAULT)
1493 			swp_pager_meta_convert(object);
1494 	}
1495 
1496 	if (curthread != pagethread)
1497 		sync = TRUE;
1498 
1499 	/*
1500 	 * Step 2
1501 	 *
1502 	 * Update nsw parameters from swap_async_max sysctl values.
1503 	 * Do not let the sysop crash the machine with bogus numbers.
1504 	 */
1505 	if (swap_async_max != nsw_wcount_async_max) {
1506 		int n;
1507 
1508 		/*
1509 		 * limit range
1510 		 */
1511 		if ((n = swap_async_max) > nswbuf / 2)
1512 			n = nswbuf / 2;
1513 		if (n < 1)
1514 			n = 1;
1515 		swap_async_max = n;
1516 
1517 		/*
1518 		 * Adjust difference ( if possible ).  If the current async
1519 		 * count is too low, we may not be able to make the adjustment
1520 		 * at this time.
1521 		 *
1522 		 * vm_token needed for nsw_wcount sleep interlock
1523 		 */
1524 		lwkt_gettoken(&vm_token);
1525 		n -= nsw_wcount_async_max;
1526 		if (nsw_wcount_async + n >= 0) {
1527 			nsw_wcount_async_max += n;
1528 			pbuf_adjcount(&nsw_wcount_async, n);
1529 		}
1530 		lwkt_reltoken(&vm_token);
1531 	}
1532 
1533 	/*
1534 	 * Step 3
1535 	 *
1536 	 * Assign swap blocks and issue I/O.  We reallocate swap on the fly.
1537 	 * The page is left dirty until the pageout operation completes
1538 	 * successfully.
1539 	 */
1540 
1541 	for (i = 0; i < count; i += n) {
1542 		struct buf *bp;
1543 		struct bio *bio;
1544 		swblk_t blk;
1545 		int j;
1546 
1547 		/*
1548 		 * Maximum I/O size is limited by a number of factors.
1549 		 */
1550 
1551 		n = min(BLIST_MAX_ALLOC, count - i);
1552 		n = min(n, nsw_cluster_max);
1553 
1554 		lwkt_gettoken(&vm_token);
1555 
1556 		/*
1557 		 * Get biggest block of swap we can.  If we fail, fall
1558 		 * back and try to allocate a smaller block.  Don't go
1559 		 * overboard trying to allocate space if it would overly
1560 		 * fragment swap.
1561 		 */
1562 		while (
1563 		    (blk = swp_pager_getswapspace(object, n)) == SWAPBLK_NONE &&
1564 		    n > 4
1565 		) {
1566 			n >>= 1;
1567 		}
1568 		if (blk == SWAPBLK_NONE) {
1569 			for (j = 0; j < n; ++j)
1570 				rtvals[i+j] = VM_PAGER_FAIL;
1571 			lwkt_reltoken(&vm_token);
1572 			continue;
1573 		}
1574 
1575 		/*
1576 		 * The I/O we are constructing cannot cross a physical
1577 		 * disk boundary in the swap stripe.  Note: we are still
1578 		 * at splvm().
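		 *
		 * Example (illustrative numbers): if dmmax were 32 and we
		 * allocated blk = 30 with n = 8, the XOR test below fires,
		 * j becomes ((30 + 32) & ~31) - 30 = 2, blocks 32..37 are
		 * returned to the bitmap, and the I/O is trimmed to the two
		 * pages that fit before the stripe boundary.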
1579 		 */
1580 		if ((blk ^ (blk + n)) & dmmax_mask) {
1581 			j = ((blk + dmmax) & dmmax_mask) - blk;
1582 			swp_pager_freeswapspace(object, blk + j, n - j);
1583 			n = j;
1584 		}
1585 
1586 		/*
1587 		 * All I/O parameters have been satisfied, build the I/O
1588 		 * request and assign the swap space.
1589 		 */
1590 		if (sync == TRUE)
1591 			bp = getpbuf_kva(&nsw_wcount_sync);
1592 		else
1593 			bp = getpbuf_kva(&nsw_wcount_async);
1594 		bio = &bp->b_bio1;
1595 
1596 		lwkt_reltoken(&vm_token);
1597 
1598 		pmap_qenter((vm_offset_t)bp->b_data, &m[i], n);
1599 
1600 		bp->b_bcount = PAGE_SIZE * n;
1601 		bio->bio_offset = (off_t)blk << PAGE_SHIFT;
1602 
1603 		for (j = 0; j < n; ++j) {
1604 			vm_page_t mreq = m[i+j];
1605 
1606 			swp_pager_meta_build(mreq->object, mreq->pindex,
1607 					     blk + j);
1608 			if (object->type == OBJT_SWAP)
1609 				vm_page_dirty(mreq);
1610 			rtvals[i+j] = VM_PAGER_OK;
1611 
1612 			vm_page_flag_set(mreq, PG_SWAPINPROG);
1613 			bp->b_xio.xio_pages[j] = mreq;
1614 		}
1615 		bp->b_xio.xio_npages = n;
1616 
1617 		mycpu->gd_cnt.v_swapout++;
1618 		mycpu->gd_cnt.v_swappgsout += bp->b_xio.xio_npages;
1619 
1620 		bp->b_dirtyoff = 0;		/* req'd for NFS */
1621 		bp->b_dirtyend = bp->b_bcount;	/* req'd for NFS */
1622 		bp->b_cmd = BUF_CMD_WRITE;
1623 		bio->bio_caller_info1.index = SWBIO_WRITE;
1624 
1625 		/*
1626 		 * asynchronous
1627 		 */
1628 		if (sync == FALSE) {
1629 			bio->bio_done = swp_pager_async_iodone;
1630 			BUF_KERNPROC(bp);
1631 			vn_strategy(swapdev_vp, bio);
1632 
1633 			for (j = 0; j < n; ++j)
1634 				rtvals[i+j] = VM_PAGER_PEND;
1635 			continue;
1636 		}
1637 
1638 		/*
1639 		 * Issue synchronously.
1640 		 *
1641 		 * Wait for the sync I/O to complete, then update rtvals.
1642 		 * We just set the rtvals[] to VM_PAGER_PEND so we can call
1643 		 * our async completion routine at the end, thus avoiding a
1644 		 * double-free.
1645 		 */
1646 		bio->bio_caller_info1.index |= SWBIO_SYNC;
1647 		bio->bio_done = biodone_sync;
1648 		bio->bio_flags |= BIO_SYNC;
1649 		vn_strategy(swapdev_vp, bio);
1650 		biowait(bio, "swwrt");
1651 
1652 		for (j = 0; j < n; ++j)
1653 			rtvals[i+j] = VM_PAGER_PEND;
1654 
1655 		/*
1656 		 * Now that we are through with the bp, we can call the
1657 		 * normal async completion, which frees everything up.
1658 		 */
1659 		swp_pager_async_iodone(bio);
1660 	}
1661 	vm_object_drop(object);
1662 }
1663 
1664 /*
1665  * No requirements.
1666  */
1667 void
1668 swap_pager_newswap(void)
1669 {
1670 	swp_sizecheck();
1671 }
1672 
1673 /*
1674  *	swp_pager_async_iodone:
1675  *
1676  *	Completion routine for asynchronous reads and writes from/to swap.
1677  *	Also called manually by synchronous code to finish up a bp.
1678  *
1679  *	For READ operations, the pages are PG_BUSY'd.  For WRITE operations,
1680  *	the pages are vm_page_t->busy'd.  For READ operations, we PG_BUSY
1681  *	unbusy all pages except the 'main' request page.  For WRITE
1682  *	operations, we vm_page_t->busy'd unbusy all pages ( we can do this
1683  *	because we marked them all VM_PAGER_PEND on return from putpages ).
1684  *
1685  *	This routine may not block.
1686  *
1687  * No requirements.
1688  */
1689 static void
1690 swp_pager_async_iodone(struct bio *bio)
1691 {
1692 	struct buf *bp = bio->bio_buf;
1693 	vm_object_t object = NULL;
1694 	int i;
1695 	int *nswptr;
1696 
1697 	/*
1698 	 * report error
1699 	 */
1700 	if (bp->b_flags & B_ERROR) {
1701 		kprintf(
1702 		    "swap_pager: I/O error - %s failed; offset %lld,"
1703 			" size %ld, error %d\n",
1704 		    ((bio->bio_caller_info1.index & SWBIO_READ) ?
1705 			"pagein" : "pageout"),
1706 		    (long long)bio->bio_offset,
1707 		    (long)bp->b_bcount,
1708 		    bp->b_error
1709 		);
1710 	}
1711 
1712 	/*
1713 	 * set object, raise to splvm().
1714 	 */
1715 	if (bp->b_xio.xio_npages)
1716 		object = bp->b_xio.xio_pages[0]->object;
1717 
1718 	/*
1719 	 * remove the mapping for kernel virtual
1720 	 */
1721 	pmap_qremove((vm_offset_t)bp->b_data, bp->b_xio.xio_npages);
1722 
1723 	/*
1724 	 * cleanup pages.  If an error occurs writing to swap, we are in
1725 	 * very serious trouble.  If it happens to be a disk error, though,
1726 	 * we may be able to recover by reassigning the swap later on.  So
1727 	 * in this case we remove the m->swapblk assignment for the page
1728 	 * but do not free it in the rlist.  The erroneous block(s) are thus
1729 	 * never reallocated as swap.  Redirty the page and continue.
1730 	 */
1731 	for (i = 0; i < bp->b_xio.xio_npages; ++i) {
1732 		vm_page_t m = bp->b_xio.xio_pages[i];
1733 
1734 		if (bp->b_flags & B_ERROR) {
1735 			/*
1736 			 * If an error occurs I'd love to throw the swapblk
1737 			 * away without freeing it back to swapspace, so it
1738 			 * can never be used again.  But I can't from an
1739 			 * interrupt.
1740 			 */
1741 
1742 			if (bio->bio_caller_info1.index & SWBIO_READ) {
1743 				/*
1744 				 * When reading, reqpage needs to stay
1745 				 * locked for the parent, but all other
1746 				 * pages can be freed.  We still want to
1747 				 * wakeup the parent waiting on the page,
1748 				 * though.  ( also: pg_reqpage can be -1 and
1749 				 * not match anything ).
1750 				 *
1751 				 * We have to wake specifically requested pages
1752 				 * up too because we cleared PG_SWAPINPROG and
1753 				 * someone may be waiting for that.
1754 				 *
1755 				 * NOTE: for reads, m->dirty will probably
1756 				 * be overridden by the original caller of
1757 				 * getpages so don't play cute tricks here.
1758 				 *
1759 				 * NOTE: We can't actually free the page from
1760 				 * here, because this is an interrupt.  It
1761 				 * is not legal to mess with object->memq
1762 				 * from an interrupt.  Deactivate the page
1763 				 * instead.
1764 				 */
1765 
1766 				m->valid = 0;
1767 				vm_page_flag_clear(m, PG_ZERO);
1768 				vm_page_flag_clear(m, PG_SWAPINPROG);
1769 
1770 				/*
1771 				 * bio_driver_info holds the requested page
1772 				 * index.
1773 				 */
1774 				if (i != (int)(intptr_t)bio->bio_driver_info) {
1775 					vm_page_deactivate(m);
1776 					vm_page_wakeup(m);
1777 				} else {
1778 					vm_page_flash(m);
1779 				}
1780 				/*
1781 				 * If i == bp->b_pager.pg_reqpage, do not wake
1782 				 * the page up.  The caller needs to.
1783 				 */
1784 			} else {
1785 				/*
1786 				 * If a write error occurs remove the swap
1787 				 * assignment (note that PG_SWAPPED may or
1788 				 * may not be set depending on prior activity).
1789 				 *
1790 				 * Re-dirty OBJT_SWAP pages: there is no
1791 				 * other backing store, so we can't throw
1792 				 * the page away.
1793 				 *
1794 				 * Non-OBJT_SWAP pages (aka swapcache) must
1795 				 * not be dirtied since they may not have
1796 				 * been dirty in the first place, and they
1797 				 * do have backing store (the vnode).
1798 				 */
1799 				vm_page_busy_wait(m, FALSE, "swadpg");
1800 				swp_pager_meta_ctl(m->object, m->pindex,
1801 						   SWM_FREE);
1802 				vm_page_flag_clear(m, PG_SWAPPED);
1803 				if (m->object->type == OBJT_SWAP) {
1804 					vm_page_dirty(m);
1805 					vm_page_activate(m);
1806 				}
1807 				vm_page_flag_clear(m, PG_SWAPINPROG);
1808 				vm_page_io_finish(m);
1809 				vm_page_wakeup(m);
1810 			}
1811 		} else if (bio->bio_caller_info1.index & SWBIO_READ) {
1812 			/*
1813 			 * NOTE: for reads, m->dirty will probably be
1814 			 * overridden by the original caller of getpages so
1815 			 * we cannot set them in order to free the underlying
1816 			 * swap in a low-swap situation.  I don't think we'd
1817 			 * want to do that anyway, but it was an optimization
1818 			 * that existed in the old swapper for a time before
1819 			 * it got ripped out due to precisely this problem.
1820 			 *
1821 			 * clear PG_ZERO in page.
1822 			 *
1823 			 * If not the requested page then deactivate it.
1824 			 *
1825 			 * Note that the requested page, reqpage, is left
1826 			 * busied, but we still have to wake it up.  The
1827 			 * other pages are released (unbusied) by
1828 			 * vm_page_wakeup().  We do not set reqpage's
1829 			 * valid bits here, it is up to the caller.
1830 			 */
1831 
1832 			/*
1833 			 * NOTE: can't call pmap_clear_modify(m) from an
1834 			 * interrupt thread, the pmap code may have to map
1835 			 * non-kernel pmaps and currently asserts the case.
1836 			 */
1837 			/*pmap_clear_modify(m);*/
1838 			m->valid = VM_PAGE_BITS_ALL;
1839 			vm_page_undirty(m);
1840 			vm_page_flag_clear(m, PG_ZERO | PG_SWAPINPROG);
1841 			vm_page_flag_set(m, PG_SWAPPED);
1842 
1843 			/*
1844 			 * We have to wake the specifically requested page
1845 			 * up too because we cleared PG_SWAPINPROG and
1846 			 * getpages could be waiting for it.  However, be
1847 			 * sure not to unbusy the page that getpages
1848 			 * specifically requested - getpages expects it
1849 			 * to be left busy.
1850 			 *
1851 			 * bio_driver_info holds the requested page index.
1852 			 */
1853 			if (i != (int)(intptr_t)bio->bio_driver_info) {
1854 				vm_page_deactivate(m);
1855 				vm_page_wakeup(m);
1856 			} else {
1857 				vm_page_flash(m);
1858 			}
1859 		} else {
1860 			/*
1861 			 * Mark the page clean but do not mess with the
1862 			 * pmap-layer's modified state.  That state should
1863 			 * also be clear since the caller protected the
1864 			 * page with VM_PROT_READ, but allow the case.
1865 			 *
1866 			 * We are in an interrupt, avoid pmap operations.
1867 			 *
1868 			 * If we have a severe page deficit, deactivate the
1869 			 * page.  Do not try to cache it (which would also
1870 			 * involve a pmap op), because the page might still
1871 			 * be read-heavy.
1872 			 *
1873 			 * When using the swap to cache clean vnode pages
1874 			 * we do not mess with the page dirty bits.
1875 			 */
1876 			vm_page_busy_wait(m, FALSE, "swadpg");
1877 			if (m->object->type == OBJT_SWAP)
1878 				vm_page_undirty(m);
1879 			vm_page_flag_clear(m, PG_SWAPINPROG);
1880 			vm_page_flag_set(m, PG_SWAPPED);
1881 			if (vm_page_count_severe())
1882 				vm_page_deactivate(m);
1883 #if 0
1884 			if (!vm_page_count_severe() || !vm_page_try_to_cache(m))
1885 				vm_page_protect(m, VM_PROT_READ);
1886 #endif
1887 			vm_page_io_finish(m);
1888 			vm_page_wakeup(m);
1889 		}
1890 	}
1891 
1892 	/*
1893 	 * adjust pip.  NOTE: the original parent may still have its own
1894 	 * pip refs on the object.
1895 	 */
1896 
1897 	if (object)
1898 		vm_object_pip_wakeup_n(object, bp->b_xio.xio_npages);
1899 
1900 	/*
1901 	 * Release the physical I/O buffer.
1902 	 *
1903 	 * NOTE: Due to synchronous operations in the write case b_cmd may
1904 	 *	 already be set to BUF_CMD_DONE and BIO_SYNC may have already
1905 	 *	 been cleared.
1906 	 *
1907 	 * Use vm_token to interlock nsw_rcount/wcount wakeup?
1908 	 */
1909 	lwkt_gettoken(&vm_token);
1910 	if (bio->bio_caller_info1.index & SWBIO_READ)
1911 		nswptr = &nsw_rcount;
1912 	else if (bio->bio_caller_info1.index & SWBIO_SYNC)
1913 		nswptr = &nsw_wcount_sync;
1914 	else
1915 		nswptr = &nsw_wcount_async;
1916 	bp->b_cmd = BUF_CMD_DONE;
1917 	relpbuf(bp, nswptr);
1918 	lwkt_reltoken(&vm_token);
1919 }
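
/*
 * Illustrative sketch (not compiled): roughly how an async swap read is
 * dispatched with swp_pager_async_iodone() installed as the completion
 * handler.  The buf/bio field names follow the conventions visible in the
 * handler above; the helper name is a placeholder and the exact sequence
 * (pip accounting, PG_SWAPINPROG marking, etc.) is omitted, so treat this
 * as an approximation rather than the canonical pager path.
 */
#if 0
static void
example_start_async_read(vm_page_t *marray, int npages,
			 int reqindex, swblk_t blk)
{
	struct buf *bp;
	struct bio *bio;
	int i;

	bp = getpbuf(&nsw_rcount);		/* physical I/O buffer */
	bio = &bp->b_bio1;

	/* map the pages into the pbuf's KVA and record them in the xio */
	pmap_qenter((vm_offset_t)bp->b_data, marray, npages);
	for (i = 0; i < npages; ++i)
		bp->b_xio.xio_pages[i] = marray[i];
	bp->b_xio.xio_npages = npages;

	bp->b_cmd = BUF_CMD_READ;
	bp->b_bcount = PAGE_SIZE * npages;
	bp->b_bufsize = PAGE_SIZE * npages;

	/* completion context consumed by swp_pager_async_iodone() */
	bio->bio_done = swp_pager_async_iodone;
	bio->bio_offset = (off_t)blk << PAGE_SHIFT;
	bio->bio_driver_info = (void *)(intptr_t)reqindex;
	bio->bio_caller_info1.index = SWBIO_READ;

	vn_strategy(swapdev_vp, bio);
}
#endif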
1920 
1921 /*
1922  * Fault-in a potentially swapped page and remove the swap reference.
1923  * (used by swapoff code)
1924  *
1925  * object must be held.
1926  */
1927 static __inline void
1928 swp_pager_fault_page(vm_object_t object, vm_pindex_t pindex)
1929 {
1930 	struct vnode *vp;
1931 	vm_page_t m;
1932 	int error;
1933 
1934 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1935 
1936 	if (object->type == OBJT_VNODE) {
1937 		/*
1938 		 * Any swap related to a vnode is due to swapcache.  We must
1939 		 * vget() the vnode in case it is not active (otherwise
1940 		 * vref() will panic).  Calling vm_object_page_remove() will
1941 		 * ensure that any swap ref is removed interlocked with the
1942 		 * page.  clean_only is set to TRUE so we don't throw away
1943 		 * dirty pages.
1944 		 */
1945 		vp = object->handle;
1946 		error = vget(vp, LK_SHARED | LK_RETRY | LK_CANRECURSE);
1947 		if (error == 0) {
1948 			vm_object_page_remove(object, pindex, pindex + 1, TRUE);
1949 			vput(vp);
1950 		}
1951 	} else {
1952 		/*
1953 		 * Otherwise it is a normal OBJT_SWAP object and we can
1954 		 * fault the page in and remove the swap.
1955 		 */
1956 		m = vm_fault_object_page(object, IDX_TO_OFF(pindex),
1957 					 VM_PROT_NONE,
1958 					 VM_FAULT_DIRTY | VM_FAULT_UNSWAP,
1959 					 0, &error);
1960 		if (m)
1961 			vm_page_unhold(m);
1962 	}
1963 }
1964 
1965 /*
1966  * This removes all swap blocks related to a particular device.  We have
1967  * to be careful of ripups during the scan.
1968  */
1969 static int swp_pager_swapoff_callback(struct swblock *swap, void *data);
1970 
1971 int
1972 swap_pager_swapoff(int devidx)
1973 {
1974 	struct vm_object marker;
1975 	vm_object_t object;
1976 	struct swswapoffinfo info;
1977 
1978 	bzero(&marker, sizeof(marker));
1979 	marker.type = OBJT_MARKER;
1980 
1981 	lwkt_gettoken(&vmobj_token);
1982 	TAILQ_INSERT_HEAD(&vm_object_list, &marker, object_list);
1983 
1984 	while ((object = TAILQ_NEXT(&marker, object_list)) != NULL) {
1985 		if (object->type == OBJT_MARKER)
1986 			goto skip;
1987 		if (object->type != OBJT_SWAP && object->type != OBJT_VNODE)
1988 			goto skip;
1989 		vm_object_hold(object);
1990 		if (object->type != OBJT_SWAP && object->type != OBJT_VNODE) {
1991 			vm_object_drop(object);
1992 			goto skip;
1993 		}
1994 		info.object = object;
1995 		info.devidx = devidx;
1996 		swblock_rb_tree_RB_SCAN(&object->swblock_root,
1997 					NULL,
1998 					swp_pager_swapoff_callback,
1999 					&info);
2000 		vm_object_drop(object);
2001 skip:
2002 		if (object == TAILQ_NEXT(&marker, object_list)) {
2003 			TAILQ_REMOVE(&vm_object_list, &marker, object_list);
2004 			TAILQ_INSERT_AFTER(&vm_object_list, object,
2005 					   &marker, object_list);
2006 		}
2007 	}
2008 	TAILQ_REMOVE(&vm_object_list, &marker, object_list);
2009 	lwkt_reltoken(&vmobj_token);
2010 
2011 	/*
2012 	 * If we fail to locate all swblocks we just fail gracefully and
2013 	 * do not bother to restore paging on the swap device.  The user
2014 	 * can simply retry the operation.
2015 	 */
2016 	if (swdevt[devidx].sw_nused)
2017 		return (1);
2018 	else
2019 		return (0);
2020 }
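
/*
 * Illustrative sketch (not compiled): the marker-object technique used by
 * swap_pager_swapoff() above, reduced to a generic TAILQ walk.  A dummy
 * node is inserted into the list and advanced past each element after it
 * is processed, so the scan position survives any blocking or list ripouts
 * that occur while the element is being worked on.  All names here are
 * placeholders, not kernel interfaces.
 */
#if 0
struct examplenode {
	TAILQ_ENTRY(examplenode) entry;
	int	is_marker;		/* distinguishes the dummy node */
};
TAILQ_HEAD(examplelist, examplenode);

static void
example_marker_scan(struct examplelist *list,
		    void (*process)(struct examplenode *))
{
	struct examplenode marker;
	struct examplenode *node;

	bzero(&marker, sizeof(marker));
	marker.is_marker = 1;
	TAILQ_INSERT_HEAD(list, &marker, entry);

	while ((node = TAILQ_NEXT(&marker, entry)) != NULL) {
		if (node->is_marker == 0)
			process(node);	/* may block or rip up the list */
		/*
		 * Re-anchor the marker after the node we just looked at,
		 * but only if that node is still adjacent to the marker.
		 */
		if (node == TAILQ_NEXT(&marker, entry)) {
			TAILQ_REMOVE(list, &marker, entry);
			TAILQ_INSERT_AFTER(list, node, &marker, entry);
		}
	}
	TAILQ_REMOVE(list, &marker, entry);
}
#endif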
2021 
2022 static
2023 int
2024 swp_pager_swapoff_callback(struct swblock *swap, void *data)
2025 {
2026 	struct swswapoffinfo *info = data;
2027 	vm_object_t object = info->object;
2028 	vm_pindex_t index;
2029 	swblk_t v;
2030 	int i;
2031 
2032 	index = swap->swb_index;
2033 	for (i = 0; i < SWAP_META_PAGES; ++i) {
2034 		/*
2035 		 * Make sure we don't race a dying object.  This will
2036 		 * kill the scan of the object's swap blocks entirely.
2037 		 */
2038 		if (object->flags & OBJ_DEAD)
2039 			return(-1);
2040 
2041 		/*
2042 		 * Fault the page, which can obviously block.  If the swap
2043 		 * structure disappears break out.
2044 		 */
2045 		v = swap->swb_pages[i];
2046 		if (v != SWAPBLK_NONE && BLK2DEVIDX(v) == info->devidx) {
2047 			swp_pager_fault_page(object, swap->swb_index + i);
2048 			/* swap ptr might go away */
2049 			if (RB_LOOKUP(swblock_rb_tree,
2050 				      &object->swblock_root, index) != swap) {
2051 				break;
2052 			}
2053 		}
2054 	}
2055 	return(0);
2056 }
2057 
2058 /************************************************************************
2059  *				SWAP META DATA 				*
2060  ************************************************************************
2061  *
2062  *	These routines manipulate the swap metadata stored in the
2063  *	OBJT_SWAP object.  All swp_*() routines require the caller to
2064  *	hold the vm_object's token, since swap can be freed up by the
2065  *	low level vm_page code at almost any time.
2066  *
2067  *	Swap metadata is kept in per-object swblock structures organized
2068  *	in a red-black tree rooted in the object (swblock_root), along
2069  *	with a swblock_count tracking counter.
2070  */
2071 
2072 /*
2073  * Lookup the swblock containing the specified swap block index.
2074  *
2075  * The caller must hold the object.
2076  */
2077 static __inline
2078 struct swblock *
2079 swp_pager_lookup(vm_object_t object, vm_pindex_t index)
2080 {
2081 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
2082 	index &= ~(vm_pindex_t)SWAP_META_MASK;
2083 	return (RB_LOOKUP(swblock_rb_tree, &object->swblock_root, index));
2084 }
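
/*
 * Illustrative sketch (not compiled): how a page index decomposes into
 * the key used by swp_pager_lookup() (stored in swb_index) and the slot
 * within that swblock's swb_pages[] array.  The helper name is a
 * placeholder.
 */
#if 0
static void
example_decompose_index(vm_pindex_t index, vm_pindex_t *basep, int *slotp)
{
	/* key stored in swb_index and used for the RB tree lookup */
	*basep = index & ~(vm_pindex_t)SWAP_META_MASK;
	/* slot within the swblock, 0 .. SWAP_META_PAGES - 1 */
	*slotp = (int)(index & SWAP_META_MASK);
}
#endif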
2085 
2086 /*
2087  * Remove a swblock from the RB tree.
2088  *
2089  * The caller must hold the object.
2090  */
2091 static __inline
2092 void
2093 swp_pager_remove(vm_object_t object, struct swblock *swap)
2094 {
2095 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
2096 	RB_REMOVE(swblock_rb_tree, &object->swblock_root, swap);
2097 }
2098 
2099 /*
2100  * Convert default object to swap object if necessary
2101  *
2102  * The caller must hold the object.
2103  */
2104 static void
2105 swp_pager_meta_convert(vm_object_t object)
2106 {
2107 	if (object->type == OBJT_DEFAULT) {
2108 		object->type = OBJT_SWAP;
2109 		KKASSERT(object->swblock_count == 0);
2110 	}
2111 }
2112 
2113 /*
2114  * SWP_PAGER_META_BUILD() -	add swap block to swap meta data for object
2115  *
2116  *	We first convert the object to a swap object if it is a default
2117  *	object.  Vnode objects do not need to be converted.
2118  *
2119  *	The specified swapblk is added to the object's swap metadata.  If
2120  *	the swapblk is not valid, it is freed instead.  Any previously
2121  *	assigned swapblk is freed.
2122  *
2123  * The caller must hold the object.
2124  */
2125 static void
2126 swp_pager_meta_build(vm_object_t object, vm_pindex_t index, swblk_t swapblk)
2127 {
2128 	struct swblock *swap;
2129 	struct swblock *oswap;
2130 	vm_pindex_t v;
2131 
2132 	KKASSERT(swapblk != SWAPBLK_NONE);
2133 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
2134 
2135 	/*
2136 	 * Convert object if necessary
2137 	 */
2138 	if (object->type == OBJT_DEFAULT)
2139 		swp_pager_meta_convert(object);
2140 
2141 	/*
2142 	 * Locate the swblock, creating it if it does not yet exist.  If
2143 	 * the zone allocation fails we wait for memory and, since the
2144 	 * tree may have changed while we slept, retry the lookup.
2145 	 */
2146 retry:
2147 	swap = swp_pager_lookup(object, index);
2148 
2149 	if (swap == NULL) {
2150 		int i;
2151 
2152 		swap = zalloc(swap_zone);
2153 		if (swap == NULL) {
2154 			vm_wait(0);
2155 			goto retry;
2156 		}
2157 		swap->swb_index = index & ~(vm_pindex_t)SWAP_META_MASK;
2158 		swap->swb_count = 0;
2159 
2160 		++object->swblock_count;
2161 
2162 		for (i = 0; i < SWAP_META_PAGES; ++i)
2163 			swap->swb_pages[i] = SWAPBLK_NONE;
2164 		oswap = RB_INSERT(swblock_rb_tree, &object->swblock_root, swap);
2165 		KKASSERT(oswap == NULL);
2166 	}
2167 
2168 	/*
2169 	 * Delete prior contents of metadata.
2170 	 *
2171 	 * NOTE: Decrement swb_count after the freeing operation (which
2172 	 *	 might block) to prevent racing destruction of the swblock.
2173 	 */
2174 	index &= SWAP_META_MASK;
2175 
2176 	while ((v = swap->swb_pages[index]) != SWAPBLK_NONE) {
2177 		swap->swb_pages[index] = SWAPBLK_NONE;
2178 		/* can block */
2179 		swp_pager_freeswapspace(object, v, 1);
2180 		--swap->swb_count;
2181 	}
2182 
2183 	/*
2184 	 * Enter block into metadata
2185 	 */
2186 	swap->swb_pages[index] = swapblk;
2187 	if (swapblk != SWAPBLK_NONE)
2188 		++swap->swb_count;
2189 }
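
/*
 * Illustrative sketch (not compiled): moving a swap assignment from one
 * page index to another with swp_pager_meta_build() and
 * swp_pager_meta_ctl() (defined later in this file).  SWM_POP removes the
 * metadata entry without freeing the underlying block, so the block can
 * be re-recorded at the new index.  The helper name is a placeholder.
 */
#if 0
static void
example_move_swap_assignment(vm_object_t object,
			     vm_pindex_t from, vm_pindex_t to)
{
	swblk_t blk;

	/* the object token must be held across both calls */
	blk = swp_pager_meta_ctl(object, from, SWM_POP);
	if (blk != SWAPBLK_NONE)
		swp_pager_meta_build(object, to, blk);
}
#endif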
2190 
2191 /*
2192  * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
2193  *
2194  *	The requested range of blocks is freed, with any associated swap
2195  *	returned to the swap bitmap.
2196  *
2197  *	This routine will free swap metadata structures as they are cleaned
2198  *	out.  This routine does *NOT* operate on swap metadata associated
2199  *	with resident pages.
2200  *
2201  * The caller must hold the object.
2202  */
2203 static int swp_pager_meta_free_callback(struct swblock *swb, void *data);
2204 
2205 static void
2206 swp_pager_meta_free(vm_object_t object, vm_pindex_t index, vm_pindex_t count)
2207 {
2208 	struct swfreeinfo info;
2209 
2210 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
2211 
2212 	/*
2213 	 * Nothing to do
2214 	 */
2215 	if (object->swblock_count == 0) {
2216 		KKASSERT(RB_EMPTY(&object->swblock_root));
2217 		return;
2218 	}
2219 	if (count == 0)
2220 		return;
2221 
2222 	/*
2223 	 * Setup for RB tree scan.  Note that the pindex range can be huge
2224 	 * due to the 64 bit page index space so we cannot safely iterate.
2225 	 */
2226 	info.object = object;
2227 	info.basei = index & ~(vm_pindex_t)SWAP_META_MASK;
2228 	info.begi = index;
2229 	info.endi = index + count - 1;
2230 	swblock_rb_tree_RB_SCAN(&object->swblock_root, rb_swblock_scancmp,
2231 				swp_pager_meta_free_callback, &info);
2232 }
2233 
2234 /*
2235  * The caller must hold the object.
2236  */
2237 static
2238 int
2239 swp_pager_meta_free_callback(struct swblock *swap, void *data)
2240 {
2241 	struct swfreeinfo *info = data;
2242 	vm_object_t object = info->object;
2243 	int index;
2244 	int eindex;
2245 
2246 	/*
2247 	 * Figure out the range within the swblock.  The wider scan may
2248 	 * return edge-case swap blocks when the start and/or end points
2249 	 * are in the middle of a block.
2250 	 */
2251 	if (swap->swb_index < info->begi)
2252 		index = (int)info->begi & SWAP_META_MASK;
2253 	else
2254 		index = 0;
2255 
2256 	if (swap->swb_index + SWAP_META_PAGES > info->endi)
2257 		eindex = (int)info->endi & SWAP_META_MASK;
2258 	else
2259 		eindex = SWAP_META_MASK;
2260 
2261 	/*
2262 	 * Scan and free the blocks.  The loop terminates early
2263 	 * when (swap) runs out of assigned blocks and can itself be freed.
2264 	 *
2265 	 * NOTE: Decrement swb_count after swp_pager_freeswapspace()
2266 	 *	 to deal with a zfree race.
2267 	 */
2268 	while (index <= eindex) {
2269 		swblk_t v = swap->swb_pages[index];
2270 
2271 		if (v != SWAPBLK_NONE) {
2272 			swap->swb_pages[index] = SWAPBLK_NONE;
2273 			/* can block */
2274 			swp_pager_freeswapspace(object, v, 1);
2275 			if (--swap->swb_count == 0) {
2276 				swp_pager_remove(object, swap);
2277 				zfree(swap_zone, swap);
2278 				--object->swblock_count;
2279 				break;
2280 			}
2281 		}
2282 		++index;
2283 	}
2284 
2285 	/* swap may be invalid here due to zfree above */
2286 	lwkt_yield();
2287 
2288 	return(0);
2289 }
2290 
2291 /*
2292  * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object
2293  *
2294  *	This routine locates and destroys all swap metadata associated with
2295  *	an object.
2296  *
2297  * NOTE: Decrement swb_count after the freeing operation (which
2298  *	 might block) to prevent racing destruction of the swblock.
2299  *
2300  * The caller must hold the object.
2301  */
2302 static void
2303 swp_pager_meta_free_all(vm_object_t object)
2304 {
2305 	struct swblock *swap;
2306 	int i;
2307 
2308 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
2309 
2310 	while ((swap = RB_ROOT(&object->swblock_root)) != NULL) {
2311 		swp_pager_remove(object, swap);
2312 		for (i = 0; i < SWAP_META_PAGES; ++i) {
2313 			swblk_t v = swap->swb_pages[i];
2314 			if (v != SWAPBLK_NONE) {
2315 				/* can block */
2316 				swp_pager_freeswapspace(object, v, 1);
2317 				--swap->swb_count;
2318 			}
2319 		}
2320 		if (swap->swb_count != 0)
2321 			panic("swap_pager_meta_free_all: swb_count != 0");
2322 		zfree(swap_zone, swap);
2323 		--object->swblock_count;
2324 		lwkt_yield();
2325 	}
2326 	KKASSERT(object->swblock_count == 0);
2327 }
2328 
2329 /*
2330  * SWP_PAGER_METACTL() -  misc control of swap and vm_page_t meta data.
2331  *
2332  *	This routine is capable of looking up, popping, or freeing
2333  *	swapblk assignments in the swap meta data or in the vm_page_t.
2334  *	The routine typically returns the swapblk being looked up or
2335  *	popped, or SWAPBLK_NONE if the block was freed or was never
2336  *	assigned.  This routine will automatically free any invalid
2337  *	meta-data swapblks.
2338  *
2339  *	It is not possible to store invalid swapblks in the swap meta data
2340  *	(other than a literal 'SWAPBLK_NONE'), so we don't bother checking.
2341  *
2342  *	When acting on a busy resident page and paging is in progress, we
2343  *	have to wait until paging is complete but otherwise can act on the
2344  *	busy page.
2345  *
2346  *	SWM_FREE	remove and free swap block from metadata
2347  *	SWM_POP		remove from meta data but do not free.. pop it out
2348  *
2349  * The caller must hold the object.
2350  */
2351 static swblk_t
2352 swp_pager_meta_ctl(vm_object_t object, vm_pindex_t index, int flags)
2353 {
2354 	struct swblock *swap;
2355 	swblk_t r1;
2356 
2357 	if (object->swblock_count == 0)
2358 		return(SWAPBLK_NONE);
2359 
2360 	r1 = SWAPBLK_NONE;
2361 	swap = swp_pager_lookup(object, index);
2362 
2363 	if (swap != NULL) {
2364 		index &= SWAP_META_MASK;
2365 		r1 = swap->swb_pages[index];
2366 
2367 		if (r1 != SWAPBLK_NONE) {
2368 			if (flags & (SWM_FREE|SWM_POP)) {
2369 				swap->swb_pages[index] = SWAPBLK_NONE;
2370 				if (--swap->swb_count == 0) {
2371 					swp_pager_remove(object, swap);
2372 					zfree(swap_zone, swap);
2373 					--object->swblock_count;
2374 				}
2375 			}
2376 			/* swap ptr may be invalid */
2377 			if (flags & SWM_FREE) {
2378 				swp_pager_freeswapspace(object, r1, 1);
2379 				r1 = SWAPBLK_NONE;
2380 			}
2381 		}
2382 		/* swap ptr may be invalid */
2383 	}
2384 	return(r1);
2385 }
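
/*
 * Illustrative sketch (not compiled): typical uses of swp_pager_meta_ctl().
 * A plain lookup (flags 0) only reports whether swap is assigned, while
 * SWM_FREE removes the assignment and returns the block to the swap
 * bitmap, as done in the write-error path of swp_pager_async_iodone()
 * above.  The helper names are placeholders.
 */
#if 0
static int
example_page_has_swap(vm_object_t object, vm_pindex_t pindex)
{
	/* the object token must be held, as for all swp_*() routines */
	return (swp_pager_meta_ctl(object, pindex, 0) != SWAPBLK_NONE);
}

static void
example_discard_swap(vm_object_t object, vm_pindex_t pindex)
{
	/* drop any swap backing for the page; the block is freed */
	swp_pager_meta_ctl(object, pindex, SWM_FREE);
}
#endif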
2386