xref: /dragonfly/sys/vm/swap_pager.c (revision 36a3d1d6)
1 /*
2  * (MPSAFE)
3  *
4  * Copyright (c) 1998-2010 The DragonFly Project.  All rights reserved.
5  *
6  * This code is derived from software contributed to The DragonFly Project
7  * by Matthew Dillon <dillon@backplane.com>
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  * Copyright (c) 1994 John S. Dyson
37  * Copyright (c) 1990 University of Utah.
38  * Copyright (c) 1991, 1993
39  *	The Regents of the University of California.  All rights reserved.
40  *
41  * This code is derived from software contributed to Berkeley by
42  * the Systems Programming Group of the University of Utah Computer
43  * Science Department.
44  *
45  * Redistribution and use in source and binary forms, with or without
46  * modification, are permitted provided that the following conditions
47  * are met:
48  * 1. Redistributions of source code must retain the above copyright
49  *    notice, this list of conditions and the following disclaimer.
50  * 2. Redistributions in binary form must reproduce the above copyright
51  *    notice, this list of conditions and the following disclaimer in the
52  *    documentation and/or other materials provided with the distribution.
53  * 3. All advertising materials mentioning features or use of this software
54  *    must display the following acknowledgement:
55  *	This product includes software developed by the University of
56  *	California, Berkeley and its contributors.
57  * 4. Neither the name of the University nor the names of its contributors
58  *    may be used to endorse or promote products derived from this software
59  *    without specific prior written permission.
60  *
61  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
62  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
63  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
64  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
65  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
66  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
67  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
68  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
69  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
70  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
71  * SUCH DAMAGE.
72  *
73  *				New Swap System
74  *				Matthew Dillon
75  *
76  * Radix Bitmap 'blists'.
77  *
78  *	- The new swapper uses the new radix bitmap code.  This should scale
79  *	  to arbitrarily small or arbitrarily large swap spaces and an almost
80  *	  arbitrary degree of fragmentation.
81  *
82  * Features:
83  *
84  *	- on the fly reallocation of swap during putpages.  The new system
85  *	  does not try to keep previously allocated swap blocks for dirty
86  *	  pages.
87  *
88  *	- on the fly deallocation of swap
89  *
90  *	- No more garbage collection required.  Unnecessarily allocated swap
91  *	  blocks only exist for dirty vm_page_t's now and these are already
92  *	  cycled (in a high-load system) by the pager.  We also do on-the-fly
93  *	  removal of invalidated swap blocks when a page is destroyed
94  *	  or renamed.
95  *
96  * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
97  * @(#)swap_pager.c	8.9 (Berkeley) 3/21/94
98  * $FreeBSD: src/sys/vm/swap_pager.c,v 1.130.2.12 2002/08/31 21:15:55 dillon Exp $
99  */
100 
101 #include <sys/param.h>
102 #include <sys/systm.h>
103 #include <sys/conf.h>
104 #include <sys/kernel.h>
105 #include <sys/proc.h>
106 #include <sys/buf.h>
107 #include <sys/vnode.h>
108 #include <sys/malloc.h>
109 #include <sys/vmmeter.h>
110 #include <sys/sysctl.h>
111 #include <sys/blist.h>
112 #include <sys/lock.h>
113 #include <sys/thread2.h>
114 
115 #ifndef MAX_PAGEOUT_CLUSTER
116 #define MAX_PAGEOUT_CLUSTER 16
117 #endif
118 
119 #define SWB_NPAGES	MAX_PAGEOUT_CLUSTER
120 
121 #include "opt_swap.h"
122 #include <vm/vm.h>
123 #include <vm/vm_object.h>
124 #include <vm/vm_page.h>
125 #include <vm/vm_pager.h>
126 #include <vm/vm_pageout.h>
127 #include <vm/swap_pager.h>
128 #include <vm/vm_extern.h>
129 #include <vm/vm_zone.h>
130 #include <vm/vnode_pager.h>
131 
132 #include <sys/buf2.h>
133 #include <vm/vm_page2.h>
134 
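/*
 * Control flags for swp_pager_meta_ctl().  SWM_FREE releases the swap
 * block back to the bitmap, SWM_POP removes the assignment from the
 * meta-data and returns the block to the caller without freeing it.
 */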
135 #define SWM_FREE	0x02	/* free, period			*/
136 #define SWM_POP		0x04	/* pop out			*/
137 
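/*
 * SWBIO_* flags are carried in bio_caller_info1.index so the completion
 * routine can tell whether it is finishing a read or a write and whether
 * the operation was issued synchronously.
 */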
138 #define SWBIO_READ	0x01
139 #define SWBIO_WRITE	0x02
140 #define SWBIO_SYNC	0x04
141 
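/*
 * Argument block for the ranged RB-tree scans below.  basei/endi bound
 * the page-index range being scanned; in the conditional-free path
 * begi and endi double as "pages left to free" and "swblocks left to
 * scan" budgets (see swap_pager_condfree()).
 */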
142 struct swfreeinfo {
143 	vm_object_t	object;
144 	vm_pindex_t	basei;
145 	vm_pindex_t	begi;
146 	vm_pindex_t	endi;	/* inclusive */
147 };
148 
149 /*
150  * vm_swap_size is in page-sized chunks now.  It was DEV_BSIZE'd chunks
151  * in the old system.
152  */
153 
154 int swap_pager_full;		/* swap space exhaustion (task killing) */
155 int vm_swap_cache_use;
156 int vm_swap_anon_use;
157 
158 static int swap_pager_almost_full; /* swap space exhaustion (w/ hysteresis)*/
159 static int nsw_rcount;		/* free read buffers			*/
160 static int nsw_wcount_sync;	/* limit write buffers / synchronous	*/
161 static int nsw_wcount_async;	/* limit write buffers / asynchronous	*/
162 static int nsw_wcount_async_max;/* assigned maximum			*/
163 static int nsw_cluster_max;	/* maximum VOP I/O allowed		*/
164 
165 struct blist *swapblist;
166 static int swap_async_max = 4;	/* maximum in-progress async I/O's	*/
167 static int swap_burst_read = 0;	/* allow burst reading */
168 
169 extern struct vnode *swapdev_vp;	/* from vm_swap.c */
170 
171 SYSCTL_INT(_vm, OID_AUTO, swap_async_max,
172         CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops");
173 SYSCTL_INT(_vm, OID_AUTO, swap_burst_read,
174         CTLFLAG_RW, &swap_burst_read, 0, "Allow burst reads for pageins");
175 
176 SYSCTL_INT(_vm, OID_AUTO, swap_cache_use,
177         CTLFLAG_RD, &vm_swap_cache_use, 0, "");
178 SYSCTL_INT(_vm, OID_AUTO, swap_anon_use,
179         CTLFLAG_RD, &vm_swap_anon_use, 0, "");
180 SYSCTL_INT(_vm, OID_AUTO, swap_size,
181         CTLFLAG_RD, &vm_swap_size, 0, "");
182 
183 vm_zone_t		swap_zone;
184 
185 /*
186  * Red-Black tree for swblock entries
187  *
188  * The caller must hold vm_token
189  */
190 RB_GENERATE2(swblock_rb_tree, swblock, swb_entry, rb_swblock_compare,
191 	     vm_pindex_t, swb_index);
192 
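/*
 * RB tree comparison function: order swblock meta-nodes by their base
 * page index (swb_index).
 */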
193 int
194 rb_swblock_compare(struct swblock *swb1, struct swblock *swb2)
195 {
196 	if (swb1->swb_index < swb2->swb_index)
197 		return(-1);
198 	if (swb1->swb_index > swb2->swb_index)
199 		return(1);
200 	return(0);
201 }
202 
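/*
 * Ranged scan comparison: a node matches if its base index falls within
 * [basei, endi] of the swfreeinfo passed to the RB_SCAN.
 */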
203 static
204 int
205 rb_swblock_scancmp(struct swblock *swb, void *data)
206 {
207 	struct swfreeinfo *info = data;
208 
209 	if (swb->swb_index < info->basei)
210 		return(-1);
211 	if (swb->swb_index > info->endi)
212 		return(1);
213 	return(0);
214 }
215 
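/*
 * Conditional scan comparison: only bounds the scan from below (nodes
 * below basei are skipped); the callback terminates the scan itself.
 */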
216 static
217 int
218 rb_swblock_condcmp(struct swblock *swb, void *data)
219 {
220 	struct swfreeinfo *info = data;
221 
222 	if (swb->swb_index < info->basei)
223 		return(-1);
224 	return(0);
225 }
226 
227 /*
228  * pagerops for OBJT_SWAP - "swap pager".  Some ops are also global procedure
229  * calls hooked from other parts of the VM system and do not appear here.
230  * (see vm/swap_pager.h).
231  */
232 
233 static void	swap_pager_dealloc (vm_object_t object);
234 static int	swap_pager_getpage (vm_object_t, vm_page_t *, int);
235 static void	swap_chain_iodone(struct bio *biox);
236 
237 struct pagerops swappagerops = {
238 	swap_pager_dealloc,	/* deallocate an OBJT_SWAP object	*/
239 	swap_pager_getpage,	/* pagein				*/
240 	swap_pager_putpages,	/* pageout				*/
241 	swap_pager_haspage	/* get backing store status for page	*/
242 };
243 
244 /*
245  * dmmax is in page-sized chunks with the new swap system.  It was
246  * dev-bsized chunks in the old.  dmmax is always a power of 2.
247  *
248  * swap_*() routines are externally accessible.  swp_*() routines are
249  * internal.
250  */
251 
252 int dmmax;
253 static int dmmax_mask;
254 int nswap_lowat = 128;		/* in pages, swap_pager_almost_full warn */
255 int nswap_hiwat = 512;		/* in pages, swap_pager_almost_full warn */
256 
257 static __inline void	swp_sizecheck (void);
258 static void	swp_pager_async_iodone (struct bio *bio);
259 
260 /*
261  * Swap bitmap functions
262  */
263 
264 static __inline void	swp_pager_freeswapspace(vm_object_t object,
265 						swblk_t blk, int npages);
266 static __inline swblk_t	swp_pager_getswapspace(vm_object_t object, int npages);
267 
268 /*
269  * Metadata functions
270  */
271 
272 static void swp_pager_meta_convert(vm_object_t);
273 static void swp_pager_meta_build(vm_object_t, vm_pindex_t, swblk_t);
274 static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t);
275 static void swp_pager_meta_free_all(vm_object_t);
276 static swblk_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int);
277 
278 /*
279  * SWP_SIZECHECK() -	update swap_pager_full indication
280  *
281  *	update the swap_pager_almost_full indication and warn when we are
282  *	about to run out of swap space, using lowat/hiwat hysteresis.
283  *
284  *	Clear swap_pager_full ( task killing ) indication when lowat is met.
285  *
286  * No restrictions on call
287  * This routine may not block.
288  * SMP races are ok.
289  */
290 static __inline void
291 swp_sizecheck(void)
292 {
293 	if (vm_swap_size < nswap_lowat) {
294 		if (swap_pager_almost_full == 0) {
295 			kprintf("swap_pager: out of swap space\n");
296 			swap_pager_almost_full = 1;
297 		}
298 	} else {
299 		swap_pager_full = 0;
300 		if (vm_swap_size > nswap_hiwat)
301 			swap_pager_almost_full = 0;
302 	}
303 }
304 
305 /*
306  * SWAP_PAGER_INIT() -	initialize the swap pager!
307  *
308  *	Expected to be started from system init.  NOTE:  This code is run
309  *	before much else so be careful what you depend on.  Most of the VM
310  *	system has yet to be initialized at this point.
311  *
312  * Called from the low level boot code only.
313  */
314 static void
315 swap_pager_init(void *arg __unused)
316 {
317 	/*
318 	 * Device Stripe, in PAGE_SIZE'd blocks
319 	 */
320 	dmmax = SWB_NPAGES * 2;
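	/* dmmax is a power of 2, so ~(dmmax - 1) masks off the in-stripe bits */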
321 	dmmax_mask = ~(dmmax - 1);
322 }
323 SYSINIT(vm_mem, SI_BOOT1_VM, SI_ORDER_THIRD, swap_pager_init, NULL)
324 
325 /*
326  * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process
327  *
328  *	Expected to be started from pageout process once, prior to entering
329  *	its main loop.
330  *
331  * Called from the low level boot code only.
332  */
333 void
334 swap_pager_swap_init(void)
335 {
336 	int n, n2;
337 
338 	/*
339 	 * Number of in-transit swap bp operations.  Don't
340 	 * exhaust the pbufs completely.  Make sure we
341 	 * initialize workable values (0 will work for hysteresis
342 	 * but it isn't very efficient).
343 	 *
344 	 * The nsw_cluster_max is constrained by the number of pages an XIO
345 	 * holds, i.e., (MAXPHYS/PAGE_SIZE) and our locally defined
346 	 * MAX_PAGEOUT_CLUSTER.   Also be aware that swap ops are
347 	 * constrained by the swap device interleave stripe size.
348 	 *
349 	 * Currently we hardwire nsw_wcount_async to 4.  This limit is
350 	 * designed to prevent other I/O from having high latencies due to
351 	 * our pageout I/O.  The value 4 works well for one or two active swap
352 	 * devices but is probably a little low if you have more.  Even so,
353 	 * a higher value would probably generate only a limited improvement
354 	 * with three or four active swap devices since the system does not
355 	 * typically have to pageout at extreme bandwidths.   We will want
356 	 * at least 2 per swap devices, and 4 is a pretty good value if you
357  *	at least 2 per swap device, and 4 is a pretty good value if you
358 	 * So it all works out pretty well.
359 	 */
360 
361 	nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER);
362 
363 	nsw_rcount = (nswbuf + 1) / 2;
364 	nsw_wcount_sync = (nswbuf + 3) / 4;
365 	nsw_wcount_async = 4;
366 	nsw_wcount_async_max = nsw_wcount_async;
367 
368 	/*
369 	 * The zone is dynamically allocated so generally size it to
370 	 * maxswzone (32MB to 512MB of KVM).  Set a minimum size that can
371 	 * describe about 8x physical memory (each swblock holds 16 pages).
372 	 *
373 	 * With the advent of SSDs (vs HDs) the practical (swap:memory) ratio
374 	 * has increased dramatically.
375 	 */
376 	n = vmstats.v_page_count / 2;
377 	if (maxswzone && n < maxswzone / sizeof(struct swblock))
378 		n = maxswzone / sizeof(struct swblock);
379 	n2 = n;
380 
381 	do {
382 		swap_zone = zinit(
383 			"SWAPMETA",
384 			sizeof(struct swblock),
385 			n,
386 			ZONE_INTERRUPT,
387 			1);
388 		if (swap_zone != NULL)
389 			break;
390 		/*
391 		 * if the allocation failed, try a zone two thirds the
392 		 * size of the previous attempt.
393 		 */
394 		n -= ((n + 2) / 3);
395 	} while (n > 0);
396 
397 	if (swap_zone == NULL)
398 		panic("swap_pager_swap_init: swap_zone == NULL");
399 	if (n2 != n)
400 		kprintf("Swap zone entries reduced from %d to %d.\n", n2, n);
401 }
402 
403 /*
404  * SWAP_PAGER_ALLOC() -	allocate a new OBJT_SWAP VM object and instantiate
405  *			its metadata structures.
406  *
407  *	This routine is called from the mmap and fork code to create a new
408  *	OBJT_SWAP object.  We do this by creating an OBJT_DEFAULT object
409  *	and then converting it with swp_pager_meta_convert().
410  *
411  *	We only support unnamed objects.
412  *
413  * No restrictions.
414  */
415 vm_object_t
416 swap_pager_alloc(void *handle, off_t size, vm_prot_t prot, off_t offset)
417 {
418 	vm_object_t object;
419 
420 	KKASSERT(handle == NULL);
421 	lwkt_gettoken(&vm_token);
422 	object = vm_object_allocate(OBJT_DEFAULT,
423 				    OFF_TO_IDX(offset + PAGE_MASK + size));
424 	swp_pager_meta_convert(object);
425 	lwkt_reltoken(&vm_token);
426 
427 	return (object);
428 }
429 
430 /*
431  * SWAP_PAGER_DEALLOC() -	remove swap metadata from object
432  *
433  *	The swap backing for the object is destroyed.  The code is
434  *	designed such that we can reinstantiate it later, but this
435  *	routine is typically called only when the entire object is
436  *	about to be destroyed.
437  *
438  * The object must be locked or unreferenceable.
439  * No other requirements.
440  */
441 static void
442 swap_pager_dealloc(vm_object_t object)
443 {
444 	lwkt_gettoken(&vm_token);
445 	vm_object_pip_wait(object, "swpdea");
446 
447 	/*
448 	 * Free all remaining metadata.  We only bother to free it from
449 	 * the swap meta data.  We do not attempt to free swapblk's still
450 	 * associated with vm_page_t's for this object.  We do not care
451 	 * if paging is still in progress on some objects.
452 	 */
453 	crit_enter();
454 	swp_pager_meta_free_all(object);
455 	crit_exit();
456 	lwkt_reltoken(&vm_token);
457 }
458 
459 /************************************************************************
460  *			SWAP PAGER BITMAP ROUTINES			*
461  ************************************************************************/
462 
463 /*
464  * SWP_PAGER_GETSWAPSPACE() -	allocate raw swap space
465  *
466  *	Allocate swap for the requested number of pages.  The starting
467  *	swap block number (a page index) is returned or SWAPBLK_NONE
468  *	if the allocation failed.
469  *
470  *	Also has the side effect of advising that somebody made a mistake
471  *	when they configured swap and didn't configure enough.
472  *
473  * The caller must hold vm_token.
474  * This routine may not block.
475  *
476  * NOTE: vm_token must be held to avoid races with bitmap frees from
477  *	 vm_page_remove() via swap_pager_page_removed().
478  */
479 static __inline swblk_t
480 swp_pager_getswapspace(vm_object_t object, int npages)
481 {
482 	swblk_t blk;
483 
484 	ASSERT_LWKT_TOKEN_HELD(&vm_token);
485 
486 	if ((blk = blist_alloc(swapblist, npages)) == SWAPBLK_NONE) {
487 		if (swap_pager_full != 2) {
488 			kprintf("swap_pager_getswapspace: failed\n");
489 			swap_pager_full = 2;
490 			swap_pager_almost_full = 1;
491 		}
492 	} else {
493 		swapacctspace(blk, -npages);
494 		if (object->type == OBJT_SWAP)
495 			vm_swap_anon_use += npages;
496 		else
497 			vm_swap_cache_use += npages;
498 		swp_sizecheck();
499 	}
500 	return(blk);
501 }
502 
503 /*
504  * SWP_PAGER_FREESWAPSPACE() -	free raw swap space
505  *
506  *	This routine returns the specified swap blocks back to the bitmap.
507  *
508  *	Note:  This routine may not block (it could in the old swap code),
509  *	and through the use of the new blist routines it does not block.
510  *
511  *	We must be called at splvm() to avoid races with bitmap frees from
512  *	vm_page_remove() aka swap_pager_page_removed().
513  *
514  * The caller must hold vm_token.
515  * This routine may not block.
516  */
517 
518 static __inline void
519 swp_pager_freeswapspace(vm_object_t object, swblk_t blk, int npages)
520 {
521 	blist_free(swapblist, blk, npages);
522 	swapacctspace(blk, npages);
523 	if (object->type == OBJT_SWAP)
524 		vm_swap_anon_use -= npages;
525 	else
526 		vm_swap_cache_use -= npages;
527 	swp_sizecheck();
528 }
529 
530 /*
531  * SWAP_PAGER_FREESPACE() -	frees swap blocks associated with a page
532  *				range within an object.
533  *
534  *	This is a globally accessible routine.
535  *
536  *	This routine removes swapblk assignments from swap metadata.
537  *
538  *	The external callers of this routine typically have already destroyed
539  *	or renamed vm_page_t's associated with this range in the object so
540  *	we should be ok.
541  *
542  * No requirements.
543  */
544 void
545 swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_pindex_t size)
546 {
547 	crit_enter();
548 	lwkt_gettoken(&vm_token);
549 	swp_pager_meta_free(object, start, size);
550 	lwkt_reltoken(&vm_token);
551 	crit_exit();
552 }
553 
554 /*
555  * No requirements.
556  */
557 void
558 swap_pager_freespace_all(vm_object_t object)
559 {
560 	crit_enter();
561 	lwkt_gettoken(&vm_token);
562 	swp_pager_meta_free_all(object);
563 	lwkt_reltoken(&vm_token);
564 	crit_exit();
565 }
566 
567 /*
568  * This function conditionally frees swap cache swap starting at
569  * (*basei) in the object.  (count) swap blocks will be nominally freed.
570  * The actual number of blocks freed can be more or less than the
571  * requested number.
572  *
573  * This function nominally returns the number of blocks freed.  However,
574  * the actual number of blocks freed may be less than the returned value.
575  * If the function is unable to exhaust the object, or if it is able to
576  * free (approximately) the requested number of blocks, it returns
577  * a value n > count.
578  *
579  * If we exhaust the object we will return a value n <= count.
580  *
581  * The caller must hold vm_token.
582  */
583 static int swap_pager_condfree_callback(struct swblock *swap, void *data);
584 
585 int
586 swap_pager_condfree(vm_object_t object, vm_pindex_t *basei, int count)
587 {
588 	struct swfreeinfo info;
589 
590 	ASSERT_LWKT_TOKEN_HELD(&vm_token);
591 
592 	info.object = object;
593 	info.basei = *basei;	/* skip up to this page index */
594 	info.begi = count;	/* max swap pages to destroy */
595 	info.endi = count * 8;	/* max swblocks to scan */
596 
597 	swblock_rb_tree_RB_SCAN(&object->swblock_root, rb_swblock_condcmp,
598 				swap_pager_condfree_callback, &info);
599 	*basei = info.basei;
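	/*
	 * If the scan ran out of its swblock budget (endi went negative)
	 * without freeing the requested number of pages, report n > count
	 * so the caller knows the object was not exhausted.
	 */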
600 	if ((int)info.endi < 0 && (int)info.begi <= count)
601 		info.begi = count + 1;
602 	return(count - (int)info.begi);
603 }
604 
605 /*
606  * The idea is to free a whole meta-block at a time to avoid fragmenting
607  * the swap space or disk I/O.  We only do this if NO VM pages
608  * are present.
609  *
610  * We do not have to deal with clearing PG_SWAPPED in related VM
611  * pages because there are no related VM pages.
612  *
613  * The caller must hold vm_token.
614  */
615 static int
616 swap_pager_condfree_callback(struct swblock *swap, void *data)
617 {
618 	struct swfreeinfo *info = data;
619 	vm_object_t object = info->object;
620 	int i;
621 
622 	for (i = 0; i < SWAP_META_PAGES; ++i) {
623 		if (vm_page_lookup(object, swap->swb_index + i))
624 			break;
625 	}
626 	info->basei = swap->swb_index + SWAP_META_PAGES;
627 	if (i == SWAP_META_PAGES) {
628 		info->begi -= swap->swb_count;
629 		swap_pager_freespace(object, swap->swb_index, SWAP_META_PAGES);
630 	}
631 	--info->endi;
632 	if ((int)info->begi < 0 || (int)info->endi < 0)
633 		return(-1);
634 	return(0);
635 }
636 
637 /*
638  * Called by vm_page_alloc() when a new VM page is inserted
639  * into a VM object.  Checks whether swap has been assigned to
640  * the page and sets PG_SWAPPED as necessary.
641  *
642  * No requirements.
643  */
644 void
645 swap_pager_page_inserted(vm_page_t m)
646 {
647 	if (m->object->swblock_count) {
648 		crit_enter();
649 		lwkt_gettoken(&vm_token);
650 		if (swp_pager_meta_ctl(m->object, m->pindex, 0) != SWAPBLK_NONE)
651 			vm_page_flag_set(m, PG_SWAPPED);
652 		lwkt_reltoken(&vm_token);
653 		crit_exit();
654 	}
655 }
656 
657 /*
658  * SWAP_PAGER_RESERVE() - reserve swap blocks in object
659  *
660  *	Assigns swap blocks to the specified range within the object.  The
661  *	swap blocks are not zeroed.  Any previous swap assignment is destroyed.
662  *
663  *	Returns 0 on success, -1 on failure.
664  *
665  * The caller is responsible for avoiding races in the specified range.
666  * No other requirements.
667  */
668 int
669 swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size)
670 {
671 	int n = 0;
672 	swblk_t blk = SWAPBLK_NONE;
673 	vm_pindex_t beg = start;	/* save start index */
674 
675 	crit_enter();
676 	lwkt_gettoken(&vm_token);
677 	while (size) {
678 		if (n == 0) {
679 			n = BLIST_MAX_ALLOC;
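			/*
			 * Allocate the largest contiguous run we can,
			 * halving the request on failure.  If even a
			 * single block cannot be obtained, undo the
			 * partial reservation and fail.
			 */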
680 			while ((blk = swp_pager_getswapspace(object, n)) ==
681 			       SWAPBLK_NONE)
682 			{
683 				n >>= 1;
684 				if (n == 0) {
685 					swp_pager_meta_free(object, beg,
686 							    start - beg);
687 					lwkt_reltoken(&vm_token);
688 					crit_exit();
689 					return(-1);
690 				}
691 			}
692 		}
693 		swp_pager_meta_build(object, start, blk);
694 		--size;
695 		++start;
696 		++blk;
697 		--n;
698 	}
699 	swp_pager_meta_free(object, start, n);
700 	lwkt_reltoken(&vm_token);
701 	crit_exit();
702 	return(0);
703 }
704 
705 /*
706  * SWAP_PAGER_COPY() -  copy blocks from source pager to destination pager
707  *			and destroy the source.
708  *
709  *	Copy any valid swapblks from the source to the destination.  In
710  *	cases where both the source and destination have a valid swapblk,
711  *	we keep the destination's.
712  *
713  *	This routine is allowed to block.  It may block allocating metadata
714  *	indirectly through swp_pager_meta_build() or if paging is still in
715  *	progress on the source.
716  *
717  *	This routine can be called at any spl
718  *
719  *	XXX vm_page_collapse() kinda expects us not to block because we
720  *	supposedly do not need to allocate memory, but for the moment we
721  *	*may* have to get a little memory from the zone allocator, but
722  *	it is taken from the interrupt memory.  We should be ok.
723  *
724  *	The source object contains no vm_page_t's (which is just as well)
725  *
726  *	The source object is of type OBJT_SWAP.
727  *
728  *	The source and destination objects must be locked or
729  *	inaccessible (XXX are they ?)
730  *
731  * The caller must hold vm_token.
732  */
733 void
734 swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject,
735 		vm_pindex_t base_index, int destroysource)
736 {
737 	vm_pindex_t i;
738 
739 	ASSERT_LWKT_TOKEN_HELD(&vm_token);
740 	crit_enter();
741 
742 	/*
743 	 * transfer source to destination.
744 	 */
745 	for (i = 0; i < dstobject->size; ++i) {
746 		swblk_t dstaddr;
747 
748 		/*
749 		 * Locate (without changing) the swapblk on the destination,
750 		 * unless it is invalid in which case free it silently, or
751 		 * if the destination is a resident page, in which case the
752 		 * source is thrown away.
753 		 */
754 		dstaddr = swp_pager_meta_ctl(dstobject, i, 0);
755 
756 		if (dstaddr == SWAPBLK_NONE) {
757 			/*
758 			 * Destination has no swapblk and is not resident,
759 			 * copy source.
760 			 */
761 			swblk_t srcaddr;
762 
763 			srcaddr = swp_pager_meta_ctl(srcobject,
764 						     base_index + i, SWM_POP);
765 
766 			if (srcaddr != SWAPBLK_NONE)
767 				swp_pager_meta_build(dstobject, i, srcaddr);
768 		} else {
769 			/*
770 			 * Destination has valid swapblk or it is represented
771 			 * by a resident page.  We destroy the sourceblock.
772 			 */
773 			swp_pager_meta_ctl(srcobject, base_index + i, SWM_FREE);
774 		}
775 	}
776 
777 	/*
778 	 * Free left over swap blocks in source.
779 	 *
780 	 * We have to revert the type to OBJT_DEFAULT so we do not accidentally
781 	 * double-remove the object from the swap queues.
782 	 */
783 	if (destroysource) {
784 		/*
785 		 * Reverting the type is not necessary, the caller is going
786 		 * to destroy srcobject directly, but I'm doing it here
787 		 * for consistency since we've removed the object from its
788 		 * queues.
789 		 */
790 		swp_pager_meta_free_all(srcobject);
791 		if (srcobject->type == OBJT_SWAP)
792 			srcobject->type = OBJT_DEFAULT;
793 	}
794 	crit_exit();
795 }
796 
797 /*
798  * SWAP_PAGER_HASPAGE() -	determine if we have good backing store for
799  *				the requested page.
800  *
801  *	We determine whether good backing store exists for the requested
802  *	page and return TRUE if it does, FALSE if it doesn't.
803  *
804  *	If TRUE, we also try to determine how much valid, contiguous backing
805  *	store exists before and after the requested page within a reasonable
806  *	distance.  We do not try to restrict it to the swap device stripe
807  *	(that is handled in getpages/putpages).  It probably isn't worth
808  *	doing here.
809  *
810  * No requirements.
811  */
812 boolean_t
813 swap_pager_haspage(vm_object_t object, vm_pindex_t pindex)
814 {
815 	swblk_t blk0;
816 
817 	/*
818 	 * do we have good backing store at the requested index ?
819 	 */
820 
821 	crit_enter();
822 	lwkt_gettoken(&vm_token);
823 	blk0 = swp_pager_meta_ctl(object, pindex, 0);
824 
825 	if (blk0 == SWAPBLK_NONE) {
826 		lwkt_reltoken(&vm_token);
827 		crit_exit();
828 		return (FALSE);
829 	}
830 	lwkt_reltoken(&vm_token);
831 	crit_exit();
832 	return (TRUE);
833 }
834 
835 /*
836  * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
837  *
838  * This removes any associated swap backing store, whether valid or
839  * not, from the page.  This operates on any VM object, not just OBJT_SWAP
840  * objects.
841  *
842  * This routine is typically called when a page is made dirty, at
843  * which point any associated swap can be freed.  MADV_FREE also
844  * calls us in a special-case situation.
845  *
846  * NOTE!!!  If the page is clean and the swap was valid, the caller
847  * should make the page dirty before calling this routine.  This routine
848  * does NOT change the m->dirty status of the page.  Also: MADV_FREE
849  * depends on it.
850  *
851  * The page must be busied or soft-busied.
852  * The caller must hold vm_token if the caller does not wish to block here.
853  * No other requirements.
854  */
855 void
856 swap_pager_unswapped(vm_page_t m)
857 {
858 	if (m->flags & PG_SWAPPED) {
859 		crit_enter();
860 		lwkt_gettoken(&vm_token);
861 		KKASSERT(m->flags & PG_SWAPPED);
862 		swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE);
863 		vm_page_flag_clear(m, PG_SWAPPED);
864 		lwkt_reltoken(&vm_token);
865 		crit_exit();
866 	}
867 }
868 
869 /*
870  * SWAP_PAGER_STRATEGY() - read, write, free blocks
871  *
872  * This implements a VM OBJECT strategy function using swap backing store.
873  * This can operate on any VM OBJECT type, not necessarily just OBJT_SWAP
874  * types.
875  *
876  * This is intended to be a cacheless interface (i.e. caching occurs at
877  * higher levels), and is also used as a swap-based SSD cache for vnode
878  * and device objects.
879  *
880  * All I/O goes directly to and from the swap device.
881  *
882  * We currently attempt to run I/O synchronously or asynchronously as
883  * the caller requests.  This isn't perfect because we lose error
884  * sequencing when we run multiple ops in parallel to satisfy a request.
885  * But this is swap, so we let it all hang out.
886  *
887  * No requirements.
888  */
889 void
890 swap_pager_strategy(vm_object_t object, struct bio *bio)
891 {
892 	struct buf *bp = bio->bio_buf;
893 	struct bio *nbio;
894 	vm_pindex_t start;
895 	vm_pindex_t biox_blkno = 0;
896 	int count;
897 	char *data;
898 	struct bio *biox;
899 	struct buf *bufx;
900 	struct bio_track *track;
901 
902 	/*
903 	 * tracking for swapdev vnode I/Os
904 	 */
905 	if (bp->b_cmd == BUF_CMD_READ)
906 		track = &swapdev_vp->v_track_read;
907 	else
908 		track = &swapdev_vp->v_track_write;
909 
910 	if (bp->b_bcount & PAGE_MASK) {
911 		bp->b_error = EINVAL;
912 		bp->b_flags |= B_ERROR | B_INVAL;
913 		biodone(bio);
914 		kprintf("swap_pager_strategy: bp %p offset %lld size %d, "
915 			"not page bounded\n",
916 			bp, (long long)bio->bio_offset, (int)bp->b_bcount);
917 		return;
918 	}
919 
920 	/*
921 	 * Clear error indication, initialize page index, count, data pointer.
922 	 */
923 	bp->b_error = 0;
924 	bp->b_flags &= ~B_ERROR;
925 	bp->b_resid = bp->b_bcount;
926 
927 	start = (vm_pindex_t)(bio->bio_offset >> PAGE_SHIFT);
928 	count = howmany(bp->b_bcount, PAGE_SIZE);
929 	data = bp->b_data;
930 
931 	/*
932 	 * Deal with BUF_CMD_FREEBLKS
933 	 */
934 	if (bp->b_cmd == BUF_CMD_FREEBLKS) {
935 		/*
936 		 * FREE PAGE(s) - destroy underlying swap that is no longer
937 		 *		  needed.
938 		 */
939 		crit_enter();
940 		lwkt_gettoken(&vm_token);
941 		swp_pager_meta_free(object, start, count);
942 		lwkt_reltoken(&vm_token);
943 		crit_exit();
944 		bp->b_resid = 0;
945 		biodone(bio);
946 		return;
947 	}
948 
949 	/*
950 	 * We need to be able to create a new cluster of I/O's.  We cannot
951 	 * use the caller fields of the passed bio so push a new one.
952 	 *
953 	 * Because nbio is just a placeholder for the cluster links,
954 	 * we can biodone() the original bio instead of nbio to make
955 	 * things a bit more efficient.
956 	 */
957 	nbio = push_bio(bio);
958 	nbio->bio_offset = bio->bio_offset;
959 	nbio->bio_caller_info1.cluster_head = NULL;
960 	nbio->bio_caller_info2.cluster_tail = NULL;
961 
962 	biox = NULL;
963 	bufx = NULL;
964 
965 	/*
966 	 * Execute read or write
967 	 */
968 	crit_enter();
969 	lwkt_gettoken(&vm_token);
970 	while (count > 0) {
971 		swblk_t blk;
972 
973 		/*
974 		 * Obtain block.  If block not found and writing, allocate a
975 		 * new block and build it into the object.
976 		 */
977 		blk = swp_pager_meta_ctl(object, start, 0);
978 		if ((blk == SWAPBLK_NONE) && bp->b_cmd != BUF_CMD_READ) {
979 			blk = swp_pager_getswapspace(object, 1);
980 			if (blk == SWAPBLK_NONE) {
981 				bp->b_error = ENOMEM;
982 				bp->b_flags |= B_ERROR;
983 				break;
984 			}
985 			swp_pager_meta_build(object, start, blk);
986 		}
987 
988 		/*
989 		 * Do we have to flush our current collection?  Yes if:
990 		 *
991 		 *	- no swap block at this index
992 		 *	- swap block is not contiguous
993 		 *	- we cross a physical disk boundary in the
994 		 *	  stripe.
995 		 */
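		/*
		 * ((biox_blkno ^ blk) & dmmax_mask) is non-zero when the new
		 * block lies in a different dmmax-aligned stripe than the
		 * cluster being built.
		 */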
996 		if (
997 		    biox && (biox_blkno + btoc(bufx->b_bcount) != blk ||
998 		     ((biox_blkno ^ blk) & dmmax_mask)
999 		    )
1000 		) {
1001 			if (bp->b_cmd == BUF_CMD_READ) {
1002 				++mycpu->gd_cnt.v_swapin;
1003 				mycpu->gd_cnt.v_swappgsin += btoc(bufx->b_bcount);
1004 			} else {
1005 				++mycpu->gd_cnt.v_swapout;
1006 				mycpu->gd_cnt.v_swappgsout += btoc(bufx->b_bcount);
1007 				bufx->b_dirtyend = bufx->b_bcount;
1008 			}
1009 
1010 			/*
1011 			 * Finished with this buf.
1012 			 */
1013 			KKASSERT(bufx->b_bcount != 0);
1014 			if (bufx->b_cmd != BUF_CMD_READ)
1015 				bufx->b_dirtyend = bufx->b_bcount;
1016 			biox = NULL;
1017 			bufx = NULL;
1018 		}
1019 
1020 		/*
1021 		 * Add new swapblk to biox, instantiating biox if necessary.
1022 		 * Zero-fill reads are able to take a shortcut.
1023 		 */
1024 		if (blk == SWAPBLK_NONE) {
1025 			/*
1026 			 * We can only get here if we are reading.  Since
1027 			 * we are at splvm() we can safely modify b_resid,
1028 			 * even if chain ops are in progress.
1029 			 */
1030 			bzero(data, PAGE_SIZE);
1031 			bp->b_resid -= PAGE_SIZE;
1032 		} else {
1033 			if (biox == NULL) {
1034 				/* XXX chain count > 4, wait to <= 4 */
1035 
1036 				bufx = getpbuf(NULL);
1037 				biox = &bufx->b_bio1;
1038 				cluster_append(nbio, bufx);
1039 				bufx->b_flags |= (bp->b_flags & B_ORDERED);
1040 				bufx->b_cmd = bp->b_cmd;
1041 				biox->bio_done = swap_chain_iodone;
1042 				biox->bio_offset = (off_t)blk << PAGE_SHIFT;
1043 				biox->bio_caller_info1.cluster_parent = nbio;
1044 				biox_blkno = blk;
1045 				bufx->b_bcount = 0;
1046 				bufx->b_data = data;
1047 			}
1048 			bufx->b_bcount += PAGE_SIZE;
1049 		}
1050 		--count;
1051 		++start;
1052 		data += PAGE_SIZE;
1053 	}
1054 	lwkt_reltoken(&vm_token);
1055 	crit_exit();
1056 
1057 	/*
1058 	 *  Flush out last buffer
1059 	 */
1060 	if (biox) {
1061 		if (bufx->b_cmd == BUF_CMD_READ) {
1062 			++mycpu->gd_cnt.v_swapin;
1063 			mycpu->gd_cnt.v_swappgsin += btoc(bufx->b_bcount);
1064 		} else {
1065 			++mycpu->gd_cnt.v_swapout;
1066 			mycpu->gd_cnt.v_swappgsout += btoc(bufx->b_bcount);
1067 			bufx->b_dirtyend = bufx->b_bcount;
1068 		}
1069 		KKASSERT(bufx->b_bcount);
1070 		if (bufx->b_cmd != BUF_CMD_READ)
1071 			bufx->b_dirtyend = bufx->b_bcount;
1072 		/* biox, bufx = NULL */
1073 	}
1074 
1075 	/*
1076 	 * Now initiate all the I/O.  Be careful looping on our chain as
1077 	 * I/O's may complete while we are still initiating them.
1078 	 *
1079 	 * If the request is a 100% sparse read no bios will be present
1080 	 * and we just biodone() the buffer.
1081 	 */
1082 	nbio->bio_caller_info2.cluster_tail = NULL;
1083 	bufx = nbio->bio_caller_info1.cluster_head;
1084 
1085 	if (bufx) {
1086 		while (bufx) {
1087 			biox = &bufx->b_bio1;
1088 			BUF_KERNPROC(bufx);
1089 			bufx = bufx->b_cluster_next;
1090 			vn_strategy(swapdev_vp, biox);
1091 		}
1092 	} else {
1093 		biodone(bio);
1094 	}
1095 
1096 	/*
1097 	 * Completion of the cluster will also call biodone_chain(nbio).
1098 	 * We never call biodone(nbio) so we don't have to worry about
1099 	 * setting up a bio_done callback.  It's handled in the sub-IO.
1100 	 */
1101 	/**/
1102 }
1103 
1104 /*
1105  * biodone callback
1106  *
1107  * No requirements.
1108  */
1109 static void
1110 swap_chain_iodone(struct bio *biox)
1111 {
1112 	struct buf **nextp;
1113 	struct buf *bufx;	/* chained sub-buffer */
1114 	struct bio *nbio;	/* parent nbio with chain glue */
1115 	struct buf *bp;		/* original bp associated with nbio */
1116 	int chain_empty;
1117 
1118 	bufx = biox->bio_buf;
1119 	nbio = biox->bio_caller_info1.cluster_parent;
1120 	bp = nbio->bio_buf;
1121 
1122 	/*
1123 	 * Update the original buffer
1124 	 */
1125 	KKASSERT(bp != NULL);
1126 	if (bufx->b_flags & B_ERROR) {
1127 		atomic_set_int(&bufx->b_flags, B_ERROR);
1128 		bp->b_error = bufx->b_error;	/* race ok */
1129 	} else if (bufx->b_resid != 0) {
1130 		atomic_set_int(&bufx->b_flags, B_ERROR);
1131 		bp->b_error = EINVAL;		/* race ok */
1132 	} else {
1133 		atomic_subtract_int(&bp->b_resid, bufx->b_bcount);
1134 	}
1135 
1136 	/*
1137 	 * Remove us from the chain.
1138 	 */
1139 	spin_lock(&bp->b_lock.lk_spinlock);
1140 	nextp = &nbio->bio_caller_info1.cluster_head;
1141 	while (*nextp != bufx) {
1142 		KKASSERT(*nextp != NULL);
1143 		nextp = &(*nextp)->b_cluster_next;
1144 	}
1145 	*nextp = bufx->b_cluster_next;
1146 	chain_empty = (nbio->bio_caller_info1.cluster_head == NULL);
1147 	spin_unlock(&bp->b_lock.lk_spinlock);
1148 
1149 	/*
1150 	 * Clean up bufx.  If the chain is now empty we finish out
1151 	 * the parent.  Note that we may be racing other completions
1152 	 * so we must use the chain_empty status from above.
1153 	 */
1154 	if (chain_empty) {
1155 		if (bp->b_resid != 0 && !(bp->b_flags & B_ERROR)) {
1156 			atomic_set_int(&bp->b_flags, B_ERROR);
1157 			bp->b_error = EINVAL;
1158 		}
1159 		biodone_chain(nbio);
1160 	}
1161 	relpbuf(bufx, NULL);
1162 }
1163 
1164 /*
1165  * SWAP_PAGER_GETPAGES() - bring page in from swap
1166  *
1167  * The requested page may have to be brought in from swap.  Calculate the
1168  * swap block and bring in additional pages if possible.  All pages must
1169  * have contiguous swap block assignments and reside in the same object.
1170  *
1171  * The caller has a single vm_object_pip_add() reference prior to
1172  * calling us and we should return with the same.
1173  *
1174  * The caller has BUSY'd the page.  We should return with (*mpp) left busy,
1175  * and any additinal pages unbusied.
1176  *
1177  * If the caller encounters a PG_RAM page it will pass it to us even though
1178  * it may be valid and dirty.  We cannot overwrite the page in this case!
1179  * The case is used to allow us to issue pure read-aheads.
1180  *
1181  * NOTE! XXX This code does not entirely pipeline yet due to the fact that
1182  *       the PG_RAM page is validated at the same time as mreq.  What we
1183  *	 really need to do is issue a separate read-ahead pbuf.
1184  *
1185  * No requirements.
1186  */
1187 static int
1188 swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
1189 {
1190 	struct buf *bp;
1191 	struct bio *bio;
1192 	vm_page_t mreq;
1193 	vm_page_t m;
1194 	vm_offset_t kva;
1195 	swblk_t blk;
1196 	int i;
1197 	int j;
1198 	int raonly;
1199 	vm_page_t marray[XIO_INTERNAL_PAGES];
1200 
1201 	mreq = *mpp;
1202 
1203 	if (mreq->object != object) {
1204 		panic("swap_pager_getpages: object mismatch %p/%p",
1205 		    object,
1206 		    mreq->object
1207 		);
1208 	}
1209 
1210 	/*
1211 	 * We don't want to overwrite a fully valid page as it might be
1212 	 * dirty.  This case can occur when e.g. vm_fault hits a perfectly
1213 	 * valid page with PG_RAM set.
1214 	 *
1215 	 * In this case we see if the next page is a suitable page-in
1216 	 * candidate and if it is we issue read-ahead.  PG_RAM will be
1217 	 * set on the last page of the read-ahead to continue the pipeline.
1218 	 */
1219 	if (mreq->valid == VM_PAGE_BITS_ALL) {
1220 		if (swap_burst_read == 0 || mreq->pindex + 1 >= object->size)
1221 			return(VM_PAGER_OK);
1222 		crit_enter();
1223 		lwkt_gettoken(&vm_token);
1224 		blk = swp_pager_meta_ctl(object, mreq->pindex + 1, 0);
1225 		if (blk == SWAPBLK_NONE) {
1226 			lwkt_reltoken(&vm_token);
1227 			crit_exit();
1228 			return(VM_PAGER_OK);
1229 		}
1230 		m = vm_page_lookup(object, mreq->pindex + 1);
1231 		if (m == NULL) {
1232 			m = vm_page_alloc(object, mreq->pindex + 1,
1233 					  VM_ALLOC_QUICK);
1234 			if (m == NULL) {
1235 				lwkt_reltoken(&vm_token);
1236 				crit_exit();
1237 				return(VM_PAGER_OK);
1238 			}
1239 		} else {
1240 			if ((m->flags & PG_BUSY) || m->busy || m->valid) {
1241 				lwkt_reltoken(&vm_token);
1242 				crit_exit();
1243 				return(VM_PAGER_OK);
1244 			}
1245 			vm_page_unqueue_nowakeup(m);
1246 			vm_page_busy(m);
1247 		}
1248 		mreq = m;
1249 		raonly = 1;
1250 		lwkt_reltoken(&vm_token);
1251 		crit_exit();
1252 	} else {
1253 		raonly = 0;
1254 	}
1255 
1256 	/*
1257 	 * Try to block-read contiguous pages from swap if sequential,
1258 	 * otherwise just read one page.  Contiguous pages from swap must
1259 	 * reside within a single device stripe because the I/O cannot be
1260 	 * broken up across multiple stripes.
1261 	 *
1262 	 * Note that blk and iblk can be SWAPBLK_NONE but the loop is
1263 	 * set up such that the case(s) are handled implicitly.
1264 	 */
1265 	crit_enter();
1266 	lwkt_gettoken(&vm_token);
1267 	blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0);
1268 	marray[0] = mreq;
1269 
1270 	for (i = 1; swap_burst_read &&
1271 		    i < XIO_INTERNAL_PAGES &&
1272 		    mreq->pindex + i < object->size; ++i) {
1273 		swblk_t iblk;
1274 
1275 		iblk = swp_pager_meta_ctl(object, mreq->pindex + i, 0);
1276 		if (iblk != blk + i)
1277 			break;
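		/* do not cross a device stripe (dmmax) boundary */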
1278 		if ((blk ^ iblk) & dmmax_mask)
1279 			break;
1280 		m = vm_page_lookup(object, mreq->pindex + i);
1281 		if (m == NULL) {
1282 			m = vm_page_alloc(object, mreq->pindex + i,
1283 					  VM_ALLOC_QUICK);
1284 			if (m == NULL)
1285 				break;
1286 		} else {
1287 			if ((m->flags & PG_BUSY) || m->busy || m->valid)
1288 				break;
1289 			vm_page_unqueue_nowakeup(m);
1290 			vm_page_busy(m);
1291 		}
1292 		marray[i] = m;
1293 	}
1294 	if (i > 1)
1295 		vm_page_flag_set(marray[i - 1], PG_RAM);
1296 
1297 	lwkt_reltoken(&vm_token);
1298 	crit_exit();
1299 
1300 	/*
1301 	 * If mreq is the requested page and we have nothing to do return
1302 	 * VM_PAGER_FAIL.  If raonly is set mreq is just another read-ahead
1303 	 * page and must be cleaned up.
1304 	 */
1305 	if (blk == SWAPBLK_NONE) {
1306 		KKASSERT(i == 1);
1307 		if (raonly) {
1308 			vnode_pager_freepage(mreq);
1309 			return(VM_PAGER_OK);
1310 		} else {
1311 			return(VM_PAGER_FAIL);
1312 		}
1313 	}
1314 
1315 	/*
1316 	 * map our page(s) into kva for input
1317 	 */
1318 	bp = getpbuf_kva(&nsw_rcount);
1319 	bio = &bp->b_bio1;
1320 	kva = (vm_offset_t) bp->b_kvabase;
1321 	bcopy(marray, bp->b_xio.xio_pages, i * sizeof(vm_page_t));
1322 	pmap_qenter(kva, bp->b_xio.xio_pages, i);
1323 
1324 	bp->b_data = (caddr_t)kva;
1325 	bp->b_bcount = PAGE_SIZE * i;
1326 	bp->b_xio.xio_npages = i;
1327 	bio->bio_done = swp_pager_async_iodone;
1328 	bio->bio_offset = (off_t)blk << PAGE_SHIFT;
1329 	bio->bio_caller_info1.index = SWBIO_READ;
1330 
1331 	/*
1332 	 * Set index.  If raonly set the index beyond the array so all
1333 	 * the pages are treated the same, otherwise the original mreq is
1334 	 * at index 0.
1335 	 */
1336 	if (raonly)
1337 		bio->bio_driver_info = (void *)(intptr_t)i;
1338 	else
1339 		bio->bio_driver_info = (void *)(intptr_t)0;
1340 
1341 	for (j = 0; j < i; ++j)
1342 		vm_page_flag_set(bp->b_xio.xio_pages[j], PG_SWAPINPROG);
1343 
1344 	mycpu->gd_cnt.v_swapin++;
1345 	mycpu->gd_cnt.v_swappgsin += bp->b_xio.xio_npages;
1346 
1347 	/*
1348 	 * We still hold the lock on mreq, and our automatic completion routine
1349 	 * does not remove it.
1350 	 */
1351 	vm_object_pip_add(object, bp->b_xio.xio_npages);
1352 
1353 	/*
1354 	 * perform the I/O.  NOTE!!!  bp cannot be considered valid after
1355 	 * this point because we automatically release it on completion.
1356 	 * Instead, we look at the one page we are interested in which we
1357 	 * still hold a lock on even through the I/O completion.
1358 	 *
1359 	 * The other pages in our m[] array are also released on completion,
1360 	 * so we cannot assume they are valid anymore either.
1361 	 */
1362 	bp->b_cmd = BUF_CMD_READ;
1363 	BUF_KERNPROC(bp);
1364 	vn_strategy(swapdev_vp, bio);
1365 
1366 	/*
1367 	 * Wait for the page we want to complete.  PG_SWAPINPROG is always
1368 	 * cleared on completion.  If an I/O error occurs, SWAPBLK_NONE
1369 	 * is set in the meta-data.
1370 	 *
1371 	 * If this is a read-ahead only we return immediately without
1372 	 * waiting for I/O.
1373 	 */
1374 	if (raonly)
1375 		return(VM_PAGER_OK);
1376 
1377 	/*
1378 	 * Read-ahead includes originally requested page case.
1379 	 */
1380 	crit_enter();
1381 	lwkt_gettoken(&vm_token);
1382 	while ((mreq->flags & PG_SWAPINPROG) != 0) {
1383 		vm_page_flag_set(mreq, PG_WANTED | PG_REFERENCED);
1384 		mycpu->gd_cnt.v_intrans++;
1385 		if (tsleep(mreq, 0, "swread", hz*20)) {
1386 			kprintf(
1387 			    "swap_pager: indefinite wait buffer: "
1388 				" offset: %lld, size: %ld\n",
1389 			    (long long)bio->bio_offset,
1390 			    (long)bp->b_bcount
1391 			);
1392 		}
1393 	}
1394 	lwkt_reltoken(&vm_token);
1395 	crit_exit();
1396 
1397 	/*
1398 	 * mreq is left busied after completion, but all the other pages
1399 	 * are freed.  If we had an unrecoverable read error the page will
1400 	 * not be valid.
1401 	 */
1402 	if (mreq->valid != VM_PAGE_BITS_ALL)
1403 		return(VM_PAGER_ERROR);
1404 	else
1405 		return(VM_PAGER_OK);
1406 
1407 	/*
1408 	 * A final note: in a low swap situation, we cannot deallocate swap
1409 	 * and mark a page dirty here because the caller is likely to mark
1410 	 * the page clean when we return, causing the page to possibly revert
1411 	 * to all-zero's later.
1412 	 */
1413 }
1414 
1415 /*
1416  *	swap_pager_putpages:
1417  *
1418  *	Assign swap (if necessary) and initiate I/O on the specified pages.
1419  *
1420  *	We support both OBJT_DEFAULT and OBJT_SWAP objects.  DEFAULT objects
1421  *	are automatically converted to SWAP objects.
1422  *
1423  *	In a low memory situation we may block in vn_strategy(), but the new
1424  *	vm_page reservation system coupled with properly written VFS devices
1425  *	should ensure that no low-memory deadlock occurs.  This is an area
1426  *	which needs work.
1427  *
1428  *	The parent has N vm_object_pip_add() references prior to
1429  *	calling us and will remove references for rtvals[] that are
1430  *	not set to VM_PAGER_PEND.  We need to remove the rest on I/O
1431  *	completion.
1432  *
1433  *	The parent has soft-busy'd the pages it passes us and will unbusy
1434 	 *	those whose rtvals[] entry is not set to VM_PAGER_PEND on return.
1435  *	We need to unbusy the rest on I/O completion.
1436  *
1437  * No requirements.
1438  */
1439 void
1440 swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
1441 		    boolean_t sync, int *rtvals)
1442 {
1443 	int i;
1444 	int n = 0;
1445 
1446 	if (count && m[0]->object != object) {
1447 		panic("swap_pager_putpages: object mismatch %p/%p",
1448 		    object,
1449 		    m[0]->object
1450 		);
1451 	}
1452 
1453 	/*
1454 	 * Step 1
1455 	 *
1456 	 * Turn object into OBJT_SWAP
1457 	 * check for bogus sysops
1458 	 * force sync if not pageout process
1459 	 */
1460 	if (object->type == OBJT_DEFAULT) {
1461 		lwkt_gettoken(&vm_token);
1462 		if (object->type == OBJT_DEFAULT)
1463 			swp_pager_meta_convert(object);
1464 		lwkt_reltoken(&vm_token);
1465 	}
1466 
1467 	if (curthread != pagethread)
1468 		sync = TRUE;
1469 
1470 	/*
1471 	 * Step 2
1472 	 *
1473 	 * Update nsw parameters from swap_async_max sysctl values.
1474 	 * Do not let the sysop crash the machine with bogus numbers.
1475 	 */
1476 
1477 	if (swap_async_max != nsw_wcount_async_max) {
1478 		int n;
1479 
1480 		/*
1481 		 * limit range
1482 		 */
1483 		if ((n = swap_async_max) > nswbuf / 2)
1484 			n = nswbuf / 2;
1485 		if (n < 1)
1486 			n = 1;
1487 		swap_async_max = n;
1488 
1489 		/*
1490 		 * Adjust difference ( if possible ).  If the current async
1491 		 * count is too low, we may not be able to make the adjustment
1492 		 * at this time.
1493 		 */
1494 		crit_enter();
1495 		lwkt_gettoken(&vm_token);
1496 		n -= nsw_wcount_async_max;
1497 		if (nsw_wcount_async + n >= 0) {
1498 			nsw_wcount_async += n;
1499 			nsw_wcount_async_max += n;
1500 			wakeup(&nsw_wcount_async);
1501 		}
1502 		lwkt_reltoken(&vm_token);
1503 		crit_exit();
1504 	}
1505 
1506 	/*
1507 	 * Step 3
1508 	 *
1509 	 * Assign swap blocks and issue I/O.  We reallocate swap on the fly.
1510 	 * The page is left dirty until the pageout operation completes
1511 	 * successfully.
1512 	 */
1513 
1514 	for (i = 0; i < count; i += n) {
1515 		struct buf *bp;
1516 		struct bio *bio;
1517 		swblk_t blk;
1518 		int j;
1519 
1520 		/*
1521 		 * Maximum I/O size is limited by a number of factors.
1522 		 */
1523 
1524 		n = min(BLIST_MAX_ALLOC, count - i);
1525 		n = min(n, nsw_cluster_max);
1526 
1527 		crit_enter();
1528 		lwkt_gettoken(&vm_token);
1529 
1530 		/*
1531 		 * Get biggest block of swap we can.  If we fail, fall
1532 		 * back and try to allocate a smaller block.  Don't go
1533 		 * overboard trying to allocate space if it would overly
1534 		 * fragment swap.
1535 		 */
1536 		while (
1537 		    (blk = swp_pager_getswapspace(object, n)) == SWAPBLK_NONE &&
1538 		    n > 4
1539 		) {
1540 			n >>= 1;
1541 		}
1542 		if (blk == SWAPBLK_NONE) {
1543 			for (j = 0; j < n; ++j)
1544 				rtvals[i+j] = VM_PAGER_FAIL;
1545 			lwkt_reltoken(&vm_token);
1546 			crit_exit();
1547 			continue;
1548 		}
1549 
1550 		/*
1551 		 * The I/O we are constructing cannot cross a physical
1552 		 * disk boundary in the swap stripe.  Note: we are still
1553 		 * at splvm().
1554 		 */
1555 		if ((blk ^ (blk + n)) & dmmax_mask) {
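			/*
			 * j = pages remaining before the next dmmax-aligned
			 * stripe boundary; free the tail of the allocation
			 * and clip the I/O so it stays within the stripe.
			 */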
1556 			j = ((blk + dmmax) & dmmax_mask) - blk;
1557 			swp_pager_freeswapspace(object, blk + j, n - j);
1558 			n = j;
1559 		}
1560 
1561 		/*
1562 		 * All I/O parameters have been satisfied, build the I/O
1563 		 * request and assign the swap space.
1564 		 */
1565 		if (sync == TRUE)
1566 			bp = getpbuf_kva(&nsw_wcount_sync);
1567 		else
1568 			bp = getpbuf_kva(&nsw_wcount_async);
1569 		bio = &bp->b_bio1;
1570 
1571 		pmap_qenter((vm_offset_t)bp->b_data, &m[i], n);
1572 
1573 		bp->b_bcount = PAGE_SIZE * n;
1574 		bio->bio_offset = (off_t)blk << PAGE_SHIFT;
1575 
1576 		for (j = 0; j < n; ++j) {
1577 			vm_page_t mreq = m[i+j];
1578 
1579 			swp_pager_meta_build(mreq->object, mreq->pindex,
1580 					     blk + j);
1581 			if (object->type == OBJT_SWAP)
1582 				vm_page_dirty(mreq);
1583 			rtvals[i+j] = VM_PAGER_OK;
1584 
1585 			vm_page_flag_set(mreq, PG_SWAPINPROG);
1586 			bp->b_xio.xio_pages[j] = mreq;
1587 		}
1588 		bp->b_xio.xio_npages = n;
1589 
1590 		mycpu->gd_cnt.v_swapout++;
1591 		mycpu->gd_cnt.v_swappgsout += bp->b_xio.xio_npages;
1592 
1593 		lwkt_reltoken(&vm_token);
1594 		crit_exit();
1595 
1596 		bp->b_dirtyoff = 0;		/* req'd for NFS */
1597 		bp->b_dirtyend = bp->b_bcount;	/* req'd for NFS */
1598 		bp->b_cmd = BUF_CMD_WRITE;
1599 		bio->bio_caller_info1.index = SWBIO_WRITE;
1600 
1601 		/*
1602 		 * asynchronous
1603 		 */
1604 		if (sync == FALSE) {
1605 			bio->bio_done = swp_pager_async_iodone;
1606 			BUF_KERNPROC(bp);
1607 			vn_strategy(swapdev_vp, bio);
1608 
1609 			for (j = 0; j < n; ++j)
1610 				rtvals[i+j] = VM_PAGER_PEND;
1611 			continue;
1612 		}
1613 
1614 		/*
1615 		 * Issue synchronously.
1616 		 *
1617 		 * Wait for the sync I/O to complete, then update rtvals.
1618 		 * We just set the rtvals[] to VM_PAGER_PEND so we can call
1619 		 * our async completion routine at the end, thus avoiding a
1620 		 * double-free.
1621 		 */
1622 		bio->bio_caller_info1.index |= SWBIO_SYNC;
1623 		bio->bio_done = biodone_sync;
1624 		bio->bio_flags |= BIO_SYNC;
1625 		vn_strategy(swapdev_vp, bio);
1626 		biowait(bio, "swwrt");
1627 
1628 		for (j = 0; j < n; ++j)
1629 			rtvals[i+j] = VM_PAGER_PEND;
1630 
1631 		/*
1632 		 * Now that we are through with the bp, we can call the
1633 		 * normal async completion, which frees everything up.
1634 		 */
1635 		swp_pager_async_iodone(bio);
1636 	}
1637 }
1638 
1639 /*
1640  * No requirements.
1641  */
1642 void
1643 swap_pager_newswap(void)
1644 {
1645 	swp_sizecheck();
1646 }
1647 
1648 /*
1649  *	swp_pager_async_iodone:
1650  *
1651  *	Completion routine for asynchronous reads and writes from/to swap.
1652  *	Also called manually by synchronous code to finish up a bp.
1653  *
1654  *	For READ operations, the pages are PG_BUSY'd.  For WRITE operations,
1655  *	the pages are vm_page_t->busy'd.  For READ operations, we PG_BUSY-
1656  *	unbusy all pages except the 'main' request page.  For WRITE
1657  *	operations, we vm_page_t->busy-unbusy all pages ( we can do this
1658  *	because we marked them all VM_PAGER_PEND on return from putpages ).
1659  *
1660  *	This routine may not block.
1661  *
1662  * No requirements.
1663  */
1664 static void
1665 swp_pager_async_iodone(struct bio *bio)
1666 {
1667 	struct buf *bp = bio->bio_buf;
1668 	vm_object_t object = NULL;
1669 	int i;
1670 	int *nswptr;
1671 
1672 	/*
1673 	 * report error
1674 	 */
1675 	if (bp->b_flags & B_ERROR) {
1676 		kprintf(
1677 		    "swap_pager: I/O error - %s failed; offset %lld,"
1678 			" size %ld, error %d\n",
1679 		    ((bio->bio_caller_info1.index & SWBIO_READ) ?
1680 			"pagein" : "pageout"),
1681 		    (long long)bio->bio_offset,
1682 		    (long)bp->b_bcount,
1683 		    bp->b_error
1684 		);
1685 	}
1686 
1687 	/*
1688 	 * set object, raise to splvm().
1689 	 */
1690 	if (bp->b_xio.xio_npages)
1691 		object = bp->b_xio.xio_pages[0]->object;
1692 	crit_enter();
1693 	lwkt_gettoken(&vm_token);
1694 
1695 	/*
1696 	 * remove the mapping for kernel virtual
1697 	 */
1698 	pmap_qremove((vm_offset_t)bp->b_data, bp->b_xio.xio_npages);
1699 
1700 	/*
1701 	 * cleanup pages.  If an error occurs writing to swap, we are in
1702 	 * very serious trouble.  If it happens to be a disk error, though,
1703 	 * we may be able to recover by reassigning the swap later on.  So
1704 	 * in this case we remove the m->swapblk assignment for the page
1705 	 * but do not free it in the rlist.  The erroneous block(s) are thus
1706 	 * never reallocated as swap.  Redirty the page and continue.
1707 	 */
1708 	for (i = 0; i < bp->b_xio.xio_npages; ++i) {
1709 		vm_page_t m = bp->b_xio.xio_pages[i];
1710 
1711 		if (bp->b_flags & B_ERROR) {
1712 			/*
1713 			 * If an error occurs I'd love to throw the swapblk
1714 			 * away without freeing it back to swapspace, so it
1715 			 * can never be used again.  But I can't from an
1716 			 * interrupt.
1717 			 */
1718 
1719 			if (bio->bio_caller_info1.index & SWBIO_READ) {
1720 				/*
1721 				 * When reading, reqpage needs to stay
1722 				 * locked for the parent, but all other
1723 				 * pages can be freed.  We still want to
1724 				 * wakeup the parent waiting on the page,
1725 				 * though.  ( also: the requested index can
1726 				 * be out of range and not match anything ).
1727 				 *
1728 				 * We have to wake specifically requested pages
1729 				 * up too because we cleared PG_SWAPINPROG and
1730 				 * someone may be waiting for that.
1731 				 *
1732 				 * NOTE: for reads, m->dirty will probably
1733 				 * be overridden by the original caller of
1734 				 * getpages so don't play cute tricks here.
1735 				 *
1736 				 * NOTE: We can't actually free the page from
1737 				 * here, because this is an interrupt.  It
1738 				 * is not legal to mess with object->memq
1739 				 * from an interrupt.  Deactivate the page
1740 				 * instead.
1741 				 */
1742 
1743 				m->valid = 0;
1744 				vm_page_flag_clear(m, PG_ZERO);
1745 				vm_page_flag_clear(m, PG_SWAPINPROG);
1746 
1747 				/*
1748 				 * bio_driver_info holds the requested page
1749 				 * index.
1750 				 */
1751 				if (i != (int)(intptr_t)bio->bio_driver_info) {
1752 					vm_page_deactivate(m);
1753 					vm_page_wakeup(m);
1754 				} else {
1755 					vm_page_flash(m);
1756 				}
1757 				/*
1758 				 * If i is the requested page index, do not
1759 				 * wake the page up.  The caller needs to.
1760 				 */
1761 			} else {
1762 				/*
1763 				 * If a write error occurs remove the swap
1764 				 * assignment (note that PG_SWAPPED may or
1765 				 * may not be set depending on prior activity).
1766 				 *
1767 				 * Re-dirty OBJT_SWAP pages as there is no
1768 				 * other backing store, we can't throw the
1769 				 * page away.
1770 				 *
1771 				 * Non-OBJT_SWAP pages (aka swapcache) must
1772 				 * not be dirtied since they may not have
1773 				 * been dirty in the first place, and they
1774 				 * do have backing store (the vnode).
1775 				 */
1776 				swp_pager_meta_ctl(m->object, m->pindex,
1777 						   SWM_FREE);
1778 				vm_page_flag_clear(m, PG_SWAPPED);
1779 				if (m->object->type == OBJT_SWAP) {
1780 					vm_page_dirty(m);
1781 					vm_page_activate(m);
1782 				}
1783 				vm_page_flag_clear(m, PG_SWAPINPROG);
1784 				vm_page_io_finish(m);
1785 			}
1786 		} else if (bio->bio_caller_info1.index & SWBIO_READ) {
1787 			/*
			 * NOTE: for reads, m->dirty will probably be
			 * overridden by the original caller of getpages, so
			 * we cannot set the dirty bits here in order to free
			 * the underlying swap in a low-swap situation.  I
			 * don't think we'd want to do that anyway, but it
			 * was an optimization that existed in the old
			 * swapper for a time before it got ripped out due
			 * to precisely this problem.
1795 			 *
1796 			 * clear PG_ZERO in page.
1797 			 *
1798 			 * If not the requested page then deactivate it.
1799 			 *
1800 			 * Note that the requested page, reqpage, is left
1801 			 * busied, but we still have to wake it up.  The
1802 			 * other pages are released (unbusied) by
1803 			 * vm_page_wakeup().  We do not set reqpage's
1804 			 * valid bits here, it is up to the caller.
1805 			 */
1806 
1807 			/*
1808 			 * NOTE: can't call pmap_clear_modify(m) from an
1809 			 * interrupt thread, the pmap code may have to map
1810 			 * non-kernel pmaps and currently asserts the case.
1811 			 */
1812 			/*pmap_clear_modify(m);*/
1813 			m->valid = VM_PAGE_BITS_ALL;
1814 			vm_page_undirty(m);
1815 			vm_page_flag_clear(m, PG_ZERO | PG_SWAPINPROG);
1816 			vm_page_flag_set(m, PG_SWAPPED);
1817 
			/*
			 * We have to wake specifically requested pages
			 * up too because we cleared PG_SWAPINPROG and
			 * getpages could be waiting on it.  However, be
			 * sure not to unbusy the page getpages specifically
			 * requested - getpages expects it to be left busy.
			 *
			 * bio_driver_info holds the requested page index.
			 */
1828 			if (i != (int)(intptr_t)bio->bio_driver_info) {
1829 				vm_page_deactivate(m);
1830 				vm_page_wakeup(m);
1831 			} else {
1832 				vm_page_flash(m);
1833 			}
1834 		} else {
			/*
			 * Mark the page clean but do not mess with the
			 * pmap-layer's modified state.  That state should
			 * also be clear since the caller protected the
			 * page with VM_PROT_READ, but allow for the case
			 * where it is not.
			 *
			 * We are in an interrupt, avoid pmap operations.
			 *
			 * If we have a severe page deficit, deactivate the
			 * page.  Do not try to cache it (which would also
			 * involve a pmap op), because the page might still
			 * be read-heavy.
			 *
			 * When using swap to cache clean vnode pages we do
			 * not mess with the page dirty bits.
			 */
1851 			if (m->object->type == OBJT_SWAP)
1852 				vm_page_undirty(m);
1853 			vm_page_flag_clear(m, PG_SWAPINPROG);
1854 			vm_page_flag_set(m, PG_SWAPPED);
1855 			vm_page_io_finish(m);
1856 			if (vm_page_count_severe())
1857 				vm_page_deactivate(m);
1858 #if 0
1859 			if (!vm_page_count_severe() || !vm_page_try_to_cache(m))
1860 				vm_page_protect(m, VM_PROT_READ);
1861 #endif
1862 		}
1863 	}
1864 
1865 	/*
1866 	 * adjust pip.  NOTE: the original parent may still have its own
1867 	 * pip refs on the object.
1868 	 */
1869 
1870 	if (object)
1871 		vm_object_pip_wakeupn(object, bp->b_xio.xio_npages);
1872 
1873 	/*
1874 	 * Release the physical I/O buffer.
1875 	 *
1876 	 * NOTE: Due to synchronous operations in the write case b_cmd may
1877 	 *	 already be set to BUF_CMD_DONE and BIO_SYNC may have already
1878 	 *	 been cleared.
1879 	 */
1880 	if (bio->bio_caller_info1.index & SWBIO_READ)
1881 		nswptr = &nsw_rcount;
1882 	else if (bio->bio_caller_info1.index & SWBIO_SYNC)
1883 		nswptr = &nsw_wcount_sync;
1884 	else
1885 		nswptr = &nsw_wcount_async;
1886 	bp->b_cmd = BUF_CMD_DONE;
1887 	relpbuf(bp, nswptr);
1888 	lwkt_reltoken(&vm_token);
1889 	crit_exit();
1890 }
1891 
1892 /************************************************************************
1893  *				SWAP META DATA 				*
1894  ************************************************************************
1895  *
 *	These routines manipulate the swap metadata stored in the
 *	OBJT_SWAP object.  All swp_*() routines must be called with
 *	vm_token held (historically at splvm()) because swap can be
 *	freed up by the low level vm_page code, which might be called
 *	from interrupts beyond what splbio() covers.
 *
 *	Swap metadata is kept in per-object struct swblock nodes,
 *	organized in a red-black tree rooted at object->swblock_root.
 *	The object also maintains a swblock_count tracking counter.
1904  */
1905 
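/*
 * Illustrative sketch only (kept under #if 0, not compiled): how a page
 * index decomposes into the swblock lookup key and the slot within
 * swb_pages[], mirroring the masking done inline by swp_pager_lookup()
 * and swp_pager_meta_build() below.  The helper name is hypothetical.
 */
#if 0
static __inline void
swp_pager_split_index(vm_pindex_t index, vm_pindex_t *basep, int *slotp)
{
	*basep = index & ~SWAP_META_MASK;	/* swblock swb_index key */
	*slotp = (int)(index & SWAP_META_MASK);	/* slot in swb_pages[] */
}
#endif
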
1906 /*
1907  * Lookup the swblock containing the specified swap block index.
1908  *
1909  * The caller must hold vm_token.
1910  */
1911 static __inline
1912 struct swblock *
1913 swp_pager_lookup(vm_object_t object, vm_pindex_t index)
1914 {
1915 	index &= ~SWAP_META_MASK;
1916 	return (RB_LOOKUP(swblock_rb_tree, &object->swblock_root, index));
1917 }
1918 
1919 /*
1920  * Remove a swblock from the RB tree.
1921  *
1922  * The caller must hold vm_token.
1923  */
1924 static __inline
1925 void
1926 swp_pager_remove(vm_object_t object, struct swblock *swap)
1927 {
1928 	RB_REMOVE(swblock_rb_tree, &object->swblock_root, swap);
1929 }
1930 
1931 /*
1932  * Convert default object to swap object if necessary
1933  *
1934  * The caller must hold vm_token.
1935  */
1936 static void
1937 swp_pager_meta_convert(vm_object_t object)
1938 {
1939 	if (object->type == OBJT_DEFAULT) {
1940 		object->type = OBJT_SWAP;
1941 		KKASSERT(object->swblock_count == 0);
1942 	}
1943 }
1944 
1945 /*
1946  * SWP_PAGER_META_BUILD() -	add swap block to swap meta data for object
1947  *
1948  *	We first convert the object to a swap object if it is a default
1949  *	object.  Vnode objects do not need to be converted.
1950  *
 *	The specified swapblk is added to the object's swap metadata at
 *	the given page index.  The swapblk must be valid (the routine
 *	asserts it is not SWAPBLK_NONE).  Any swapblk previously assigned
 *	to that index is freed.
1954  *
1955  * The caller must hold vm_token.
1956  */
1957 static void
1958 swp_pager_meta_build(vm_object_t object, vm_pindex_t index, swblk_t swapblk)
1959 {
1960 	struct swblock *swap;
1961 	struct swblock *oswap;
1962 
1963 	KKASSERT(swapblk != SWAPBLK_NONE);
1964 
1965 	/*
1966 	 * Convert object if necessary
1967 	 */
1968 	if (object->type == OBJT_DEFAULT)
1969 		swp_pager_meta_convert(object);
1970 
	/*
	 * Locate the swblock, creating it if it does not yet exist.  If
	 * the zone allocation fails we wait for memory and retry, since
	 * the tree may have changed while we slept.
	 */
1976 retry:
1977 	swap = swp_pager_lookup(object, index);
1978 
1979 	if (swap == NULL) {
1980 		int i;
1981 
1982 		swap = zalloc(swap_zone);
1983 		if (swap == NULL) {
1984 			vm_wait(0);
1985 			goto retry;
1986 		}
1987 		swap->swb_index = index & ~SWAP_META_MASK;
1988 		swap->swb_count = 0;
1989 
1990 		++object->swblock_count;
1991 
1992 		for (i = 0; i < SWAP_META_PAGES; ++i)
1993 			swap->swb_pages[i] = SWAPBLK_NONE;
1994 		oswap = RB_INSERT(swblock_rb_tree, &object->swblock_root, swap);
1995 		KKASSERT(oswap == NULL);
1996 	}
1997 
1998 	/*
1999 	 * Delete prior contents of metadata
2000 	 */
2001 
2002 	index &= SWAP_META_MASK;
2003 
2004 	if (swap->swb_pages[index] != SWAPBLK_NONE) {
2005 		swp_pager_freeswapspace(object, swap->swb_pages[index], 1);
2006 		--swap->swb_count;
2007 	}
2008 
2009 	/*
2010 	 * Enter block into metadata
2011 	 */
2012 	swap->swb_pages[index] = swapblk;
2013 	if (swapblk != SWAPBLK_NONE)
2014 		++swap->swb_count;
2015 }
2016 
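/*
 * Hypothetical usage sketch (under #if 0, not compiled): recording a
 * freshly allocated swap block for a page index.  The 'blk' argument is
 * assumed to have come from the swap allocator.  swp_pager_meta_build()
 * itself releases any block previously assigned to the same index, so
 * the caller does not free it first.
 */
#if 0
static void
swp_pager_record_example(vm_object_t object, vm_pindex_t pindex, swblk_t blk)
{
	/* caller holds vm_token; blk must be a valid allocated swap block */
	KKASSERT(blk != SWAPBLK_NONE);
	swp_pager_meta_build(object, pindex, blk);
}
#endif
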
2017 /*
2018  * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
2019  *
2020  *	The requested range of blocks is freed, with any associated swap
2021  *	returned to the swap bitmap.
2022  *
2023  *	This routine will free swap metadata structures as they are cleaned
2024  *	out.  This routine does *NOT* operate on swap metadata associated
2025  *	with resident pages.
2026  *
2027  * The caller must hold vm_token.
2028  */
2029 static int swp_pager_meta_free_callback(struct swblock *swb, void *data);
2030 
2031 static void
2032 swp_pager_meta_free(vm_object_t object, vm_pindex_t index, vm_pindex_t count)
2033 {
2034 	struct swfreeinfo info;
2035 
2036 	/*
2037 	 * Nothing to do
2038 	 */
2039 	if (object->swblock_count == 0) {
2040 		KKASSERT(RB_EMPTY(&object->swblock_root));
2041 		return;
2042 	}
2043 	if (count == 0)
2044 		return;
2045 
2046 	/*
2047 	 * Setup for RB tree scan.  Note that the pindex range can be huge
2048 	 * due to the 64 bit page index space so we cannot safely iterate.
2049 	 */
2050 	info.object = object;
2051 	info.basei = index & ~SWAP_META_MASK;
2052 	info.begi = index;
2053 	info.endi = index + count - 1;
2054 	swblock_rb_tree_RB_SCAN(&object->swblock_root, rb_swblock_scancmp,
2055 				swp_pager_meta_free_callback, &info);
2056 }
2057 
2058 /*
2059  * The caller must hold vm_token.
2060  */
2061 static
2062 int
2063 swp_pager_meta_free_callback(struct swblock *swap, void *data)
2064 {
2065 	struct swfreeinfo *info = data;
2066 	vm_object_t object = info->object;
2067 	int index;
2068 	int eindex;
2069 
2070 	/*
2071 	 * Figure out the range within the swblock.  The wider scan may
2072 	 * return edge-case swap blocks when the start and/or end points
2073 	 * are in the middle of a block.
2074 	 */
2075 	if (swap->swb_index < info->begi)
2076 		index = (int)info->begi & SWAP_META_MASK;
2077 	else
2078 		index = 0;
2079 
2080 	if (swap->swb_index + SWAP_META_PAGES > info->endi)
2081 		eindex = (int)info->endi & SWAP_META_MASK;
2082 	else
2083 		eindex = SWAP_META_MASK;
2084 
	/*
	 * Scan and free the blocks.  The loop terminates early if the
	 * swblock's block count drops to zero, in which case the swblock
	 * itself is removed from the tree and freed.
	 */
2089 	while (index <= eindex) {
2090 		swblk_t v = swap->swb_pages[index];
2091 
2092 		if (v != SWAPBLK_NONE) {
2093 			swp_pager_freeswapspace(object, v, 1);
2094 			swap->swb_pages[index] = SWAPBLK_NONE;
2095 			if (--swap->swb_count == 0) {
2096 				swp_pager_remove(object, swap);
2097 				zfree(swap_zone, swap);
2098 				--object->swblock_count;
2099 				break;
2100 			}
2101 		}
2102 		++index;
2103 	}
2104 	/* swap may be invalid here due to zfree above */
2105 	return(0);
2106 }
2107 
2108 /*
2109  * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object
2110  *
2111  *	This routine locates and destroys all swap metadata associated with
2112  *	an object.
2113  *
2114  * The caller must hold vm_token.
2115  */
2116 static void
2117 swp_pager_meta_free_all(vm_object_t object)
2118 {
2119 	struct swblock *swap;
2120 	int i;
2121 
2122 	while ((swap = RB_ROOT(&object->swblock_root)) != NULL) {
2123 		swp_pager_remove(object, swap);
2124 		for (i = 0; i < SWAP_META_PAGES; ++i) {
2125 			swblk_t v = swap->swb_pages[i];
2126 			if (v != SWAPBLK_NONE) {
2127 				--swap->swb_count;
2128 				swp_pager_freeswapspace(object, v, 1);
2129 			}
2130 		}
2131 		if (swap->swb_count != 0)
2132 			panic("swap_pager_meta_free_all: swb_count != 0");
2133 		zfree(swap_zone, swap);
2134 		--object->swblock_count;
2135 	}
2136 	KKASSERT(object->swblock_count == 0);
2137 }
2138 
2139 /*
 * SWP_PAGER_META_CTL() -  misc control of swap meta data.
 *
 *	This routine is capable of looking up, popping, or freeing
 *	swapblk assignments in the swap meta data.  It typically returns
 *	the swapblk being looked up or popped, or SWAPBLK_NONE if the
 *	block was freed or was never assigned.
 *
 *	It is not possible to store invalid swapblks in the swap meta data
 *	(other than a literal 'SWAPBLK_NONE'), so we don't bother checking.
 *
 *	When acting on a busy resident page and paging is in progress, we
 *	have to wait until paging is complete but otherwise can act on the
 *	busy page.
 *
 *	SWM_FREE	remove and free the swap block from the metadata
 *	SWM_POP		remove the swap block from the metadata but do not
 *			free it (pop it out)
2158  *
2159  * The caller must hold vm_token.
2160  */
2161 static swblk_t
2162 swp_pager_meta_ctl(vm_object_t object, vm_pindex_t index, int flags)
2163 {
2164 	struct swblock *swap;
2165 	swblk_t r1;
2166 
2167 	if (object->swblock_count == 0)
2168 		return(SWAPBLK_NONE);
2169 
2170 	r1 = SWAPBLK_NONE;
2171 	swap = swp_pager_lookup(object, index);
2172 
2173 	if (swap != NULL) {
2174 		index &= SWAP_META_MASK;
2175 		r1 = swap->swb_pages[index];
2176 
2177 		if (r1 != SWAPBLK_NONE) {
2178 			if (flags & SWM_FREE) {
2179 				swp_pager_freeswapspace(object, r1, 1);
2180 				r1 = SWAPBLK_NONE;
2181 			}
2182 			if (flags & (SWM_FREE|SWM_POP)) {
2183 				swap->swb_pages[index] = SWAPBLK_NONE;
2184 				if (--swap->swb_count == 0) {
2185 					swp_pager_remove(object, swap);
2186 					zfree(swap_zone, swap);
2187 					--object->swblock_count;
2188 				}
2189 			}
2190 		}
2191 	}
2192 	return(r1);
2193 }
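
/*
 * Illustrative sketch only (under #if 0, not compiled, hypothetical helper
 * name): the three ways callers typically use swp_pager_meta_ctl().  The
 * caller must hold vm_token across each call.
 */
#if 0
static void
swp_pager_meta_ctl_example(vm_object_t object, vm_pindex_t pindex)
{
	swblk_t blk;

	/* Plain lookup: metadata is left intact */
	blk = swp_pager_meta_ctl(object, pindex, 0);

	/* SWM_POP: remove the assignment but keep the swap block allocated */
	blk = swp_pager_meta_ctl(object, pindex, SWM_POP);

	/* SWM_FREE: remove the assignment and return the block to swap */
	(void)swp_pager_meta_ctl(object, pindex, SWM_FREE);

	(void)blk;
}
#endif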
2194