xref: /dragonfly/sys/vm/swap_pager.c (revision d0be0ca8)
1 /*
2  * (MPSAFE)
3  *
4  * Copyright (c) 1998-2010 The DragonFly Project.  All rights reserved.
5  *
6  * This code is derived from software contributed to The DragonFly Project
7  * by Matthew Dillon <dillon@backplane.com>
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  * Copyright (c) 1994 John S. Dyson
37  * Copyright (c) 1990 University of Utah.
38  * Copyright (c) 1991, 1993
39  *	The Regents of the University of California.  All rights reserved.
40  *
41  * This code is derived from software contributed to Berkeley by
42  * the Systems Programming Group of the University of Utah Computer
43  * Science Department.
44  *
45  * Redistribution and use in source and binary forms, with or without
46  * modification, are permitted provided that the following conditions
47  * are met:
48  * 1. Redistributions of source code must retain the above copyright
49  *    notice, this list of conditions and the following disclaimer.
50  * 2. Redistributions in binary form must reproduce the above copyright
51  *    notice, this list of conditions and the following disclaimer in the
52  *    documentation and/or other materials provided with the distribution.
53  * 3. Neither the name of the University nor the names of its contributors
54  *    may be used to endorse or promote products derived from this software
55  *    without specific prior written permission.
56  *
57  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
58  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
59  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
60  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
61  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
62  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
63  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
64  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
65  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
66  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
67  * SUCH DAMAGE.
68  *
69  *				New Swap System
70  *				Matthew Dillon
71  *
72  * Radix Bitmap 'blists'.
73  *
74  *	- The new swapper uses the new radix bitmap code.  This should scale
75  *	  to arbitrarily small or arbitrarily large swap spaces and an almost
76  *	  arbitrary degree of fragmentation.
77  *
78  * Features:
79  *
80  *	- on the fly reallocation of swap during putpages.  The new system
81  *	  does not try to keep previously allocated swap blocks for dirty
82  *	  pages.
83  *
84  *	- on the fly deallocation of swap
85  *
86  *	- No more garbage collection required.  Unnecessarily allocated swap
87  *	  blocks only exist for dirty vm_page_t's now and these are already
88  *	  cycled (in a high-load system) by the pager.  We also do on-the-fly
89  *	  removal of invalidated swap blocks when a page is destroyed
90  *	  or renamed.
91  *
92  * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
93  * @(#)swap_pager.c	8.9 (Berkeley) 3/21/94
94  * $FreeBSD: src/sys/vm/swap_pager.c,v 1.130.2.12 2002/08/31 21:15:55 dillon Exp $
95  */
96 
97 #include "opt_swap.h"
98 #include <sys/param.h>
99 #include <sys/systm.h>
100 #include <sys/conf.h>
101 #include <sys/kernel.h>
102 #include <sys/proc.h>
103 #include <sys/buf.h>
104 #include <sys/vnode.h>
105 #include <sys/malloc.h>
106 #include <sys/vmmeter.h>
107 #include <sys/sysctl.h>
108 #include <sys/blist.h>
109 #include <sys/lock.h>
110 #include <sys/kcollect.h>
111 
112 #include <vm/vm.h>
113 #include <vm/vm_object.h>
114 #include <vm/vm_page.h>
115 #include <vm/vm_pager.h>
116 #include <vm/vm_pageout.h>
117 #include <vm/swap_pager.h>
118 #include <vm/vm_extern.h>
119 #include <vm/vm_zone.h>
120 #include <vm/vnode_pager.h>
121 
122 #include <sys/buf2.h>
123 #include <vm/vm_page2.h>
124 
125 #ifndef MAX_PAGEOUT_CLUSTER
126 #define MAX_PAGEOUT_CLUSTER	SWB_NPAGES
127 #endif
128 
129 #define SWM_FREE	0x02	/* free, period			*/
130 #define SWM_POP		0x04	/* pop out			*/
131 
132 #define SWBIO_READ	0x01
133 #define SWBIO_WRITE	0x02
134 #define SWBIO_SYNC	0x04
135 #define SWBIO_TTC	0x08	/* for VM_PAGER_TRY_TO_CACHE */
136 
137 struct swfreeinfo {
138 	vm_object_t	object;
139 	vm_pindex_t	basei;
140 	vm_pindex_t	begi;
141 	vm_pindex_t	endi;	/* inclusive */
142 };
143 
144 struct swswapoffinfo {
145 	vm_object_t	object;
146 	int		devidx;
147 	int		shared;
148 };
149 
150 /*
151  * vm_swap_size is in page-sized chunks now.  It was DEV_BSIZE'd chunks
152  * in the old system.
153  */
154 
155 int swap_pager_full;		/* swap space exhaustion (task killing) */
156 int swap_fail_ticks;		/* when we became exhausted */
157 int swap_pager_almost_full;	/* swap space exhaustion (w/ hysteresis)*/
158 swblk_t vm_swap_cache_use;
159 swblk_t vm_swap_anon_use;
160 static int vm_report_swap_allocs;
161 
162 static struct krate kswaprate = { 1 };
163 static int nsw_rcount;		/* free read buffers			*/
164 static int nsw_wcount_sync;	/* limit write buffers / synchronous	*/
165 static int nsw_wcount_async;	/* limit write buffers / asynchronous	*/
166 static int nsw_wcount_async_max;/* assigned maximum			*/
167 static int nsw_cluster_max;	/* maximum VOP I/O allowed		*/
168 
169 struct blist *swapblist;
170 static int swap_async_max = 4;	/* maximum in-progress async I/O's	*/
171 static int swap_burst_read = 0;	/* allow burst reading */
172 static swblk_t swapiterator;	/* linearize allocations */
173 int swap_user_async = 0;	/* user swap pager operation can be async */
174 
175 static struct spinlock swapbp_spin = SPINLOCK_INITIALIZER(&swapbp_spin, "swapbp_spin");
176 
177 /* from vm_swap.c */
178 extern struct vnode *swapdev_vp;
179 extern struct swdevt *swdevt;
180 extern int nswdev;
181 
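/*
 * Map a swap block number to the swap device it resides on.  Swap space
 * is interleaved across the configured devices in SWB_DMMAX-page stripes.
 */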
182 #define BLK2DEVIDX(blk) (nswdev > 1 ? blk / SWB_DMMAX % nswdev : 0)
183 
184 SYSCTL_INT(_vm, OID_AUTO, swap_async_max,
185         CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops");
186 SYSCTL_INT(_vm, OID_AUTO, swap_burst_read,
187         CTLFLAG_RW, &swap_burst_read, 0, "Allow burst reads for pageins");
188 SYSCTL_INT(_vm, OID_AUTO, swap_user_async,
189         CTLFLAG_RW, &swap_user_async, 0, "Allow async user swap write I/O");
190 
191 #if SWBLK_BITS == 64
192 SYSCTL_LONG(_vm, OID_AUTO, swap_cache_use,
193         CTLFLAG_RD, &vm_swap_cache_use, 0, "");
194 SYSCTL_LONG(_vm, OID_AUTO, swap_anon_use,
195         CTLFLAG_RD, &vm_swap_anon_use, 0, "");
196 SYSCTL_LONG(_vm, OID_AUTO, swap_size,
197         CTLFLAG_RD, &vm_swap_size, 0, "");
198 #else
199 SYSCTL_INT(_vm, OID_AUTO, swap_cache_use,
200         CTLFLAG_RD, &vm_swap_cache_use, 0, "");
201 SYSCTL_INT(_vm, OID_AUTO, swap_anon_use,
202         CTLFLAG_RD, &vm_swap_anon_use, 0, "");
203 SYSCTL_INT(_vm, OID_AUTO, swap_size,
204         CTLFLAG_RD, &vm_swap_size, 0, "");
205 #endif
206 SYSCTL_INT(_vm, OID_AUTO, report_swap_allocs,
207         CTLFLAG_RW, &vm_report_swap_allocs, 0, "");
208 
209 __read_mostly vm_zone_t	swap_zone;
210 
211 /*
212  * Red-Black tree for swblock entries
213  *
214  * The caller must hold vm_token
215  */
216 RB_GENERATE2(swblock_rb_tree, swblock, swb_entry, rb_swblock_compare,
217 	     vm_pindex_t, swb_index);
218 
219 int
220 rb_swblock_compare(struct swblock *swb1, struct swblock *swb2)
221 {
222 	if (swb1->swb_index < swb2->swb_index)
223 		return(-1);
224 	if (swb1->swb_index > swb2->swb_index)
225 		return(1);
226 	return(0);
227 }
228 
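/*
 * Range comparators for RB tree scans: scancmp bounds the scan to
 * [basei, endi], while condcmp only enforces the lower bound so the
 * scan can continue to the end of the tree.
 */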
229 static
230 int
231 rb_swblock_scancmp(struct swblock *swb, void *data)
232 {
233 	struct swfreeinfo *info = data;
234 
235 	if (swb->swb_index < info->basei)
236 		return(-1);
237 	if (swb->swb_index > info->endi)
238 		return(1);
239 	return(0);
240 }
241 
242 static
243 int
244 rb_swblock_condcmp(struct swblock *swb, void *data)
245 {
246 	struct swfreeinfo *info = data;
247 
248 	if (swb->swb_index < info->basei)
249 		return(-1);
250 	return(0);
251 }
252 
253 /*
254  * pagerops for OBJT_SWAP - "swap pager".  Some ops are also global procedure
255  * calls hooked from other parts of the VM system and do not appear here.
256  * (see vm/swap_pager.h).
257  */
258 
259 static void	swap_pager_dealloc (vm_object_t object);
260 static int	swap_pager_getpage (vm_object_t, vm_page_t *, int);
261 static void	swap_chain_iodone(struct bio *biox);
262 
263 struct pagerops swappagerops = {
264 	swap_pager_dealloc,	/* deallocate an OBJT_SWAP object	*/
265 	swap_pager_getpage,	/* pagein				*/
266 	swap_pager_putpages,	/* pageout				*/
267 	swap_pager_haspage	/* get backing store status for page	*/
268 };
269 
270 /*
271  * SWB_DMMAX is in page-sized chunks with the new swap system.  It was
272  * dev-bsized chunks in the old.  SWB_DMMAX is always a power of 2.
273  *
274  * swap_*() routines are externally accessible.  swp_*() routines are
275  * internal.
276  */
277 
278 int nswap_lowat = 128;		/* in pages, set almost_full below this	*/
279 int nswap_hiwat = 512;		/* in pages, clear almost_full above this */
280 
281 static __inline void	swp_sizecheck (void);
282 static void	swp_pager_async_iodone (struct bio *bio);
283 
284 /*
285  * Swap bitmap functions
286  */
287 
288 static __inline void	swp_pager_freeswapspace(vm_object_t object,
289 						swblk_t blk, int npages);
290 static __inline swblk_t	swp_pager_getswapspace(vm_object_t object, int npages);
291 
292 /*
293  * Metadata functions
294  */
295 
296 static void swp_pager_meta_convert(vm_object_t);
297 static void swp_pager_meta_build(vm_object_t, vm_pindex_t, swblk_t);
298 static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t);
299 static void swp_pager_meta_free_all(vm_object_t);
300 static swblk_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int);
301 
302 /*
303  * SWP_SIZECHECK() -	update swap_pager_full indication
304  *
305  *	update the swap_pager_almost_full indication and warn when we are
306  *	about to run out of swap space, using lowat/hiwat hysteresis.
307  *
308  *	Clear swap_pager_full ( task killing ) indication when lowat is met.
309  *
310  * No restrictions on call
311  * This routine may not block.
312  * SMP races are ok.
313  */
314 static __inline void
315 swp_sizecheck(void)
316 {
317 	if (vm_swap_size < nswap_lowat) {
318 		if (swap_pager_almost_full == 0) {
319 			kprintf("swap_pager: out of swap space\n");
320 			swap_pager_almost_full = 1;
321 			swap_fail_ticks = ticks;
322 		}
323 	} else {
324 		swap_pager_full = 0;
325 		if (vm_swap_size > nswap_hiwat)
326 			swap_pager_almost_full = 0;
327 	}
328 }
329 
330 /*
331  * Long-term data collection on 10-second interval.  Return the value
332  * for KCOLLECT_SWAPPCT and set the values for SWAPANO and SWAPCCAC.
333  *
334  * Return total swap in the scale field.  This can change if swap is
335  * regularly added or removed and may cause some historical confusion
336  * in that case, but SWAPPCT will always be historically accurate.
337  */
338 
339 #define PTOB(value)	((uint64_t)(value) << PAGE_SHIFT)
340 
341 static uint64_t
342 collect_swap_callback(int n)
343 {
344 	uint64_t total = vm_swap_max;
345 	uint64_t anon = vm_swap_anon_use;
346 	uint64_t cache = vm_swap_cache_use;
347 
348 	if (total == 0)		/* avoid divide by zero */
349 		total = 1;
350 	kcollect_setvalue(KCOLLECT_SWAPANO, PTOB(anon));
351 	kcollect_setvalue(KCOLLECT_SWAPCAC, PTOB(cache));
352 	kcollect_setscale(KCOLLECT_SWAPANO,
353 			  KCOLLECT_SCALE(KCOLLECT_SWAPANO_FORMAT, PTOB(total)));
354 	kcollect_setscale(KCOLLECT_SWAPCAC,
355 			  KCOLLECT_SCALE(KCOLLECT_SWAPCAC_FORMAT, PTOB(total)));
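	/* Usage is returned in units of 0.01% (0..10000), rounded to nearest */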
356 	return (((anon + cache) * 10000 + (total >> 1)) / total);
357 }
358 
359 /*
360  * SWAP_PAGER_INIT() -	initialize the swap pager!
361  *
362  *	Expected to be started from system init.  NOTE:  This code is run
363  *	before much else so be careful what you depend on.  Most of the VM
364  *	system has yet to be initialized at this point.
365  *
366  * Called from the low level boot code only.
367  */
368 static void
369 swap_pager_init(void *arg __unused)
370 {
371 	kcollect_register(KCOLLECT_SWAPPCT, "swapuse", collect_swap_callback,
372 			  KCOLLECT_SCALE(KCOLLECT_SWAPPCT_FORMAT, 0));
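	/*
	 * SWAPANO and SWAPCAC are registered without their own callbacks;
	 * collect_swap_callback() fills in their values when SWAPPCT is
	 * sampled.
	 */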
373 	kcollect_register(KCOLLECT_SWAPANO, "swapano", NULL,
374 			  KCOLLECT_SCALE(KCOLLECT_SWAPANO_FORMAT, 0));
375 	kcollect_register(KCOLLECT_SWAPCAC, "swapcac", NULL,
376 			  KCOLLECT_SCALE(KCOLLECT_SWAPCAC_FORMAT, 0));
377 }
378 SYSINIT(vm_mem, SI_BOOT1_VM, SI_ORDER_THIRD, swap_pager_init, NULL);
379 
380 /*
381  * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process
382  *
383  *	Expected to be started from pageout process once, prior to entering
384  *	its main loop.
385  *
386  * Called from the low level boot code only.
387  */
388 void
389 swap_pager_swap_init(void)
390 {
391 	int n, n2;
392 
393 	/*
394 	 * Number of in-transit swap bp operations.  Don't
395 	 * exhaust the pbufs completely.  Make sure we
396 	 * initialize workable values (0 will work for hysteresis
397 	 * but it isn't very efficient).
398 	 *
399 	 * The nsw_cluster_max is constrained by the number of pages an XIO
400 	 * holds, i.e., (MAXPHYS/PAGE_SIZE) and our locally defined
401 	 * MAX_PAGEOUT_CLUSTER.   Also be aware that swap ops are
402 	 * constrained by the swap device interleave stripe size.
403 	 *
404 	 * Currently we hardwire nsw_wcount_async to 4.  This limit is
405 	 * designed to prevent other I/O from having high latencies due to
406 	 * our pageout I/O.  The value 4 works well for one or two active swap
407 	 * devices but is probably a little low if you have more.  Even so,
408 	 * a higher value would probably generate only a limited improvement
409 	 * with three or four active swap devices since the system does not
410 	 * typically have to pageout at extreme bandwidths.   We will want
411 	 * at least 2 per swap device, and 4 is a pretty good value if you
412 	 * have one NFS swap device due to the command/ack latency over NFS.
413 	 * So it all works out pretty well.
414 	 */
415 
416 	nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER);
417 
418 	nsw_rcount = (nswbuf_kva + 1) / 2;
419 	nsw_wcount_sync = (nswbuf_kva + 3) / 4;
420 	nsw_wcount_async = 4;
421 	nsw_wcount_async_max = nsw_wcount_async;
422 
423 	/*
424 	 * The zone is dynamically allocated so generally size it to
425 	 * maxswzone (32MB to 256GB of KVM).  Set a minimum size based
426 	 * on physical memory of around 8x (each swblock can hold 16 pages).
427 	 *
428 	 * With the advent of SSDs (vs HDs) the practical (swap:memory) ratio
429 	 * has increased dramatically.
430 	 */
431 	n = vmstats.v_page_count / 2;
432 	if (maxswzone && n < maxswzone / sizeof(struct swblock))
433 		n = maxswzone / sizeof(struct swblock);
434 	n2 = n;
435 
436 	do {
437 		swap_zone = zinit(
438 			"SWAPMETA",
439 			sizeof(struct swblock),
440 			n,
441 			ZONE_INTERRUPT);
442 		if (swap_zone != NULL)
443 			break;
444 		/*
445 		 * if the allocation failed, try a zone two thirds the
446 		 * size of the previous attempt.
447 		 */
448 		n -= ((n + 2) / 3);
449 	} while (n > 0);
450 
451 	if (swap_zone == NULL)
452 		panic("swap_pager_swap_init: swap_zone == NULL");
453 	if (n2 != n)
454 		kprintf("Swap zone entries reduced from %d to %d.\n", n2, n);
455 }
456 
457 /*
458  * SWAP_PAGER_ALLOC() -	allocate a new OBJT_SWAP VM object and instantiate
459  *			its metadata structures.
460  *
461  *	This routine is called from the mmap and fork code to create a new
462  *	OBJT_SWAP object.  We do this by creating an OBJT_DEFAULT object
463  *	and then converting it with swp_pager_meta_convert().
464  *
465  *	We only support unnamed objects.
466  *
467  * No restrictions.
468  */
469 vm_object_t
470 swap_pager_alloc(void *handle, off_t size, vm_prot_t prot, off_t offset)
471 {
472 	vm_object_t object;
473 
474 	KKASSERT(handle == NULL);
475 	object = vm_object_allocate_hold(OBJT_DEFAULT,
476 					 OFF_TO_IDX(offset + PAGE_MASK + size));
477 	swp_pager_meta_convert(object);
478 	vm_object_drop(object);
479 
480 	return (object);
481 }
482 
483 /*
484  * SWAP_PAGER_DEALLOC() -	remove swap metadata from object
485  *
486  *	The swap backing for the object is destroyed.  The code is
487  *	designed such that we can reinstantiate it later, but this
488  *	routine is typically called only when the entire object is
489  *	about to be destroyed.
490  *
491  * The object must be locked or unreferenceable.
492  * No other requirements.
493  */
494 static void
495 swap_pager_dealloc(vm_object_t object)
496 {
497 	vm_object_hold(object);
498 	vm_object_pip_wait(object, "swpdea");
499 
500 	/*
501 	 * Free all remaining metadata.  We only bother to free it from
502 	 * the swap meta data.  We do not attempt to free swapblk's still
503 	 * associated with vm_page_t's for this object.  We do not care
504 	 * if paging is still in progress on some objects.
505 	 */
506 	swp_pager_meta_free_all(object);
507 	vm_object_drop(object);
508 }
509 
510 /************************************************************************
511  *			SWAP PAGER BITMAP ROUTINES			*
512  ************************************************************************/
513 
514 /*
515  * SWP_PAGER_GETSWAPSPACE() -	allocate raw swap space
516  *
517  *	Allocate swap for the requested number of pages.  The starting
518  *	swap block number (a page index) is returned or SWAPBLK_NONE
519  *	if the allocation failed.
520  *
521  *	Also has the side effect of advising that somebody made a mistake
522  *	when they configured swap and didn't configure enough.
523  *
524  * The caller must hold the object.
525  * This routine may not block.
526  */
527 static __inline swblk_t
528 swp_pager_getswapspace(vm_object_t object, int npages)
529 {
530 	swblk_t blk;
531 
532 	lwkt_gettoken(&vm_token);
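	/*
	 * Try to allocate near the iterator hint first, then retry from
	 * the beginning of the swap bitmap.
	 */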
533 	blk = blist_allocat(swapblist, npages, swapiterator);
534 	if (blk == SWAPBLK_NONE)
535 		blk = blist_allocat(swapblist, npages, 0);
536 	if (blk == SWAPBLK_NONE) {
537 		if (swap_pager_full != 2) {
538 			if (vm_swap_max == 0) {
539 				krateprintf(&kswaprate,
540 					"Warning: The system would like to "
541 					"page to swap but no swap space "
542 					"is configured!\n");
543 			} else {
544 				krateprintf(&kswaprate,
545 					"swap_pager_getswapspace: "
546 					"swap full allocating %d pages\n",
547 					npages);
548 			}
549 			swap_pager_full = 2;
550 			if (swap_pager_almost_full == 0)
551 				swap_fail_ticks = ticks;
552 			swap_pager_almost_full = 1;
553 		}
554 	} else {
555 		/* swapiterator = blk; disable for now, doesn't work well */
556 		swapacctspace(blk, -npages);
557 		if (object->type == OBJT_SWAP)
558 			vm_swap_anon_use += npages;
559 		else
560 			vm_swap_cache_use += npages;
561 		swp_sizecheck();
562 	}
563 	lwkt_reltoken(&vm_token);
564 	return(blk);
565 }
566 
567 /*
568  * SWP_PAGER_FREESWAPSPACE() -	free raw swap space
569  *
570  *	This routine returns the specified swap blocks back to the bitmap.
571  *
572  *	Note:  This routine may not block (it could in the old swap code),
573  *	and through the use of the new blist routines it does not block.
574  *
575  * This routine may not block.
576  */
577 
578 static __inline void
579 swp_pager_freeswapspace(vm_object_t object, swblk_t blk, int npages)
580 {
581 	struct swdevt *sp = &swdevt[BLK2DEVIDX(blk)];
582 
583 	lwkt_gettoken(&vm_token);
584 	sp->sw_nused -= npages;
585 	if (object->type == OBJT_SWAP)
586 		vm_swap_anon_use -= npages;
587 	else
588 		vm_swap_cache_use -= npages;
589 
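	/*
	 * If the underlying device is being detached, do not return the
	 * blocks to the bitmap.
	 */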
590 	if (sp->sw_flags & SW_CLOSING) {
591 		lwkt_reltoken(&vm_token);
592 		return;
593 	}
594 
595 	blist_free(swapblist, blk, npages);
596 	vm_swap_size += npages;
597 	swp_sizecheck();
598 	lwkt_reltoken(&vm_token);
599 }
600 
601 /*
602  * SWAP_PAGER_FREESPACE() -	frees swap blocks associated with a page
603  *				range within an object.
604  *
605  *	This is a globally accessible routine.
606  *
607  *	This routine removes swapblk assignments from swap metadata.
608  *
609  *	The external callers of this routine typically have already destroyed
610  *	or renamed vm_page_t's associated with this range in the object so
611  *	we should be ok.
612  *
613  * No requirements.
614  */
615 void
616 swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_pindex_t size)
617 {
618 	vm_object_hold(object);
619 	swp_pager_meta_free(object, start, size);
620 	vm_object_drop(object);
621 }
622 
623 /*
624  * No requirements.
625  */
626 void
627 swap_pager_freespace_all(vm_object_t object)
628 {
629 	vm_object_hold(object);
630 	swp_pager_meta_free_all(object);
631 	vm_object_drop(object);
632 }
633 
634 /*
635  * This function conditionally frees swap cache swap starting at
636  * (*basei) in the object.  (count) swap blocks will be nominally freed.
637  * The actual number of blocks freed can be more or less than the
638  * requested number.
639  *
640  * This function nominally returns the number of blocks freed.  However,
641  * the actual number of blocks freed may be less than the returned value.
642  * If the function is unable to exhaust the object or if it is able to
643  * free (approximately) the requested number of blocks it returns
644  * a value n > count.
645  *
646  * If we exhaust the object we will return a value n <= count.
647  *
648  * The caller must hold the object.
649  *
650  * WARNING!  If count == 0 then -1 can be returned as a degenerate case;
651  *	     callers should always pass a count value > 0.
652  */
653 static int swap_pager_condfree_callback(struct swblock *swap, void *data);
654 
655 int
656 swap_pager_condfree(vm_object_t object, vm_pindex_t *basei, int count)
657 {
658 	struct swfreeinfo info;
659 	int n;
660 	int t;
661 
662 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
663 
664 	info.object = object;
665 	info.basei = *basei;	/* skip up to this page index */
666 	info.begi = count;	/* max swap pages to destroy */
667 	info.endi = count * 8;	/* max swblocks to scan */
668 
669 	swblock_rb_tree_RB_SCAN(&object->swblock_root, rb_swblock_condcmp,
670 				swap_pager_condfree_callback, &info);
671 	*basei = info.basei;
672 
673 	/*
674 	 * Return the larger count: pages freed vs swblocks scanned (min 1)
675 	 */
676 	n = count - (int)info.begi;
677 	t = count * 8 - (int)info.endi;
678 	if (n < t)
679 		n = t;
680 	if (n < 1)
681 		n = 1;
682 	return(n);
683 }
684 
685 /*
686  * The idea is to free whole meta-block to avoid fragmenting
687  * the swap space or disk I/O.  We only do this if NO VM pages
688  * are present.
689  *
690  * We do not have to deal with clearing PG_SWAPPED in related VM
691  * pages because there are no related VM pages.
692  *
693  * The caller must hold the object.
694  */
695 static int
696 swap_pager_condfree_callback(struct swblock *swap, void *data)
697 {
698 	struct swfreeinfo *info = data;
699 	vm_object_t object = info->object;
700 	int i;
701 
702 	for (i = 0; i < SWAP_META_PAGES; ++i) {
703 		if (vm_page_lookup(object, swap->swb_index + i))
704 			break;
705 	}
706 	info->basei = swap->swb_index + SWAP_META_PAGES;
707 	if (i == SWAP_META_PAGES) {
708 		info->begi -= swap->swb_count;
709 		swap_pager_freespace(object, swap->swb_index, SWAP_META_PAGES);
710 	}
711 	--info->endi;
712 	if ((int)info->begi < 0 || (int)info->endi < 0)
713 		return(-1);
714 	lwkt_yield();
715 	return(0);
716 }
717 
718 /*
719  * Called by vm_page_alloc() when a new VM page is inserted
720  * into a VM object.  Checks whether swap has been assigned to
721  * the page and sets PG_SWAPPED as necessary.
722  *
723  * (m) must be busied by caller and remains busied on return.
724  */
725 void
726 swap_pager_page_inserted(vm_page_t m)
727 {
728 	if (m->object->swblock_count) {
729 		vm_object_hold(m->object);
730 		if (swp_pager_meta_ctl(m->object, m->pindex, 0) != SWAPBLK_NONE)
731 			vm_page_flag_set(m, PG_SWAPPED);
732 		vm_object_drop(m->object);
733 	}
734 }
735 
736 /*
737  * SWAP_PAGER_RESERVE() - reserve swap blocks in object
738  *
739  *	Assigns swap blocks to the specified range within the object.  The
740  *	swap blocks are not zeroed.  Any previous swap assignment is destroyed.
741  *
742  *	Returns 0 on success, -1 on failure.
743  *
744  * The caller is responsible for avoiding races in the specified range.
745  * No other requirements.
746  */
747 int
748 swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size)
749 {
750 	int n = 0;
751 	swblk_t blk = SWAPBLK_NONE;
752 	vm_pindex_t beg = start;	/* save start index */
753 
754 	vm_object_hold(object);
755 
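	/*
	 * Allocate swap in the largest contiguous chunks we can get,
	 * halving the request size on failure, and assign the blocks to
	 * the range one page at a time.  If we cannot get even a single
	 * block, undo the assignments already made and fail.
	 */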
756 	while (size) {
757 		if (n == 0) {
758 			n = BLIST_MAX_ALLOC;
759 			while ((blk = swp_pager_getswapspace(object, n)) ==
760 			       SWAPBLK_NONE)
761 			{
762 				n >>= 1;
763 				if (n == 0) {
764 					swp_pager_meta_free(object, beg,
765 							    start - beg);
766 					vm_object_drop(object);
767 					return(-1);
768 				}
769 			}
770 		}
771 		swp_pager_meta_build(object, start, blk);
772 		--size;
773 		++start;
774 		++blk;
775 		--n;
776 	}
777 	swp_pager_meta_free(object, start, n);
778 	vm_object_drop(object);
779 	return(0);
780 }
781 
782 /*
783  * SWAP_PAGER_COPY() -  copy blocks from source pager to destination pager
784  *			and destroy the source.
785  *
786  *	Copy any valid swapblks from the source to the destination.  In
787  *	cases where both the source and destination have a valid swapblk,
788  *	we keep the destination's.
789  *
790  *	This routine is allowed to block.  It may block allocating metadata
791  *	indirectly through swp_pager_meta_build() or if paging is still in
792  *	progress on the source.
793  *
794  *	XXX vm_page_collapse() kinda expects us not to block because we
795  *	supposedly do not need to allocate memory, but for the moment we
796  *	*may* have to get a little memory from the zone allocator, but
797  *	it is taken from the interrupt memory.  We should be ok.
798  *
799  *	The source object contains no vm_page_t's (which is just as well)
800  *	The source object is of type OBJT_SWAP.
801  *
802  *	The source and destination objects must be held by the caller.
803  */
804 void
805 swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject,
806 		vm_pindex_t base_index, int destroysource)
807 {
808 	vm_pindex_t i;
809 
810 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(srcobject));
811 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(dstobject));
812 
813 	/*
814 	 * transfer source to destination.
815 	 */
816 	for (i = 0; i < dstobject->size; ++i) {
817 		swblk_t dstaddr;
818 
819 		/*
820 		 * Locate (without changing) the swapblk on the destination,
821 		 * unless it is invalid in which case free it silently, or
822 		 * if the destination is a resident page, in which case the
823 		 * source is thrown away.
824 		 */
825 		dstaddr = swp_pager_meta_ctl(dstobject, i, 0);
826 
827 		if (dstaddr == SWAPBLK_NONE) {
828 			/*
829 			 * Destination has no swapblk and is not resident,
830 			 * copy source.
831 			 */
832 			swblk_t srcaddr;
833 
834 			srcaddr = swp_pager_meta_ctl(srcobject,
835 						     base_index + i, SWM_POP);
836 
837 			if (srcaddr != SWAPBLK_NONE)
838 				swp_pager_meta_build(dstobject, i, srcaddr);
839 		} else {
840 			/*
841 			 * Destination has valid swapblk or it is represented
842 			 * by a resident page.  We destroy the sourceblock.
843 			 */
844 			swp_pager_meta_ctl(srcobject, base_index + i, SWM_FREE);
845 		}
846 	}
847 
848 	/*
849 	 * Free left over swap blocks in source.
850 	 *
851  * We have to revert the type to OBJT_DEFAULT so we do not accidentally
852 	 * double-remove the object from the swap queues.
853 	 */
854 	if (destroysource) {
855 		/*
856 		 * Reverting the type is not necessary, the caller is going
857 		 * to destroy srcobject directly, but I'm doing it here
858 		 * for consistency since we've removed the object from its
859 		 * queues.
860 		 */
861 		swp_pager_meta_free_all(srcobject);
862 		if (srcobject->type == OBJT_SWAP)
863 			srcobject->type = OBJT_DEFAULT;
864 	}
865 }
866 
867 /*
868  * SWAP_PAGER_HASPAGE() -	determine if we have good backing store for
869  *				the requested page.
870  *
871  *	We determine whether good backing store exists for the requested
872  *	page and return TRUE if it does, FALSE if it doesn't.
873  *
874  *	If TRUE, we also try to determine how much valid, contiguous backing
875  *	store exists before and after the requested page within a reasonable
876  *	distance.  We do not try to restrict it to the swap device stripe
877  *	(that is handled in getpages/putpages).  It probably isn't worth
878  *	doing here.
879  *
880  * No requirements.
881  */
882 boolean_t
883 swap_pager_haspage(vm_object_t object, vm_pindex_t pindex)
884 {
885 	swblk_t blk0;
886 
887 	/*
888 	 * do we have good backing store at the requested index ?
889 	 */
890 	vm_object_hold(object);
891 	blk0 = swp_pager_meta_ctl(object, pindex, 0);
892 
893 	if (blk0 == SWAPBLK_NONE) {
894 		vm_object_drop(object);
895 		return (FALSE);
896 	}
897 	vm_object_drop(object);
898 	return (TRUE);
899 }
900 
901 /*
902  * Object must be held exclusive or shared by the caller.
903  */
904 boolean_t
905 swap_pager_haspage_locked(vm_object_t object, vm_pindex_t pindex)
906 {
907 	if (swp_pager_meta_ctl(object, pindex, 0) == SWAPBLK_NONE)
908 		return FALSE;
909 	return TRUE;
910 }
911 
912 /*
913  * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
914  *
915  * This removes any associated swap backing store, whether valid or
916  * not, from the page.  This operates on any VM object, not just OBJT_SWAP
917  * objects.
918  *
919  * This routine is typically called when a page is made dirty, at
920  * which point any associated swap can be freed.  MADV_FREE also
921  * calls us in a special-case situation.
922  *
923  * NOTE!!!  If the page is clean and the swap was valid, the caller
924  *	    should make the page dirty before calling this routine.
925  *	    This routine does NOT change the m->dirty status of the page.
926  *	    Also: MADV_FREE depends on it.
927  *
928  * The page must be busied.
929  * The caller can hold the object to avoid blocking, else we might block.
930  * No other requirements.
931  */
932 void
933 swap_pager_unswapped(vm_page_t m)
934 {
935 	if (m->flags & PG_SWAPPED) {
936 		vm_object_hold(m->object);
937 		KKASSERT(m->flags & PG_SWAPPED);
938 		swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE);
939 		vm_page_flag_clear(m, PG_SWAPPED);
940 		vm_object_drop(m->object);
941 	}
942 }
943 
944 /*
945  * SWAP_PAGER_STRATEGY() - read, write, free blocks
946  *
947  * This implements a VM OBJECT strategy function using swap backing store.
948  * This can operate on any VM OBJECT type, not necessarily just OBJT_SWAP
949  * types.  Only BUF_CMD_{READ,WRITE,FREEBLKS} is supported, any other
950  * requests will return EINVAL.
951  *
952  * This is intended to be a cacheless interface (i.e. caching occurs at
953  * higher levels), and is also used as a swap-based SSD cache for vnode
954  * and device objects.
955  *
956  * All I/O goes directly to and from the swap device.
957  *
958  * We currently attempt to run I/O synchronously or asynchronously as
959  * the caller requests.  This isn't perfect because we lose error
960  * sequencing when we run multiple ops in parallel to satisfy a request.
961  * But this is swap, so we let it all hang out.
962  *
963  * NOTE: This function supports the KVABIO API wherein bp->b_data might
964  *	 not be synchronized to the current cpu.
965  *
966  * No requirements.
967  */
968 void
969 swap_pager_strategy(vm_object_t object, struct bio *bio)
970 {
971 	struct buf *bp = bio->bio_buf;
972 	struct bio *nbio;
973 	vm_pindex_t start;
974 	vm_pindex_t biox_blkno = 0;
975 	int count;
976 	char *data;
977 	struct bio *biox;
978 	struct buf *bufx;
979 #if 0
980 	struct bio_track *track;
981 #endif
982 
983 #if 0
984 	/*
985 	 * tracking for swapdev vnode I/Os
986 	 */
987 	if (bp->b_cmd == BUF_CMD_READ)
988 		track = &swapdev_vp->v_track_read;
989 	else
990 		track = &swapdev_vp->v_track_write;
991 #endif
992 
993 	/*
994 	 * Only supported commands
995 	 */
996 	if (bp->b_cmd != BUF_CMD_FREEBLKS &&
997 	    bp->b_cmd != BUF_CMD_READ &&
998 	    bp->b_cmd != BUF_CMD_WRITE) {
999 		bp->b_error = EINVAL;
1000 		bp->b_flags |= B_ERROR | B_INVAL;
1001 		biodone(bio);
1002 		return;
1003 	}
1004 
1005 	/*
1006 	 * bcount must be an integral number of pages.
1007 	 */
1008 	if (bp->b_bcount & PAGE_MASK) {
1009 		bp->b_error = EINVAL;
1010 		bp->b_flags |= B_ERROR | B_INVAL;
1011 		biodone(bio);
1012 		kprintf("swap_pager_strategy: bp %p offset %lld size %d, "
1013 			"not page bounded\n",
1014 			bp, (long long)bio->bio_offset, (int)bp->b_bcount);
1015 		return;
1016 	}
1017 
1018 	/*
1019 	 * Clear error indication, initialize page index, count, data pointer.
1020 	 */
1021 	bp->b_error = 0;
1022 	bp->b_flags &= ~B_ERROR;
1023 	bp->b_resid = bp->b_bcount;
1024 
1025 	start = (vm_pindex_t)(bio->bio_offset >> PAGE_SHIFT);
1026 	count = howmany(bp->b_bcount, PAGE_SIZE);
1027 
1028 	/*
1029 	 * WARNING!  Do not dereference *data without issuing a bkvasync()
1030 	 */
1031 	data = bp->b_data;
1032 
1033 	/*
1034 	 * Deal with BUF_CMD_FREEBLKS
1035 	 */
1036 	if (bp->b_cmd == BUF_CMD_FREEBLKS) {
1037 		/*
1038 		 * FREE PAGE(s) - destroy underlying swap that is no longer
1039 		 *		  needed.
1040 		 */
1041 		vm_object_hold(object);
1042 		swp_pager_meta_free(object, start, count);
1043 		vm_object_drop(object);
1044 		bp->b_resid = 0;
1045 		biodone(bio);
1046 		return;
1047 	}
1048 
1049 	/*
1050 	 * We need to be able to create a new cluster of I/O's.  We cannot
1051 	 * use the caller fields of the passed bio so push a new one.
1052 	 *
1053 	 * Because nbio is just a placeholder for the cluster links,
1054 	 * we can biodone() the original bio instead of nbio to make
1055 	 * things a bit more efficient.
1056 	 */
1057 	nbio = push_bio(bio);
1058 	nbio->bio_offset = bio->bio_offset;
1059 	nbio->bio_caller_info1.cluster_head = NULL;
1060 	nbio->bio_caller_info2.cluster_tail = NULL;
1061 
1062 	biox = NULL;
1063 	bufx = NULL;
1064 
1065 	/*
1066 	 * Execute read or write
1067 	 */
1068 	vm_object_hold(object);
1069 
1070 	while (count > 0) {
1071 		swblk_t blk;
1072 
1073 		/*
1074 		 * Obtain block.  If block not found and writing, allocate a
1075 		 * new block and build it into the object.
1076 		 */
1077 		blk = swp_pager_meta_ctl(object, start, 0);
1078 		if ((blk == SWAPBLK_NONE) && bp->b_cmd == BUF_CMD_WRITE) {
1079 			blk = swp_pager_getswapspace(object, 1);
1080 			if (blk == SWAPBLK_NONE) {
1081 				bp->b_error = ENOMEM;
1082 				bp->b_flags |= B_ERROR;
1083 				break;
1084 			}
1085 			swp_pager_meta_build(object, start, blk);
1086 		}
1087 
1088 		/*
1089 		 * Do we have to flush our current collection?  Yes if:
1090 		 *
1091 		 *	- no swap block at this index
1092 		 *	- swap block is not contiguous
1093 		 *	- we cross a physical disk boundary in the
1094 		 *	  stripe.
1095 		 */
1096 		if (biox &&
1097 		    (biox_blkno + btoc(bufx->b_bcount) != blk ||
1098 		     ((biox_blkno ^ blk) & ~SWB_DMMASK))) {
1099 			switch(bp->b_cmd) {
1100 			case BUF_CMD_READ:
1101 				++mycpu->gd_cnt.v_swapin;
1102 				mycpu->gd_cnt.v_swappgsin +=
1103 					btoc(bufx->b_bcount);
1104 				break;
1105 			case BUF_CMD_WRITE:
1106 				++mycpu->gd_cnt.v_swapout;
1107 				mycpu->gd_cnt.v_swappgsout +=
1108 					btoc(bufx->b_bcount);
1109 				bufx->b_dirtyend = bufx->b_bcount;
1110 				break;
1111 			default:
1112 				/* NOT REACHED */
1113 				break;
1114 			}
1115 
1116 			/*
1117 			 * Finished with this buf.
1118 			 */
1119 			KKASSERT(bufx->b_bcount != 0);
1120 			if (bufx->b_cmd != BUF_CMD_READ)
1121 				bufx->b_dirtyend = bufx->b_bcount;
1122 			biox = NULL;
1123 			bufx = NULL;
1124 		}
1125 
1126 		/*
1127 		 * Add new swapblk to biox, instantiating biox if necessary.
1128 		 * Zero-fill reads are able to take a shortcut.
1129 		 */
1130 		if (blk == SWAPBLK_NONE) {
1131 			/*
1132 			 * We can only get here if we are reading.
1133 			 */
1134 			bkvasync(bp);
1135 			bzero(data, PAGE_SIZE);
1136 			bp->b_resid -= PAGE_SIZE;
1137 		} else {
1138 			if (biox == NULL) {
1139 				/* XXX chain count > 4, wait to <= 4 */
1140 
1141 				bufx = getpbuf(NULL);
1142 				bufx->b_flags |= B_KVABIO;
1143 				biox = &bufx->b_bio1;
1144 				cluster_append(nbio, bufx);
1145 				bufx->b_cmd = bp->b_cmd;
1146 				biox->bio_done = swap_chain_iodone;
1147 				biox->bio_offset = (off_t)blk << PAGE_SHIFT;
1148 				biox->bio_caller_info1.cluster_parent = nbio;
1149 				biox_blkno = blk;
1150 				bufx->b_bcount = 0;
1151 				bufx->b_data = data;
1152 			}
1153 			bufx->b_bcount += PAGE_SIZE;
1154 		}
1155 		--count;
1156 		++start;
1157 		data += PAGE_SIZE;
1158 	}
1159 
1160 	vm_object_drop(object);
1161 
1162 	/*
1163 	 *  Flush out last buffer
1164 	 */
1165 	if (biox) {
1166 		if (bufx->b_cmd == BUF_CMD_READ) {
1167 			++mycpu->gd_cnt.v_swapin;
1168 			mycpu->gd_cnt.v_swappgsin += btoc(bufx->b_bcount);
1169 		} else {
1170 			++mycpu->gd_cnt.v_swapout;
1171 			mycpu->gd_cnt.v_swappgsout += btoc(bufx->b_bcount);
1172 			bufx->b_dirtyend = bufx->b_bcount;
1173 		}
1174 		KKASSERT(bufx->b_bcount);
1175 		if (bufx->b_cmd != BUF_CMD_READ)
1176 			bufx->b_dirtyend = bufx->b_bcount;
1177 		/* biox, bufx = NULL */
1178 	}
1179 
1180 	/*
1181 	 * Now initiate all the I/O.  Be careful looping on our chain as
1182 	 * I/O's may complete while we are still initiating them.
1183 	 *
1184 	 * If the request is a 100% sparse read no bios will be present
1185 	 * and we just biodone() the buffer.
1186 	 */
1187 	nbio->bio_caller_info2.cluster_tail = NULL;
1188 	bufx = nbio->bio_caller_info1.cluster_head;
1189 
1190 	if (bufx) {
1191 		while (bufx) {
1192 			biox = &bufx->b_bio1;
1193 			BUF_KERNPROC(bufx);
1194 			bufx = bufx->b_cluster_next;
1195 			vn_strategy(swapdev_vp, biox);
1196 		}
1197 	} else {
1198 		biodone(bio);
1199 	}
1200 
1201 	/*
1202 	 * Completion of the cluster will also call biodone_chain(nbio).
1203 	 * We never call biodone(nbio) so we don't have to worry about
1204 	 * setting up a bio_done callback.  It's handled in the sub-IO.
1205 	 */
1206 	/**/
1207 }
1208 
1209 /*
1210  * biodone callback
1211  *
1212  * No requirements.
1213  */
1214 static void
1215 swap_chain_iodone(struct bio *biox)
1216 {
1217 	struct buf **nextp;
1218 	struct buf *bufx;	/* chained sub-buffer */
1219 	struct bio *nbio;	/* parent nbio with chain glue */
1220 	struct buf *bp;		/* original bp associated with nbio */
1221 	int chain_empty;
1222 
1223 	bufx = biox->bio_buf;
1224 	nbio = biox->bio_caller_info1.cluster_parent;
1225 	bp = nbio->bio_buf;
1226 
1227 	/*
1228 	 * Update the original buffer.  A short sub-I/O counts as an error.
1229 	 */
1230         KKASSERT(bp != NULL);
1231 	if (bufx->b_flags & B_ERROR) {
1232 		atomic_set_int(&bufx->b_flags, B_ERROR);
1233 		bp->b_error = bufx->b_error;	/* race ok */
1234 	} else if (bufx->b_resid != 0) {
1235 		atomic_set_int(&bufx->b_flags, B_ERROR);
1236 		bp->b_error = EINVAL;		/* race ok */
1237 	} else {
1238 		atomic_subtract_int(&bp->b_resid, bufx->b_bcount);
1239 	}
1240 
1241 	/*
1242 	 * Remove us from the chain.
1243 	 */
1244 	spin_lock(&swapbp_spin);
1245 	nextp = &nbio->bio_caller_info1.cluster_head;
1246 	while (*nextp != bufx) {
1247 		KKASSERT(*nextp != NULL);
1248 		nextp = &(*nextp)->b_cluster_next;
1249 	}
1250 	*nextp = bufx->b_cluster_next;
1251 	chain_empty = (nbio->bio_caller_info1.cluster_head == NULL);
1252 	spin_unlock(&swapbp_spin);
1253 
1254 	/*
1255 	 * Clean up bufx.  If the chain is now empty we finish out
1256 	 * the parent.  Note that we may be racing other completions
1257 	 * so we must use the chain_empty status from above.
1258 	 */
1259 	if (chain_empty) {
1260 		if (bp->b_resid != 0 && !(bp->b_flags & B_ERROR)) {
1261 			atomic_set_int(&bp->b_flags, B_ERROR);
1262 			bp->b_error = EINVAL;
1263 		}
1264 		biodone_chain(nbio);
1265         }
1266         relpbuf(bufx, NULL);
1267 }
1268 
1269 /*
1270  * SWAP_PAGER_GETPAGES() - bring page in from swap
1271  *
1272  * The requested page may have to be brought in from swap.  Calculate the
1273  * swap block and bring in additional pages if possible.  All pages must
1274  * have contiguous swap block assignments and reside in the same object.
1275  *
1276  * The caller has a single vm_object_pip_add() reference prior to
1277  * calling us and we should return with the same.
1278  *
1279  * The caller has BUSY'd the page.  We should return with (*mpp) left busy,
1280  * and any additional pages unbusied.
1281  *
1282  * If the caller encounters a PG_RAM page it will pass it to us even though
1283  * it may be valid and dirty.  We cannot overwrite the page in this case!
1284  * The case is used to allow us to issue pure read-aheads.
1285  *
1286  * NOTE! XXX This code does not entirely pipeline yet due to the fact that
1287  *       the PG_RAM page is validated at the same time as mreq.  What we
1288  *	 really need to do is issue a separate read-ahead pbuf.
1289  *
1290  * No requirements.
1291  */
1292 static int
1293 swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
1294 {
1295 	struct buf *bp;
1296 	struct bio *bio;
1297 	vm_page_t mreq;
1298 	vm_page_t m;
1299 	vm_offset_t kva;
1300 	swblk_t blk;
1301 	int i;
1302 	int j;
1303 	int raonly;
1304 	int error;
1305 	u_int32_t busy_count;
1306 	vm_page_t marray[XIO_INTERNAL_PAGES];
1307 
1308 	mreq = *mpp;
1309 
1310 	vm_object_hold(object);
1311 	if (mreq->object != object) {
1312 		panic("swap_pager_getpages: object mismatch %p/%p",
1313 		    object,
1314 		    mreq->object
1315 		);
1316 	}
1317 
1318 	/*
1319 	 * We don't want to overwrite a fully valid page as it might be
1320 	 * dirty.  This case can occur when e.g. vm_fault hits a perfectly
1321 	 * valid page with PG_RAM set.
1322 	 *
1323 	 * In this case we see if the next page is a suitable page-in
1324 	 * candidate and if it is we issue read-ahead.  PG_RAM will be
1325 	 * set on the last page of the read-ahead to continue the pipeline.
1326 	 */
1327 	if (mreq->valid == VM_PAGE_BITS_ALL) {
1328 		if (swap_burst_read == 0 || mreq->pindex + 1 >= object->size) {
1329 			vm_object_drop(object);
1330 			return(VM_PAGER_OK);
1331 		}
1332 		blk = swp_pager_meta_ctl(object, mreq->pindex + 1, 0);
1333 		if (blk == SWAPBLK_NONE) {
1334 			vm_object_drop(object);
1335 			return(VM_PAGER_OK);
1336 		}
1337 		m = vm_page_lookup_busy_try(object, mreq->pindex + 1,
1338 					    TRUE, &error);
1339 		if (error) {
1340 			vm_object_drop(object);
1341 			return(VM_PAGER_OK);
1342 		} else if (m == NULL) {
1343 			/*
1344 			 * Use VM_ALLOC_QUICK to avoid blocking on cache
1345 			 * page reuse.
1346 			 */
1347 			m = vm_page_alloc(object, mreq->pindex + 1,
1348 					  VM_ALLOC_QUICK);
1349 			if (m == NULL) {
1350 				vm_object_drop(object);
1351 				return(VM_PAGER_OK);
1352 			}
1353 		} else {
1354 			if (m->valid) {
1355 				vm_page_wakeup(m);
1356 				vm_object_drop(object);
1357 				return(VM_PAGER_OK);
1358 			}
1359 			vm_page_unqueue_nowakeup(m);
1360 		}
1361 		/* page is busy */
1362 		mreq = m;
1363 		raonly = 1;
1364 	} else {
1365 		raonly = 0;
1366 	}
1367 
1368 	/*
1369 	 * Try to block-read contiguous pages from swap if sequential,
1370 	 * otherwise just read one page.  Contiguous pages from swap must
1371 	 * reside within a single device stripe because the I/O cannot be
1372 	 * broken up across multiple stripes.
1373 	 *
1374 	 * Note that blk and iblk can be SWAPBLK_NONE but the loop is
1375 	 * set up such that the case(s) are handled implicitly.
1376 	 */
1377 	blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0);
1378 	marray[0] = mreq;
1379 
1380 	for (i = 1; i <= swap_burst_read &&
1381 		    i < XIO_INTERNAL_PAGES &&
1382 		    mreq->pindex + i < object->size; ++i) {
1383 		swblk_t iblk;
1384 
1385 		iblk = swp_pager_meta_ctl(object, mreq->pindex + i, 0);
1386 		if (iblk != blk + i)
1387 			break;
1388 		if ((blk ^ iblk) & ~SWB_DMMASK)
1389 			break;
1390 		m = vm_page_lookup_busy_try(object, mreq->pindex + i,
1391 					    TRUE, &error);
1392 		if (error) {
1393 			break;
1394 		} else if (m == NULL) {
1395 			/*
1396 			 * Use VM_ALLOC_QUICK to avoid blocking on cache
1397 			 * page reuse.
1398 			 */
1399 			m = vm_page_alloc(object, mreq->pindex + i,
1400 					  VM_ALLOC_QUICK);
1401 			if (m == NULL)
1402 				break;
1403 		} else {
1404 			if (m->valid) {
1405 				vm_page_wakeup(m);
1406 				break;
1407 			}
1408 			vm_page_unqueue_nowakeup(m);
1409 		}
1410 		/* page is busy */
1411 		marray[i] = m;
1412 	}
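	/*
	 * Flag the last page of the burst with PG_RAM so a later fault on
	 * it continues the read-ahead pipeline.
	 */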
1413 	if (i > 1)
1414 		vm_page_flag_set(marray[i - 1], PG_RAM);
1415 
1416 	/*
1417 	 * If mreq is the requested page and we have nothing to do return
1418 	 * VM_PAGER_FAIL.  If raonly is set mreq is just another read-ahead
1419 	 * page and must be cleaned up.
1420 	 */
1421 	if (blk == SWAPBLK_NONE) {
1422 		KKASSERT(i == 1);
1423 		if (raonly) {
1424 			vnode_pager_freepage(mreq);
1425 			vm_object_drop(object);
1426 			return(VM_PAGER_OK);
1427 		} else {
1428 			vm_object_drop(object);
1429 			return(VM_PAGER_FAIL);
1430 		}
1431 	}
1432 
1433 	/*
1434 	 * Map our page(s) into kva for input
1435 	 *
1436 	 * Use the KVABIO API to avoid synchronizing the pmap.
1437 	 */
1438 	bp = getpbuf_kva(&nsw_rcount);
1439 	bio = &bp->b_bio1;
1440 	kva = (vm_offset_t) bp->b_kvabase;
1441 	bcopy(marray, bp->b_xio.xio_pages, i * sizeof(vm_page_t));
1442 	pmap_qenter_noinval(kva, bp->b_xio.xio_pages, i);
1443 
1444 	bp->b_data = (caddr_t)kva;
1445 	bp->b_bcount = PAGE_SIZE * i;
1446 	bp->b_xio.xio_npages = i;
1447 	bp->b_flags |= B_KVABIO;
1448 	bio->bio_done = swp_pager_async_iodone;
1449 	bio->bio_offset = (off_t)blk << PAGE_SHIFT;
1450 	bio->bio_caller_info1.index = SWBIO_READ;
1451 
1452 	/*
1453 	 * Set index.  If raonly set the index beyond the array so all
1454 	 * the pages are treated the same, otherwise the original mreq is
1455 	 * at index 0.
1456 	 */
1457 	if (raonly)
1458 		bio->bio_driver_info = (void *)(intptr_t)i;
1459 	else
1460 		bio->bio_driver_info = (void *)(intptr_t)0;
1461 
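	/*
	 * Mark each page as having swap I/O in progress; the completion
	 * routine clears the flag.
	 */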
1462 	for (j = 0; j < i; ++j) {
1463 		atomic_set_int(&bp->b_xio.xio_pages[j]->busy_count,
1464 			       PBUSY_SWAPINPROG);
1465 	}
1466 
1467 	mycpu->gd_cnt.v_swapin++;
1468 	mycpu->gd_cnt.v_swappgsin += bp->b_xio.xio_npages;
1469 
1470 	/*
1471 	 * We still hold the lock on mreq, and our automatic completion routine
1472 	 * does not remove it.
1473 	 */
1474 	vm_object_pip_add(object, bp->b_xio.xio_npages);
1475 
1476 	/*
1477 	 * perform the I/O.  NOTE!!!  bp cannot be considered valid after
1478 	 * this point because we automatically release it on completion.
1479 	 * Instead, we look at the one page we are interested in which we
1480 	 * still hold a lock on even through the I/O completion.
1481 	 *
1482 	 * The other pages in our m[] array are also released on completion,
1483 	 * so we cannot assume they are valid anymore either.
1484 	 */
1485 	bp->b_cmd = BUF_CMD_READ;
1486 	BUF_KERNPROC(bp);
1487 	vn_strategy(swapdev_vp, bio);
1488 
1489 	/*
1490 	 * Wait for the page we want to complete.  PBUSY_SWAPINPROG is always
1491 	 * cleared on completion.  If an I/O error occurs, SWAPBLK_NONE
1492 	 * is set in the meta-data.
1493 	 *
1494 	 * If this is a read-ahead only we return immediately without
1495 	 * waiting for I/O.
1496 	 */
1497 	if (raonly) {
1498 		vm_object_drop(object);
1499 		return(VM_PAGER_OK);
1500 	}
1501 
1502 	/*
1503 	 * Read-ahead includes originally requested page case.
1504 	 */
1505 	for (;;) {
1506 		busy_count = mreq->busy_count;
1507 		cpu_ccfence();
1508 		if ((busy_count & PBUSY_SWAPINPROG) == 0)
1509 			break;
1510 		tsleep_interlock(mreq, 0);
1511 		if (!atomic_cmpset_int(&mreq->busy_count, busy_count,
1512 				       busy_count |
1513 				        PBUSY_SWAPINPROG | PBUSY_WANTED)) {
1514 			continue;
1515 		}
1516 		atomic_set_int(&mreq->flags, PG_REFERENCED);
1517 		mycpu->gd_cnt.v_intrans++;
1518 		if (tsleep(mreq, PINTERLOCKED, "swread", hz*20)) {
1519 			kprintf(
1520 			    "swap_pager: indefinite wait buffer: "
1521 				" bp %p offset: %lld, size: %ld\n",
1522 			    bp,
1523 			    (long long)bio->bio_offset,
1524 			    (long)bp->b_bcount
1525 			);
1526 		}
1527 	}
1528 
1529 	/*
1530 	 * Disallow speculative reads prior to the SWAPINPROG test.
1531 	 */
1532 	cpu_lfence();
1533 
1534 	/*
1535 	 * mreq is left busied after completion, but all the other pages
1536 	 * are freed.  If we had an unrecoverable read error the page will
1537 	 * not be valid.
1538 	 */
1539 	vm_object_drop(object);
1540 	if (mreq->valid != VM_PAGE_BITS_ALL)
1541 		return(VM_PAGER_ERROR);
1542 	else
1543 		return(VM_PAGER_OK);
1544 
1545 	/*
1546 	 * A final note: in a low swap situation, we cannot deallocate swap
1547 	 * and mark a page dirty here because the caller is likely to mark
1548 	 * the page clean when we return, causing the page to possibly revert
1549 	 * to all-zero's later.
1550 	 */
1551 }
1552 
1553 /*
1554  *	swap_pager_putpages:
1555  *
1556  *	Assign swap (if necessary) and initiate I/O on the specified pages.
1557  *
1558  *	We support both OBJT_DEFAULT and OBJT_SWAP objects.  DEFAULT objects
1559  *	are automatically converted to SWAP objects.
1560  *
1561  *	In a low memory situation we may block in vn_strategy(), but the new
1562  *	vm_page reservation system coupled with properly written VFS devices
1563  *	should ensure that no low-memory deadlock occurs.  This is an area
1564  *	which needs work.
1565  *
1566  *	The parent has N vm_object_pip_add() references prior to
1567  *	calling us and will remove references for rtvals[] that are
1568  *	not set to VM_PAGER_PEND.  We need to remove the rest on I/O
1569  *	completion.
1570  *
1571  *	The parent has soft-busy'd the pages it passes us and will unbusy
1572  *	those whose rtvals[] entry is not set to VM_PAGER_PEND on return.
1573  *	We need to unbusy the rest on I/O completion.
1574  *
1575  * No requirements.
1576  */
1577 void
1578 swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
1579 		    int flags, int *rtvals)
1580 {
1581 	int i;
1582 	int n = 0;
1583 
1584 	vm_object_hold(object);
1585 
1586 	if (count && m[0]->object != object) {
1587 		panic("swap_pager_putpages: object mismatch %p/%p",
1588 		    object,
1589 		    m[0]->object
1590 		);
1591 	}
1592 
1593 	/*
1594 	 * Step 1
1595 	 *
1596 	 * Turn object into OBJT_SWAP
1597 	 * Check for bogus sysops
1598 	 *
1599 	 * Force sync if not the pageout process; we don't want any single
1600 	 * non-pageout process to be able to hog the I/O subsystem!  This
1601 	 * can be overridden by setting vm.swap_user_async.
1602 	 */
1603 	if (object->type == OBJT_DEFAULT) {
1604 		if (object->type == OBJT_DEFAULT)
1605 			swp_pager_meta_convert(object);
1606 	}
1607 
1608 	/*
1609 	 * Normally we force synchronous swap I/O if this is not the
1610 	 * pageout daemon to prevent any single user process limited
1611 	 * via RLIMIT_RSS from hogging swap write bandwidth.
1612 	 */
1613 	if (curthread != pagethread &&
1614 	    curthread != emergpager &&
1615 	    swap_user_async == 0) {
1616 		flags |= VM_PAGER_PUT_SYNC;
1617 	}
1618 
1619 	/*
1620 	 * Step 2
1621 	 *
1622 	 * Update nsw parameters from swap_async_max sysctl values.
1623 	 * Do not let the sysop crash the machine with bogus numbers.
1624 	 */
1625 	if (swap_async_max != nsw_wcount_async_max) {
1626 		int n;
1627 
1628 		/*
1629 		 * limit range
1630 		 */
1631 		if ((n = swap_async_max) > nswbuf_kva / 2)
1632 			n = nswbuf_kva / 2;
1633 		if (n < 1)
1634 			n = 1;
1635 		swap_async_max = n;
1636 
1637 		/*
1638 		 * Adjust difference ( if possible ).  If the current async
1639 		 * count is too low, we may not be able to make the adjustment
1640 		 * at this time.
1641 		 *
1642 		 * vm_token needed for nsw_wcount sleep interlock
1643 		 */
1644 		lwkt_gettoken(&vm_token);
1645 		n -= nsw_wcount_async_max;
1646 		if (nsw_wcount_async + n >= 0) {
1647 			nsw_wcount_async_max += n;
1648 			pbuf_adjcount(&nsw_wcount_async, n);
1649 		}
1650 		lwkt_reltoken(&vm_token);
1651 	}
1652 
1653 	/*
1654 	 * Step 3
1655 	 *
1656 	 * Assign swap blocks and issue I/O.  We reallocate swap on the fly.
1657 	 * The page is left dirty until the pageout operation completes
1658 	 * successfully.
1659 	 */
1660 
1661 	for (i = 0; i < count; i += n) {
1662 		struct buf *bp;
1663 		struct bio *bio;
1664 		swblk_t blk;
1665 		int j;
1666 
1667 		/*
1668 		 * Maximum I/O size is limited by a number of factors.
1669 		 */
1670 
1671 		n = min(BLIST_MAX_ALLOC, count - i);
1672 		n = min(n, nsw_cluster_max);
1673 
1674 		lwkt_gettoken(&vm_token);
1675 
1676 		/*
1677 		 * Get biggest block of swap we can.  If we fail, fall
1678 		 * back and try to allocate a smaller block.  Don't go
1679 		 * overboard trying to allocate space if it would overly
1680 		 * fragment swap.
1681 		 */
1682 		while (
1683 		    (blk = swp_pager_getswapspace(object, n)) == SWAPBLK_NONE &&
1684 		    n > 4
1685 		) {
1686 			n >>= 1;
1687 		}
1688 		if (blk == SWAPBLK_NONE) {
1689 			for (j = 0; j < n; ++j)
1690 				rtvals[i+j] = VM_PAGER_FAIL;
1691 			lwkt_reltoken(&vm_token);
1692 			continue;
1693 		}
1694 		if (vm_report_swap_allocs > 0) {
1695 			kprintf("swap_alloc %08jx,%d\n", (intmax_t)blk, n);
1696 			--vm_report_swap_allocs;
1697 		}
1698 
1699 		/*
1700 		 * The I/O we are constructing cannot cross a physical
1701 		 * disk boundary in the swap stripe.
1702 		 */
1703 		if ((blk ^ (blk + n)) & ~SWB_DMMASK) {
1704 			j = ((blk + SWB_DMMAX) & ~SWB_DMMASK) - blk;
1705 			swp_pager_freeswapspace(object, blk + j, n - j);
1706 			n = j;
1707 		}
1708 
1709 		/*
1710 		 * All I/O parameters have been satisfied, build the I/O
1711 		 * request and assign the swap space.
1712 		 *
1713 		 * Use the KVABIO API to avoid synchronizing the pmap.
1714 		 */
1715 		if ((flags & VM_PAGER_PUT_SYNC))
1716 			bp = getpbuf_kva(&nsw_wcount_sync);
1717 		else
1718 			bp = getpbuf_kva(&nsw_wcount_async);
1719 		bio = &bp->b_bio1;
1720 
1721 		lwkt_reltoken(&vm_token);
1722 
1723 		pmap_qenter_noinval((vm_offset_t)bp->b_data, &m[i], n);
1724 
1725 		bp->b_flags |= B_KVABIO;
1726 		bp->b_bcount = PAGE_SIZE * n;
1727 		bio->bio_offset = (off_t)blk << PAGE_SHIFT;
1728 
1729 		for (j = 0; j < n; ++j) {
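		/*
		 * Bind each page's new swap block into the object's
		 * metadata and flag the page for in-progress swap I/O.
		 * OBJT_SWAP pages are re-dirtied here and only cleaned
		 * when the write completes successfully.
		 */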
1730 			vm_page_t mreq = m[i+j];
1731 
1732 			swp_pager_meta_build(mreq->object, mreq->pindex,
1733 					     blk + j);
1734 			if (object->type == OBJT_SWAP)
1735 				vm_page_dirty(mreq);
1736 			rtvals[i+j] = VM_PAGER_OK;
1737 
1738 			atomic_set_int(&mreq->busy_count, PBUSY_SWAPINPROG);
1739 			bp->b_xio.xio_pages[j] = mreq;
1740 		}
1741 		bp->b_xio.xio_npages = n;
1742 
1743 		mycpu->gd_cnt.v_swapout++;
1744 		mycpu->gd_cnt.v_swappgsout += bp->b_xio.xio_npages;
1745 
1746 		bp->b_dirtyoff = 0;		/* req'd for NFS */
1747 		bp->b_dirtyend = bp->b_bcount;	/* req'd for NFS */
1748 		bp->b_cmd = BUF_CMD_WRITE;
1749 		bio->bio_caller_info1.index = SWBIO_WRITE;
1750 
1751 		/*
1752 		 * asynchronous
1753 		 */
1754 		if ((flags & VM_PAGER_PUT_SYNC) == 0) {
1755 			bio->bio_done = swp_pager_async_iodone;
1756 			BUF_KERNPROC(bp);
1757 			vn_strategy(swapdev_vp, bio);
1758 
1759 			for (j = 0; j < n; ++j)
1760 				rtvals[i+j] = VM_PAGER_PEND;
1761 			continue;
1762 		}
1763 
1764 		/*
1765 		 * Issue synchronously.
1766 		 *
1767 		 * Wait for the sync I/O to complete, then update rtvals.
1768 		 * We just set the rtvals[] to VM_PAGER_PEND so we can call
1769 		 * our async completion routine at the end, thus avoiding a
1770 		 * double-free.
1771 		 */
1772 		bio->bio_caller_info1.index |= SWBIO_SYNC;
1773 		if (flags & VM_PAGER_TRY_TO_CACHE)
1774 			bio->bio_caller_info1.index |= SWBIO_TTC;
1775 		bio->bio_done = biodone_sync;
1776 		bio->bio_flags |= BIO_SYNC;
1777 		vn_strategy(swapdev_vp, bio);
1778 		biowait(bio, "swwrt");
1779 
1780 		for (j = 0; j < n; ++j)
1781 			rtvals[i+j] = VM_PAGER_PEND;
1782 
1783 		/*
1784 		 * Now that we are through with the bp, we can call the
1785 		 * normal async completion, which frees everything up.
1786 		 */
1787 		swp_pager_async_iodone(bio);
1788 	}
1789 	vm_object_drop(object);
1790 }
1791 
1792 /*
1793  * No requirements.
1794  *
1795  * Recalculate the low and high-water marks.
1796  */
1797 void
1798 swap_pager_newswap(void)
1799 {
1800 	/*
1801 	 * NOTE: vm_swap_max cannot exceed 1 billion blocks, which is the
1802 	 *	 limitation imposed by the blist code.  Remember that this
1803 	 *	 will be divided by NSWAP_MAX (4), so each swap device is
1804 	 *	 limited to around a terabyte.
1805 	 */
1806 	if (vm_swap_max) {
1807 		nswap_lowat = (int64_t)vm_swap_max * 4 / 100;	/* 4% left */
1808 		nswap_hiwat = (int64_t)vm_swap_max * 6 / 100;	/* 6% left */
1809 		kprintf("swap low/high-water marks set to %d/%d\n",
1810 			nswap_lowat, nswap_hiwat);
1811 	} else {
1812 		nswap_lowat = 128;
1813 		nswap_hiwat = 512;
1814 	}
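	/* re-evaluate available swap against the new watermarks */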
1815 	swp_sizecheck();
1816 }
1817 
1818 /*
1819  *	swp_pager_async_iodone:
1820  *
1821  *	Completion routine for asynchronous reads and writes from/to swap.
1822  *	Also called manually by synchronous code to finish up a bp.
1823  *
1824  *	For READ operations the pages are BUSY'd; on completion we unbusy
1825  *	all pages except the 'main' request page.  For WRITE operations
1826  *	the pages are soft-busied (vm_page busy count); on completion we
1827  *	unbusy all of the pages (we can do this because we marked them
1828  *	all VM_PAGER_PEND on return from putpages).
1829  *
1830  *	This routine may not block.
1831  *
1832  * No requirements.
1833  */
1834 static void
1835 swp_pager_async_iodone(struct bio *bio)
1836 {
1837 	struct buf *bp = bio->bio_buf;
1838 	vm_object_t object = NULL;
1839 	int i;
1840 	int *nswptr;
1841 
1842 	/*
1843 	 * report error
1844 	 */
1845 	if (bp->b_flags & B_ERROR) {
1846 		kprintf(
1847 		    "swap_pager: I/O error - %s failed; offset %lld,"
1848 			" size %ld, error %d\n",
1849 		    ((bio->bio_caller_info1.index & SWBIO_READ) ?
1850 			"pagein" : "pageout"),
1851 		    (long long)bio->bio_offset,
1852 		    (long)bp->b_bcount,
1853 		    bp->b_error
1854 		);
1855 	}
1856 
1857 	/*
1858 	 * set object.
1859 	 */
1860 	if (bp->b_xio.xio_npages)
1861 		object = bp->b_xio.xio_pages[0]->object;
1862 
1863 #if 0
1864 	/* PMAP TESTING CODE (useful, keep it in but #if 0'd) */
1865 	if (bio->bio_caller_info1.index & SWBIO_WRITE) {
1866 		if (bio->bio_crc != iscsi_crc32(bp->b_data, bp->b_bcount)) {
1867 			kprintf("SWAPOUT: BADCRC %08x %08x\n",
1868 				bio->bio_crc,
1869 				iscsi_crc32(bp->b_data, bp->b_bcount));
1870 			for (i = 0; i < bp->b_xio.xio_npages; ++i) {
1871 				vm_page_t m = bp->b_xio.xio_pages[i];
1872 				if ((m->flags & PG_WRITEABLE) &&
1873 				    (pmap_mapped_sync(m) & PG_WRITEABLE)) {
1874 					kprintf("SWAPOUT: "
1875 						"%d/%d %p writable\n",
1876 						i, bp->b_xio.xio_npages, m);
1877 				}
1878 			}
1879 		}
1880 	}
1881 #endif
1882 
1883 	/*
1884 	 * remove the mapping for kernel virtual
1885 	 */
1886 	pmap_qremove((vm_offset_t)bp->b_data, bp->b_xio.xio_npages);
1887 
1888 	/*
1889 	 * cleanup pages.  If an error occurs writing to swap, we are in
1890 	 * very serious trouble.  If it happens to be a disk error, though,
1891 	 * we may be able to recover by reassigning the swap later on.  So
1892 	 * in this case we remove the m->swapblk assignment for the page
1893 	 * but do not free it in the rlist.  The erroneous block(s) are thus
1894 	 * never reallocated as swap.  Redirty the page and continue.
1895 	 */
1896 	for (i = 0; i < bp->b_xio.xio_npages; ++i) {
1897 		vm_page_t m = bp->b_xio.xio_pages[i];
1898 
1899 		if (bp->b_flags & B_ERROR) {
1900 			/*
1901 			 * If an error occurs I'd love to throw the swapblk
1902 			 * away without freeing it back to swapspace, so it
1903 			 * can never be used again.  But I can't from an
1904 			 * interrupt.
1905 			 */
1906 
1907 			if (bio->bio_caller_info1.index & SWBIO_READ) {
1908 				/*
1909 				 * When reading, reqpage needs to stay
1910 				 * locked for the parent, but all other
1911 				 * pages can be freed.  We still want to
1912 				 * wakeup the parent waiting on the page,
1913 				 * though.  ( also: pg_reqpage can be -1 and
1914 				 * though (also: the requested index can be -1
1915 				 * and not match anything).
1916 				 * We have to wake specifically requested pages
1917 				 * up too because we cleared SWAPINPROG and
1918 				 * someone may be waiting for that.
1919 				 *
1920 				 * NOTE: For reads, m->dirty will probably
1921 				 *	 be overridden by the original caller
1922 				 *	 of getpages so don't play cute tricks
1923 				 *	 here.
1924 				 *
1925 				 * NOTE: We can't actually free the page from
1926 				 *	 here, because this is an interrupt.
1927 				 *	 It is not legal to mess with
1928 				 *	 object->memq from an interrupt.
1929 				 *	 Deactivate the page instead.
1930 				 *
1931 				 * WARNING! The instant SWAPINPROG is
1932 				 *	    cleared another cpu may start
1933 				 *	    using the mreq page (it will
1934 				 *	    check m->valid immediately).
1935 				 */
1936 
1937 				m->valid = 0;
1938 				atomic_clear_int(&m->busy_count,
1939 						 PBUSY_SWAPINPROG);
1940 
1941 				/*
1942 				 * bio_driver_info holds the requested page
1943 				 * index.
1944 				 */
1945 				if (i != (int)(intptr_t)bio->bio_driver_info) {
1946 					vm_page_deactivate(m);
1947 					vm_page_wakeup(m);
1948 				} else {
1949 					vm_page_flash(m);
1950 				}
1951 				/*
1952 				 * Do not wake up the requested page (indexed
1953 				 * by bio_driver_info); the caller needs to.
1954 				 */
1955 			} else {
1956 				/*
1957 				 * If a write error occurs remove the swap
1958 				 * assignment (note that PG_SWAPPED may or
1959 				 * may not be set depending on prior activity).
1960 				 *
1961 				 * Re-dirty OBJT_SWAP pages as there is no
1962 				 * other backing store, we can't throw the
1963 				 * page away.
1964 				 *
1965 				 * Non-OBJT_SWAP pages (aka swapcache) must
1966 				 * not be dirtied since they may not have
1967 				 * been dirty in the first place, and they
1968 				 * do have backing store (the vnode).
1969 				 */
1970 				vm_page_busy_wait(m, FALSE, "swadpg");
1971 				vm_object_hold(m->object);
1972 				swp_pager_meta_ctl(m->object, m->pindex,
1973 						   SWM_FREE);
1974 				vm_page_flag_clear(m, PG_SWAPPED);
1975 				vm_object_drop(m->object);
1976 				if (m->object->type == OBJT_SWAP) {
1977 					vm_page_dirty(m);
1978 					vm_page_activate(m);
1979 				}
1980 				vm_page_io_finish(m);
1981 				atomic_clear_int(&m->busy_count,
1982 						 PBUSY_SWAPINPROG);
1983 				vm_page_wakeup(m);
1984 			}
1985 		} else if (bio->bio_caller_info1.index & SWBIO_READ) {
1986 			/*
1987 			 * NOTE: for reads, m->dirty will probably be
1988 			 * overridden by the original caller of getpages so
1989 			 * we cannot set them in order to free the underlying
1990 			 * swap in a low-swap situation.  I don't think we'd
1991 			 * want to do that anyway, but it was an optimization
1992 			 * that existed in the old swapper for a time before
1993 			 * it got ripped out due to precisely this problem.
1994 			 *
1995 			 * If not the requested page then deactivate it.
1996 			 *
1997 			 * Note that the requested page, reqpage, is left
1998 			 * busied, but we still have to wake it up.  The
1999 			 * other pages are released (unbusied) by
2000 			 * vm_page_wakeup().  We do not set reqpage's
2001 			 * valid bits here, it is up to the caller.
2002 			 */
2003 
2004 			/*
2005 			 * NOTE: Can't call pmap_clear_modify(m) from an
2006 			 *	 interrupt thread, the pmap code may have to
2007 			 *	 map non-kernel pmaps and currently asserts
2008 			 *	 the case.
2009 			 *
2010 			 * WARNING! The instant SWAPINPROG is
2011 			 *	    cleared another cpu may start
2012 			 *	    using the mreq page (it will
2013 			 *	    check m->valid immediately).
2014 			 */
2015 			/*pmap_clear_modify(m);*/
2016 			m->valid = VM_PAGE_BITS_ALL;
2017 			vm_page_undirty(m);
2018 			vm_page_flag_set(m, PG_SWAPPED);
2019 			atomic_clear_int(&m->busy_count, PBUSY_SWAPINPROG);
2020 
2021 			/*
2022 			 * We have to wake specifically requested pages
2023 			 * up too because we cleared SWAPINPROG and
2024 			 * the caller may be waiting on it in getpages.  However,
2025 			 * be sure not to unbusy the specifically requested
2026 			 * page - getpages expects it to be
2027 			 * left busy.
2028 			 *
2029 			 * bio_driver_info holds the requested page
2030 			 */
2031 			if (i != (int)(intptr_t)bio->bio_driver_info) {
2032 				vm_page_deactivate(m);
2033 				vm_page_wakeup(m);
2034 			} else {
2035 				vm_page_flash(m);
2036 			}
2037 		} else {
2038 			/*
2039 			 * Mark the page clean but do not mess with the
2040 			 * pmap-layer's modified state.  That state should
2041 			 * also be clear since the caller protected the
2042 			 * page VM_PROT_READ, but allow the case.
2043 			 *
2044 			 * We are in an interrupt, avoid pmap operations.
2045 			 *
2046 			 * If we have a severe page deficit, deactivate the
2047 			 * page.  Do not try to cache it (which would also
2048 			 * involve a pmap op), because the page might still
2049 			 * be read-heavy.
2050 			 *
2051 			 * When using the swap to cache clean vnode pages
2052 			 * we do not mess with the page dirty bits.
2053 			 *
2054 			 * NOTE! Nobody is waiting for the key mreq page
2055 			 *	 on write completion.
2056 			 */
2057 			vm_page_busy_wait(m, FALSE, "swadpg");
2058 			if (m->object->type == OBJT_SWAP)
2059 				vm_page_undirty(m);
2060 			vm_page_flag_set(m, PG_SWAPPED);
2061 			atomic_clear_int(&m->busy_count, PBUSY_SWAPINPROG);
2062 			if (vm_page_count_severe())
2063 				vm_page_deactivate(m);
2064 			vm_page_io_finish(m);
2065 			if (bio->bio_caller_info1.index & SWBIO_TTC)
2066 				vm_page_try_to_cache(m);
2067 			else
2068 				vm_page_wakeup(m);
2069 		}
2070 	}
2071 
2072 	/*
2073 	 * adjust pip.  NOTE: the original parent may still have its own
2074 	 * pip refs on the object.
2075 	 */
2076 
2077 	if (object)
2078 		vm_object_pip_wakeup_n(object, bp->b_xio.xio_npages);
2079 
2080 	/*
2081 	 * Release the physical I/O buffer.
2082 	 *
2083 	 * NOTE: Due to synchronous operations in the write case b_cmd may
2084 	 *	 already be set to BUF_CMD_DONE and BIO_SYNC may have already
2085 	 *	 been cleared.
2086 	 *
2087 	 * Use vm_token to interlock nsw_rcount/wcount wakeup?
2088 	 */
2089 	lwkt_gettoken(&vm_token);
2090 	if (bio->bio_caller_info1.index & SWBIO_READ)
2091 		nswptr = &nsw_rcount;
2092 	else if (bio->bio_caller_info1.index & SWBIO_SYNC)
2093 		nswptr = &nsw_wcount_sync;
2094 	else
2095 		nswptr = &nsw_wcount_async;
2096 	bp->b_cmd = BUF_CMD_DONE;
2097 	relpbuf(bp, nswptr);
2098 	lwkt_reltoken(&vm_token);
2099 }
2100 
2101 /*
2102  * Fault-in a potentially swapped page and remove the swap reference.
2103  * (used by swapoff code)
2104  *
2105  * object must be held.
2106  */
2107 static __inline void
2108 swp_pager_fault_page(vm_object_t object, int *sharedp, vm_pindex_t pindex)
2109 {
2110 	struct vnode *vp;
2111 	vm_page_t m;
2112 	int error;
2113 
2114 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
2115 
2116 	if (object->type == OBJT_VNODE) {
2117 		/*
2118 		 * Any swap related to a vnode is due to swapcache.  We must
2119 		 * vget() the vnode in case it is not active (otherwise
2120 		 * vref() will panic).  Calling vm_object_page_remove() will
2121 		 * ensure that any swap ref is removed interlocked with the
2122 		 * page.  clean_only is set to TRUE so we don't throw away
2123 		 * dirty pages.
2124 		 */
2125 		vp = object->handle;
2126 		error = vget(vp, LK_SHARED | LK_RETRY | LK_CANRECURSE);
2127 		if (error == 0) {
2128 			vm_object_page_remove(object, pindex, pindex + 1, TRUE);
2129 			vput(vp);
2130 		}
2131 	} else {
2132 		/*
2133 		 * Otherwise it is a normal OBJT_SWAP object and we can
2134 		 * fault the page in and remove the swap.
2135 		 */
2136 		m = vm_fault_object_page(object, IDX_TO_OFF(pindex),
2137 					 VM_PROT_NONE,
2138 					 VM_FAULT_DIRTY | VM_FAULT_UNSWAP,
2139 					 sharedp, &error);
2140 		if (m)
2141 			vm_page_unhold(m);
2142 	}
2143 }
2144 
2145 /*
2146  * This removes all swap blocks related to a particular device.  We have
2147  * to be careful of ripups during the scan.
2148  */
2149 static int swp_pager_swapoff_callback(struct swblock *swap, void *data);
2150 
2151 int
2152 swap_pager_swapoff(int devidx)
2153 {
2154 	struct vm_object_hash *hash;
2155 	struct swswapoffinfo info;
2156 	struct vm_object marker;
2157 	vm_object_t object;
2158 	int n;
2159 
2160 	bzero(&marker, sizeof(marker));
2161 	marker.type = OBJT_MARKER;
2162 
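	/*
	 * Use a marker object to hold our place in each hash list.  The
	 * hash token may be temporarily lost when we block on an object,
	 * so after processing an object the marker is moved past it to
	 * keep the scan position stable across insertions and removals.
	 */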
2163 	for (n = 0; n < VMOBJ_HSIZE; ++n) {
2164 		hash = &vm_object_hash[n];
2165 
2166 		lwkt_gettoken(&hash->token);
2167 		TAILQ_INSERT_HEAD(&hash->list, &marker, object_entry);
2168 
2169 		while ((object = TAILQ_NEXT(&marker, object_entry)) != NULL) {
2170 			if (object->type == OBJT_MARKER)
2171 				goto skip;
2172 			if (object->type != OBJT_SWAP &&
2173 			    object->type != OBJT_VNODE)
2174 				goto skip;
2175 			vm_object_hold(object);
2176 			if (object->type != OBJT_SWAP &&
2177 			    object->type != OBJT_VNODE) {
2178 				vm_object_drop(object);
2179 				goto skip;
2180 			}
2181 
2182 			/*
2183 			 * Object is special in that we can't just pagein
2184 			 * into vm_page's in it (tmpfs, vn).
2185 			 */
2186 			if ((object->flags & OBJ_NOPAGEIN) &&
2187 			    RB_ROOT(&object->swblock_root)) {
2188 				vm_object_drop(object);
2189 				goto skip;
2190 			}
2191 
2192 			info.object = object;
2193 			info.shared = 0;
2194 			info.devidx = devidx;
2195 			swblock_rb_tree_RB_SCAN(&object->swblock_root,
2196 					    NULL, swp_pager_swapoff_callback,
2197 					    &info);
2198 			vm_object_drop(object);
2199 skip:
2200 			if (object == TAILQ_NEXT(&marker, object_entry)) {
2201 				TAILQ_REMOVE(&hash->list, &marker,
2202 					     object_entry);
2203 				TAILQ_INSERT_AFTER(&hash->list, object,
2204 						   &marker, object_entry);
2205 			}
2206 		}
2207 		TAILQ_REMOVE(&hash->list, &marker, object_entry);
2208 		lwkt_reltoken(&hash->token);
2209 	}
2210 
2211 	/*
2212 	 * If we fail to locate all swblocks we just fail gracefully and
2213 	 * do not bother to restore paging on the swap device; the user
2214 	 * can simply retry the swapoff.
2215 	 */
2216 	if (swdevt[devidx].sw_nused)
2217 		return (1);
2218 	else
2219 		return (0);
2220 }
2221 
2222 static
2223 int
2224 swp_pager_swapoff_callback(struct swblock *swap, void *data)
2225 {
2226 	struct swswapoffinfo *info = data;
2227 	vm_object_t object = info->object;
2228 	vm_pindex_t index;
2229 	swblk_t v;
2230 	int i;
2231 
2232 	index = swap->swb_index;
2233 	for (i = 0; i < SWAP_META_PAGES; ++i) {
2234 		/*
2235 		 * Make sure we don't race a dying object.  This will
2236 		 * kill the scan of the object's swap blocks entirely.
2237 		 */
2238 		if (object->flags & OBJ_DEAD)
2239 			return(-1);
2240 
2241 		/*
2242 		 * Fault the page, which can obviously block.  If the swap
2243 		 * structure disappears break out.
2244 		 */
2245 		v = swap->swb_pages[i];
2246 		if (v != SWAPBLK_NONE && BLK2DEVIDX(v) == info->devidx) {
2247 			swp_pager_fault_page(object, &info->shared,
2248 					     swap->swb_index + i);
2249 			/* swap ptr might go away */
2250 			if (RB_LOOKUP(swblock_rb_tree,
2251 				      &object->swblock_root, index) != swap) {
2252 				break;
2253 			}
2254 		}
2255 	}
2256 	return(0);
2257 }
2258 
2259 /************************************************************************
2260  *				SWAP META DATA 				*
2261  ************************************************************************
2262  *
2263  *	These routines manipulate the swap metadata stored in the
2264  *	OBJT_SWAP object.
2265  *
2266  *	Swap metadata is implemented with a global hash and not directly
2267  *	Swap metadata is kept in swblock structures organized into a
2268  *	per-object red-black tree (object->swblock_root).  The object also
2269  *	maintains a count of swblocks in swblock_count.
2270 
2271 /*
2272  * Lookup the swblock containing the specified swap block index.
2273  *
2274  * The caller must hold the object.
2275  */
2276 static __inline
2277 struct swblock *
2278 swp_pager_lookup(vm_object_t object, vm_pindex_t index)
2279 {
2280 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
2281 	index &= ~(vm_pindex_t)SWAP_META_MASK;
2282 	return (RB_LOOKUP(swblock_rb_tree, &object->swblock_root, index));
2283 }
2284 
2285 /*
2286  * Remove a swblock from the RB tree.
2287  *
2288  * The caller must hold the object.
2289  */
2290 static __inline
2291 void
2292 swp_pager_remove(vm_object_t object, struct swblock *swap)
2293 {
2294 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
2295 	RB_REMOVE(swblock_rb_tree, &object->swblock_root, swap);
2296 }
2297 
2298 /*
2299  * Convert default object to swap object if necessary
2300  *
2301  * The caller must hold the object.
2302  */
2303 static void
2304 swp_pager_meta_convert(vm_object_t object)
2305 {
2306 	if (object->type == OBJT_DEFAULT) {
2307 		object->type = OBJT_SWAP;
2308 		KKASSERT(object->swblock_count == 0);
2309 	}
2310 }
2311 
2312 /*
2313  * SWP_PAGER_META_BUILD() -	add swap block to swap meta data for object
2314  *
2315  *	We first convert the object to a swap object if it is a default
2316  *	object.  Vnode objects do not need to be converted.
2317  *
2318  *	The specified swapblk is added to the object's swap metadata.  If
2319  *	the swapblk is not valid, it is freed instead.  Any previously
2320  *	assigned swapblk is freed.
2321  *
2322  * The caller must hold the object.
2323  */
2324 static void
2325 swp_pager_meta_build(vm_object_t object, vm_pindex_t index, swblk_t swapblk)
2326 {
2327 	struct swblock *swap;
2328 	struct swblock *oswap;
2329 	vm_pindex_t v;
2330 
2331 	KKASSERT(swapblk != SWAPBLK_NONE);
2332 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
2333 
2334 	/*
2335 	 * Convert object if necessary
2336 	 */
2337 	if (object->type == OBJT_DEFAULT)
2338 		swp_pager_meta_convert(object);
2339 
2340 	/*
2341 	 * Locate the swblock, creating it if it does not already exist.
2342 	 * If the zone allocation fails we wait for memory and retry;
2343 	 * the tree may have changed while we slept, so re-lookup.
2344 	 */
2345 retry:
2346 	swap = swp_pager_lookup(object, index);
2347 
2348 	if (swap == NULL) {
2349 		int i;
2350 
2351 		swap = zalloc(swap_zone);
2352 		if (swap == NULL) {
2353 			vm_wait(0);
2354 			goto retry;
2355 		}
2356 		swap->swb_index = index & ~(vm_pindex_t)SWAP_META_MASK;
2357 		swap->swb_count = 0;
2358 
2359 		++object->swblock_count;
2360 
2361 		for (i = 0; i < SWAP_META_PAGES; ++i)
2362 			swap->swb_pages[i] = SWAPBLK_NONE;
2363 		oswap = RB_INSERT(swblock_rb_tree, &object->swblock_root, swap);
2364 		KKASSERT(oswap == NULL);
2365 	}
2366 
2367 	/*
2368 	 * Delete prior contents of metadata.
2369 	 *
2370 	 * NOTE: Decrement swb_count after the freeing operation (which
2371 	 *	 might block) to prevent racing destruction of the swblock.
2372 	 */
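	/* 'index' becomes the slot offset within this swblock */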
2373 	index &= SWAP_META_MASK;
2374 
2375 	while ((v = swap->swb_pages[index]) != SWAPBLK_NONE) {
2376 		swap->swb_pages[index] = SWAPBLK_NONE;
2377 		/* can block */
2378 		swp_pager_freeswapspace(object, v, 1);
2379 		--swap->swb_count;
2380 		--mycpu->gd_vmtotal.t_vm;
2381 	}
2382 
2383 	/*
2384 	 * Enter block into metadata
2385 	 */
2386 	swap->swb_pages[index] = swapblk;
2387 	if (swapblk != SWAPBLK_NONE) {
2388 		++swap->swb_count;
2389 		++mycpu->gd_vmtotal.t_vm;
2390 	}
2391 }
2392 
2393 /*
2394  * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
2395  *
2396  *	The requested range of blocks is freed, with any associated swap
2397  *	returned to the swap bitmap.
2398  *
2399  *	This routine will free swap metadata structures as they are cleaned
2400  *	out.  This routine does *NOT* operate on swap metadata associated
2401  *	with resident pages.
2402  *
2403  * The caller must hold the object.
2404  */
2405 static int swp_pager_meta_free_callback(struct swblock *swb, void *data);
2406 
2407 static void
2408 swp_pager_meta_free(vm_object_t object, vm_pindex_t index, vm_pindex_t count)
2409 {
2410 	struct swfreeinfo info;
2411 
2412 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
2413 
2414 	/*
2415 	 * Nothing to do
2416 	 */
2417 	if (object->swblock_count == 0) {
2418 		KKASSERT(RB_EMPTY(&object->swblock_root));
2419 		return;
2420 	}
2421 	if (count == 0)
2422 		return;
2423 
2424 	/*
2425 	 * Setup for RB tree scan.  Note that the pindex range can be huge
2426 	 * due to the 64 bit page index space so we cannot safely iterate.
2427 	 */
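	/*
	 * basei is begi rounded down to a swblock boundary so the range
	 * compare can also pick up the swblock containing the first index;
	 * begi/endi bound the page indices the callback actually frees.
	 */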
2428 	info.object = object;
2429 	info.basei = index & ~(vm_pindex_t)SWAP_META_MASK;
2430 	info.begi = index;
2431 	info.endi = index + count - 1;
2432 	swblock_rb_tree_RB_SCAN(&object->swblock_root, rb_swblock_scancmp,
2433 				swp_pager_meta_free_callback, &info);
2434 }
2435 
2436 /*
2437  * The caller must hold the object.
2438  */
2439 static
2440 int
2441 swp_pager_meta_free_callback(struct swblock *swap, void *data)
2442 {
2443 	struct swfreeinfo *info = data;
2444 	vm_object_t object = info->object;
2445 	int index;
2446 	int eindex;
2447 
2448 	/*
2449 	 * Figure out the range within the swblock.  The wider scan may
2450 	 * return edge-case swap blocks when the start and/or end points
2451 	 * are in the middle of a block.
2452 	 */
2453 	if (swap->swb_index < info->begi)
2454 		index = (int)info->begi & SWAP_META_MASK;
2455 	else
2456 		index = 0;
2457 
2458 	if (swap->swb_index + SWAP_META_PAGES > info->endi)
2459 		eindex = (int)info->endi & SWAP_META_MASK;
2460 	else
2461 		eindex = SWAP_META_MASK;
2462 
2463 	/*
2464 	 * Scan and free the blocks.  The loop terminates early if the
2465 	 * swblock runs out of assigned blocks and is itself freed.
2466 	 *
2467 	 * NOTE: Decrement swb_count after swp_pager_freeswapspace()
2468 	 *	 to deal with a zfree race.
2469 	 */
2470 	while (index <= eindex) {
2471 		swblk_t v = swap->swb_pages[index];
2472 
2473 		if (v != SWAPBLK_NONE) {
2474 			swap->swb_pages[index] = SWAPBLK_NONE;
2475 			/* can block */
2476 			swp_pager_freeswapspace(object, v, 1);
2477 			--mycpu->gd_vmtotal.t_vm;
2478 			if (--swap->swb_count == 0) {
2479 				swp_pager_remove(object, swap);
2480 				zfree(swap_zone, swap);
2481 				--object->swblock_count;
2482 				break;
2483 			}
2484 		}
2485 		++index;
2486 	}
2487 
2488 	/* swap may be invalid here due to zfree above */
2489 	lwkt_yield();
2490 
2491 	return(0);
2492 }
2493 
2494 /*
2495  * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object
2496  *
2497  *	This routine locates and destroys all swap metadata associated with
2498  *	an object.
2499  *
2500  * NOTE: Decrement swb_count after the freeing operation (which
2501  *	 might block) to prevent racing destruction of the swblock.
2502  *
2503  * The caller must hold the object.
2504  */
2505 static void
2506 swp_pager_meta_free_all(vm_object_t object)
2507 {
2508 	struct swblock *swap;
2509 	int i;
2510 
2511 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
2512 
2513 	while ((swap = RB_ROOT(&object->swblock_root)) != NULL) {
2514 		swp_pager_remove(object, swap);
2515 		for (i = 0; i < SWAP_META_PAGES; ++i) {
2516 			swblk_t v = swap->swb_pages[i];
2517 			if (v != SWAPBLK_NONE) {
2518 				/* can block */
2519 				swp_pager_freeswapspace(object, v, 1);
2520 				--swap->swb_count;
2521 				--mycpu->gd_vmtotal.t_vm;
2522 			}
2523 		}
2524 		if (swap->swb_count != 0)
2525 			panic("swap_pager_meta_free_all: swb_count != 0");
2526 		zfree(swap_zone, swap);
2527 		--object->swblock_count;
2528 		lwkt_yield();
2529 	}
2530 	KKASSERT(object->swblock_count == 0);
2531 }
2532 
2533 /*
2534  * SWP_PAGER_METACTL() -  misc control of swap and vm_page_t meta data.
2535  *
2536  *	This routine is capable of looking up, popping, or freeing
2537  *	swapblk assignments in the swap meta data or in the vm_page_t.
2538  *	The routine typically returns the swapblk being looked up or popped,
2539  *	or SWAPBLK_NONE if the block was freed or was not assigned.  This
2540  *	routine will automatically free any invalid
2541  *	meta-data swapblks.
2542  *
2543  *	It is not possible to store invalid swapblks in the swap meta data
2544  *	(other than a literal 'SWAPBLK_NONE'), so we don't bother checking.
2545  *
2546  *	When acting on a busy resident page and paging is in progress, we
2547  *	have to wait until paging is complete but otherwise can act on the
2548  *	busy page.
2549  *
2550  *	SWM_FREE	remove and free swap block from metadata
2551  *	SWM_POP		remove from meta data but do not free.. pop it out
2552  *
2553  * The caller must hold the object.
2554  */
2555 static swblk_t
2556 swp_pager_meta_ctl(vm_object_t object, vm_pindex_t index, int flags)
2557 {
2558 	struct swblock *swap;
2559 	swblk_t r1;
2560 
2561 	if (object->swblock_count == 0)
2562 		return(SWAPBLK_NONE);
2563 
2564 	r1 = SWAPBLK_NONE;
2565 	swap = swp_pager_lookup(object, index);
2566 
2567 	if (swap != NULL) {
2568 		index &= SWAP_META_MASK;
2569 		r1 = swap->swb_pages[index];
2570 
2571 		if (r1 != SWAPBLK_NONE) {
2572 			if (flags & (SWM_FREE|SWM_POP)) {
2573 				swap->swb_pages[index] = SWAPBLK_NONE;
2574 				--mycpu->gd_vmtotal.t_vm;
2575 				if (--swap->swb_count == 0) {
2576 					swp_pager_remove(object, swap);
2577 					zfree(swap_zone, swap);
2578 					--object->swblock_count;
2579 				}
2580 			}
2581 			/* swap ptr may be invalid */
2582 			if (flags & SWM_FREE) {
2583 				swp_pager_freeswapspace(object, r1, 1);
2584 				r1 = SWAPBLK_NONE;
2585 			}
2586 		}
2587 		/* swap ptr may be invalid */
2588 	}
2589 	return(r1);
2590 }
2591