/*
 * (MPSAFE)
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vm_swap.c	8.5 (Berkeley) 2/17/94
 * $FreeBSD: src/sys/vm/vm_swap.c,v 1.96.2.2 2001/10/14 18:46:47 iedowse Exp $
 */

#include "opt_swap.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/priv.h>
#include <sys/nlookup.h>
#include <sys/sysctl.h>
#include <sys/dmap.h>		/* XXX */
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/blist.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/conf.h>
#include <sys/stat.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/swap_pager.h>
#include <vm/vm_zone.h>
#include <vm/vm_param.h>

#include <sys/thread2.h>
#include <sys/mutex2.h>
#include <sys/spinlock2.h>

/*
 * Indirect driver for multi-controller paging.
 */
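
/*
 * Descriptive note: a single pseudo-vnode (swapdev_vp) receives all
 * paging I/O.  Its strategy routine fans each request out to the real
 * swap devices, striping the interleaved swap address space across
 * them in SWB_DMMAX-page chunks.  Per-device state lives in the
 * swdevt array below.
 */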

#ifndef NSWAPDEV
#define NSWAPDEV	4
#endif
static struct swdevt should_be_malloced[NSWAPDEV];
struct swdevt *swdevt = should_be_malloced;	/* exported to pstat/systat */
static swblk_t nswap;		/* first block after the interleaved devs */
static struct mtx swap_mtx = MTX_INITIALIZER("swpmtx");
int nswdev = NSWAPDEV;				/* exported to pstat/systat */
swblk_t vm_swap_size;
swblk_t vm_swap_max;

static int swapoff_one(int index);
struct vnode *swapdev_vp;

/*
 * (struct vnode *a_vp, struct bio *a_bio)
 *
 * vn_strategy() for swapdev_vp.  Perform swap strategy interleave device
 * selection.
 *
 * This function supports the KVABIO API.  If the underlying vnode/device
 * does not, it will make appropriate adjustments.
 *
 * No requirements.
 */
static int
swapdev_strategy(struct vop_strategy_args *ap)
{
	struct bio *bio = ap->a_bio;
	struct bio *nbio;
	struct buf *bp = bio->bio_buf;
	swblk_t sz, off, seg, blkno, nblkno;
	int index;
	struct swdevt *sp;

	sz = howmany(bp->b_bcount, PAGE_SIZE);
	blkno = (swblk_t)(bio->bio_offset >> PAGE_SHIFT);

	/*
	 * Convert interleaved swap into per-device swap.  Note that
	 * the block size is left in PAGE_SIZE'd chunks (for the newswap)
	 * here.
	 */
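	/*
	 * Worked example (illustrative values only; SWB_DMMAX is
	 * machine-dependent): with nswdev = 2 and SWB_DMMAX = 32,
	 * interleaved blkno 100 yields off = 100 % 32 = 4 and
	 * seg = 100 / 32 = 3, so the request goes to device
	 * index = 3 % 2 = 1 at per-device segment 3 / 2 = 1,
	 * i.e. device page offset 1 * 32 + 4 = 36.
	 */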
	nbio = push_bio(bio);
	if (nswdev > 1) {
		off = blkno % SWB_DMMAX;
		if (off + sz > SWB_DMMAX) {
			bp->b_error = EINVAL;
			bp->b_flags |= B_ERROR;
			biodone(bio);
			return 0;
		}
		seg = blkno / SWB_DMMAX;
		index = seg % nswdev;
		seg /= nswdev;
		nbio->bio_offset = (off_t)(seg * SWB_DMMAX + off) << PAGE_SHIFT;
	} else {
		index = 0;
		nbio->bio_offset = bio->bio_offset;
	}
	nblkno = (swblk_t)(nbio->bio_offset >> PAGE_SHIFT);
	sp = &swdevt[index];
	if (nblkno + sz > sp->sw_nblks) {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		/* I/O was never started on nbio, must biodone(bio) */
		biodone(bio);
		return 0;
	}
	if (sp->sw_vp == NULL) {
		bp->b_error = ENODEV;
		bp->b_flags |= B_ERROR;
		/* I/O was never started on nbio, must biodone(bio) */
		biodone(bio);
		return 0;
	}

	/*
	 * Issue a strategy call on the appropriate swap vnode.  Note that
	 * bp->b_vp is not modified.  Strategy code is always supposed to
	 * use the passed vp.
	 *
	 * We have to use vn_strategy() here even if we know we have a
	 * device in order to properly break up requests which exceed the
	 * device's DMA limits.
	 */
	vn_strategy(sp->sw_vp, nbio);

	return 0;
}

static int
swapdev_inactive(struct vop_inactive_args *ap)
{
	vrecycle(ap->a_vp);
	return(0);
}

static int
swapdev_reclaim(struct vop_reclaim_args *ap)
{
	return(0);
}

/*
 * Create a special vnode op vector for swapdev_vp - we only use
 * vn_strategy(); inactive and reclaim are stubbed and everything
 * else falls through to vop_defaultop.
 */
static struct vop_ops swapdev_vnode_vops = {
	.vop_default =		vop_defaultop,
	.vop_strategy =		swapdev_strategy,
	.vop_inactive =		swapdev_inactive,
	.vop_reclaim =		swapdev_reclaim
};
static struct vop_ops *swapdev_vnode_vops_p = &swapdev_vnode_vops;

VNODEOP_SET(swapdev_vnode_vops);

/*
 * swapon_args(char *name)
 *
 * System call swapon(name) enables swapping on device name.
 * Returns EBUSY if swapping is already enabled on the device,
 * and EINVAL if no free slot remains in the swdevt array.
 *
 * No requirements.
 */
int
sys_swapon(struct swapon_args *uap)
{
	struct thread *td = curthread;
	struct vattr attr;
	struct vnode *vp;
	struct nlookupdata nd;
	int error;

	error = priv_check(td, PRIV_ROOT);
	if (error)
		return (error);

	mtx_lock(&swap_mtx);
	vp = NULL;
	error = nlookup_init(&nd, uap->name, UIO_USERSPACE, NLC_FOLLOW);
	if (error == 0)
		error = nlookup(&nd);
	if (error == 0)
		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
	nlookup_done(&nd);
	if (error) {
		mtx_unlock(&swap_mtx);
		return (error);
	}

	if (vn_isdisk(vp, &error)) {
		error = swaponvp(td, vp, 0);
	} else if (vp->v_type == VREG && vp->v_tag == VT_NFS &&
		   (error = VOP_GETATTR(vp, &attr)) == 0) {
		/*
		 * Allow direct swapping to NFS regular files in the same
		 * way that nfs_mountroot() sets up diskless swapping.
		 */
		error = swaponvp(td, vp, attr.va_size / DEV_BSIZE);
	}
	if (error)
		vrele(vp);
	mtx_unlock(&swap_mtx);

	return (error);
}

/*
 * swaponvp() enables swapping on the vnode, assigning it the first
 * free slot in the swap map.  Each of the nswdev devices provides
 * 1/nswdev'th of the swap space, which is laid out with blocks of
 * SWB_DMMAX pages circularly among the devices.
 *
 * The new swap code uses page-sized blocks.  The old swap code used
 * DEV_BSIZE'd chunks.
 *
 * XXX locking when multiple swapon's run in parallel
 */
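/*
 * Layout example (illustrative values only): with nswdev = 2 and
 * SWB_DMMAX = 32, per-device stripe dvbase on device index begins at
 * interleaved page vsbase = index * SWB_DMMAX + dvbase * nswdev, so
 * device 0 owns interleaved pages [0,32) and [64,96) while device 1
 * owns [32,64) and [96,128), and so on round-robin.
 */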
int
swaponvp(struct thread *td, struct vnode *vp, u_quad_t nblks)
{
	swblk_t aligned_nblks;
	int64_t dpsize;
	struct ucred *cred;
	struct swdevt *sp;
	swblk_t vsbase;
	swblk_t dvbase;
	cdev_t dev;
	int index;
	int error;
	swblk_t blk;

	cred = td->td_ucred;

	lwkt_gettoken(&vm_token);	/* needed for vm_swap_size and blist */
	mtx_lock(&swap_mtx);

	/*
	 * Setup swapdev_vp.  We support the KVABIO API for this vnode's
	 * strategy function.
	 */
	if (!swapdev_vp) {
		error = getspecialvnode(VT_NON, NULL, &swapdev_vnode_vops_p,
				    &swapdev_vp, 0, 0);
		if (error)
			panic("Cannot get vnode for swapdev");
		swapdev_vp->v_type = VNON;	/* Untyped */
		vsetflags(swapdev_vp, VKVABIO);
		vx_unlock(swapdev_vp);
	}

	for (sp = swdevt, index = 0; index < nswdev; index++, sp++) {
		if (sp->sw_vp == vp) {
			error = EBUSY;
			goto done;
		}
		if (!sp->sw_vp)
			goto found;
	}
	error = EINVAL;
	goto done;
found:
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = VOP_OPEN(vp, FREAD | FWRITE, cred, NULL);
	vn_unlock(vp);
	if (error)
		goto done;

	/*
	 * v_rdev is not valid until after the VOP_OPEN() call.  dev_dpsize()
	 * must be supported if a character device has been specified.
	 */
	if (vp->v_type == VCHR)
		dev = vp->v_rdev;
	else
		dev = NULL;

	if (nblks == 0 && dev != NULL) {
		dpsize = dev_dpsize(dev);
		if (dpsize == -1) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NULL);
			vn_unlock(vp);
			error = ENXIO;
			goto done;
		}
		nblks = (u_quad_t)dpsize;
	}
	if (nblks == 0) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NULL);
		vn_unlock(vp);
		error = ENXIO;
		goto done;
	}

	/*
	 * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks.
	 * First chop nblks off to page-align it, then convert.
	 *
	 * sp->sw_nblks is in page-sized chunks now too.
	 */
	nblks &= ~(u_quad_t)(ctodb(1) - 1);
	nblks = dbtoc(nblks);
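	/*
	 * Worked example (assuming PAGE_SIZE = 4096 and DEV_BSIZE = 512,
	 * so ctodb(1) = 8): nblks = 1000003 sectors is first masked down
	 * to 1000000, then dbtoc() converts it to 125000 pages.
	 */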

	/*
	 * Post-conversion nblks must not be >= BLIST_MAXBLKS, and
	 * we impose a 4-swap-device limit so we have to divide it out
	 * further.  Going beyond this will result in overflows in the
	 * blist code.
	 *
	 * Post-conversion nblks must fit within a (swblk_t), which
	 * this test also ensures.
	 */
	if (nblks > BLIST_MAXBLKS / nswdev) {
		kprintf("exceeded maximum of %ld blocks per swap unit\n",
			(long)BLIST_MAXBLKS / nswdev);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NULL);
		vn_unlock(vp);
		error = ENXIO;
		goto done;
	}

	sp->sw_vp = vp;
	sp->sw_dev = dev2udev(dev);
	sp->sw_device = dev;
	sp->sw_flags = SW_FREED;
	sp->sw_nused = 0;

	/*
	 * nblks, nswap, and SWB_DMMAX are PAGE_SIZE'd parameters now, not
	 * DEV_BSIZE'd.  aligned_nblks is used to calculate the
	 * size of the swap bitmap, taking into account the stripe size.
	 */
	aligned_nblks = (swblk_t)((nblks + SWB_DMMASK) &
				  ~(u_swblk_t)SWB_DMMASK);
	sp->sw_nblks = aligned_nblks;
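	/*
	 * E.g. (illustrative, SWB_DMMAX = 32 so SWB_DMMASK = 31):
	 * nblks = 125000 pages rounds up to aligned_nblks = 125024,
	 * a whole number (3907) of SWB_DMMAX stripes.
	 */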

	if (aligned_nblks * nswdev > nswap)
		nswap = aligned_nblks * nswdev;

	if (swapblist == NULL)
		swapblist = blist_create(nswap);
	else
		blist_resize(&swapblist, nswap, 0);

	for (dvbase = SWB_DMMAX; dvbase < aligned_nblks; dvbase += SWB_DMMAX) {
		blk = min(aligned_nblks - dvbase, SWB_DMMAX);
		vsbase = index * SWB_DMMAX + dvbase * nswdev;
		blist_free(swapblist, vsbase, blk);
		vm_swap_size += blk;
		vm_swap_max += blk;
	}
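	/*
	 * Note: dvbase starts at SWB_DMMAX rather than 0 above, so the
	 * first stripe of each device is never entered into the free map
	 * and the front of the device (e.g. a disklabel) is never used
	 * for paging.
	 */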
	swap_pager_newswap();
	error = 0;
done:
	mtx_unlock(&swap_mtx);
	lwkt_reltoken(&vm_token);
	return (error);
}

/*
 * swapoff_args(char *name)
 *
 * System call swapoff(name) disables swapping on device name,
 * which must be an active swap device. Return ENOMEM
 * if there is not enough memory to page in the contents of
 * the given device.
 *
 * No requirements.
 */
int
sys_swapoff(struct swapoff_args *uap)
{
	struct vnode *vp;
	struct nlookupdata nd;
	struct swdevt *sp;
	int error, index;

	error = priv_check(curthread, PRIV_ROOT);
	if (error)
		return (error);

	mtx_lock(&swap_mtx);
	vp = NULL;
	error = nlookup_init(&nd, uap->name, UIO_USERSPACE, NLC_FOLLOW);
	if (error == 0)
		error = nlookup(&nd);
	if (error == 0)
		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
	nlookup_done(&nd);
	if (error)
		goto done;

	for (sp = swdevt, index = 0; index < nswdev; index++, sp++) {
		if (sp->sw_vp == vp)
			goto found;
	}
	error = EINVAL;
	goto done;
found:
	error = swapoff_one(index);
	swap_pager_newswap();

done:
	mtx_unlock(&swap_mtx);
	return (error);
}

static int
swapoff_one(int index)
{
	swblk_t blk, aligned_nblks;
	swblk_t dvbase, vsbase;
	u_int pq_active_clean, pq_inactive_clean;
	struct swdevt *sp;
	struct vm_page marker;
	vm_page_t m;
	int q;

	mtx_lock(&swap_mtx);

	sp = &swdevt[index];
	aligned_nblks = sp->sw_nblks;
	pq_active_clean = pq_inactive_clean = 0;

	/*
	 * We can turn off this swap device safely only if the
	 * available virtual memory in the system will fit the amount
	 * of data we will have to page back in, plus an epsilon so
	 * the system doesn't become critically low on swap space.
	 */
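	/*
	 * Count clean pages on the active and inactive queues; they can
	 * be reclaimed without I/O, so they count toward the memory
	 * available for paging this device's data back in.  A dummy
	 * marker page tracks the scan position so the queue spinlock
	 * can be dropped while each candidate page is examined.
	 */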
	for (q = 0; q < PQ_L2_SIZE; ++q) {
		bzero(&marker, sizeof(marker));
		marker.flags = PG_FICTITIOUS | PG_MARKER;
		marker.busy_count = PBUSY_LOCKED;
		marker.queue = PQ_ACTIVE + q;
		marker.pc = q;
		marker.wire_count = 1;

		vm_page_queues_spin_lock(marker.queue);
		TAILQ_INSERT_HEAD(&vm_page_queues[marker.queue].pl,
				  &marker, pageq);

		while ((m = TAILQ_NEXT(&marker, pageq)) != NULL) {
			TAILQ_REMOVE(&vm_page_queues[marker.queue].pl,
				     &marker, pageq);
			TAILQ_INSERT_AFTER(&vm_page_queues[marker.queue].pl, m,
					   &marker, pageq);
			if (m->flags & (PG_MARKER | PG_FICTITIOUS))
				continue;

			if (vm_page_busy_try(m, FALSE) == 0) {
				vm_page_queues_spin_unlock(marker.queue);
				if (m->dirty == 0) {
					vm_page_test_dirty(m);
					if (m->dirty == 0)
						++pq_active_clean;
				}
				vm_page_wakeup(m);
				vm_page_queues_spin_lock(marker.queue);
			}
		}
		TAILQ_REMOVE(&vm_page_queues[marker.queue].pl, &marker, pageq);
		vm_page_queues_spin_unlock(marker.queue);

		marker.queue = PQ_INACTIVE + q;
		marker.pc = q;
		vm_page_queues_spin_lock(marker.queue);
		TAILQ_INSERT_HEAD(&vm_page_queues[marker.queue].pl,
				  &marker, pageq);

		while ((m = TAILQ_NEXT(&marker, pageq)) != NULL) {
			TAILQ_REMOVE(&vm_page_queues[marker.queue].pl,
				     &marker, pageq);
			TAILQ_INSERT_AFTER(&vm_page_queues[marker.queue].pl, m,
					   &marker, pageq);
			if (m->flags & (PG_MARKER | PG_FICTITIOUS))
				continue;

			if (vm_page_busy_try(m, FALSE) == 0) {
				vm_page_queues_spin_unlock(marker.queue);
				if (m->dirty == 0) {
					vm_page_test_dirty(m);
					if (m->dirty == 0)
						++pq_inactive_clean;
				}
				vm_page_wakeup(m);
				vm_page_queues_spin_lock(marker.queue);
			}
		}
		TAILQ_REMOVE(&vm_page_queues[marker.queue].pl, &marker, pageq);
		vm_page_queues_spin_unlock(marker.queue);
	}

	if (vmstats.v_free_count + vmstats.v_cache_count + pq_active_clean +
	    pq_inactive_clean + vm_swap_size < aligned_nblks + nswap_lowat) {
		mtx_unlock(&swap_mtx);
		return (ENOMEM);
	}

	/*
	 * Prevent further allocations on this device.
	 */
	sp->sw_flags |= SW_CLOSING;
	for (dvbase = SWB_DMMAX; dvbase < aligned_nblks; dvbase += SWB_DMMAX) {
		blk = min(aligned_nblks - dvbase, SWB_DMMAX);
		vsbase = index * SWB_DMMAX + dvbase * nswdev;
		vm_swap_size -= blist_fill(swapblist, vsbase, blk);
		vm_swap_max -= blk;
	}

	/*
	 * Page in the contents of the device and close it.
	 */
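	/*
	 * swap_pager_swapoff() returns non-zero on failure; the second
	 * call below retries the pagein once before giving up with EINTR.
	 */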
	if (swap_pager_swapoff(index) && swap_pager_swapoff(index)) {
		mtx_unlock(&swap_mtx);
		return (EINTR);
	}

	vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY);
	VOP_CLOSE(sp->sw_vp, FREAD | FWRITE, NULL);
	vn_unlock(sp->sw_vp);
	vrele(sp->sw_vp);
	bzero(swdevt + index, sizeof(struct swdevt));

	/*
	 * Resize the bitmap based on the new largest swap device,
	 * or free the bitmap if there are no more devices.
	 */
	for (sp = swdevt, aligned_nblks = 0; sp < swdevt + nswdev; sp++) {
		if (sp->sw_vp)
			aligned_nblks = max(aligned_nblks, sp->sw_nblks);
	}

	nswap = aligned_nblks * nswdev;

	if (nswap == 0) {
		blist_destroy(swapblist);
		swapblist = NULL;
		vrele(swapdev_vp);
		swapdev_vp = NULL;
	} else {
		blist_resize(&swapblist, nswap, 0);
	}

	mtx_unlock(&swap_mtx);
	return (0);
}

/*
 * Account for swap space in individual swdevt's.  The caller ensures
 * that the provided range falls into a single swdevt.
 *
 * +count	space freed
 * -count	space allocated
 */
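/*
 * E.g. (illustrative, nswdev = 2, SWB_DMMAX = 32): freeing count
 * pages at interleaved base 36 gives seg = 36 / 32 = 1 and
 * index = 1 % 2 = 1, so device 1's sw_nused is decremented.
 */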
void
swapacctspace(swblk_t base, swblk_t count)
{
	int index;
	swblk_t seg;

	vm_swap_size += count;
	seg = base / SWB_DMMAX;
	index = seg % nswdev;
	swdevt[index].sw_nused -= count;
}

/*
 * Retrieve swap info
 */
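/*
 * One struct xswdev is emitted per device slot (nswdev total), so a
 * reader can size its buffer as nswdev * sizeof(struct xswdev).
 * Block counts are in PAGE_SIZE units, as advertised by xsw_blksize.
 */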
static int
sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS)
{
	struct xswdev xs;
	struct swdevt *sp;
	int	error;
	int	n;

	error = 0;
	for (n = 0; n < nswdev; ++n) {
		sp = &swdevt[n];

		xs.xsw_size = sizeof(xs);
		xs.xsw_version = XSWDEV_VERSION;
		xs.xsw_blksize = PAGE_SIZE;
		xs.xsw_dev = sp->sw_dev;
		xs.xsw_flags = sp->sw_flags;
		xs.xsw_nblks = sp->sw_nblks;
		xs.xsw_used = sp->sw_nused;

		error = SYSCTL_OUT(req, &xs, sizeof(xs));
		if (error)
			break;
	}
	return (error);
}

SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswdev, 0,
	   "Number of swap devices");
SYSCTL_NODE(_vm, OID_AUTO, swap_info_array, CTLFLAG_RD, sysctl_vm_swap_info,
	    "Swap statistics by device");