xref: /netbsd/sys/uvm/uvm_swap.c (revision 8a204295)
1 /*	$NetBSD: uvm_swap.c,v 1.208 2023/04/09 09:00:56 riastradh Exp $	*/
2 
3 /*
4  * Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
21  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
23  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
29  * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
30  */
31 
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.208 2023/04/09 09:00:56 riastradh Exp $");
34 
35 #include "opt_uvmhist.h"
36 #include "opt_compat_netbsd.h"
37 #include "opt_ddb.h"
38 #include "opt_vmswap.h"
39 
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/atomic.h>
43 #include <sys/buf.h>
44 #include <sys/bufq.h>
45 #include <sys/conf.h>
46 #include <sys/cprng.h>
47 #include <sys/proc.h>
48 #include <sys/namei.h>
49 #include <sys/disklabel.h>
50 #include <sys/errno.h>
51 #include <sys/kernel.h>
52 #include <sys/vnode.h>
53 #include <sys/file.h>
54 #include <sys/vmem.h>
55 #include <sys/blist.h>
56 #include <sys/mount.h>
57 #include <sys/pool.h>
58 #include <sys/kmem.h>
59 #include <sys/syscallargs.h>
60 #include <sys/swap.h>
61 #include <sys/kauth.h>
62 #include <sys/sysctl.h>
63 #include <sys/workqueue.h>
64 
65 #include <uvm/uvm.h>
66 
67 #include <miscfs/specfs/specdev.h>
68 
69 #include <crypto/aes/aes.h>
70 #include <crypto/aes/aes_cbc.h>
71 
72 /*
73  * uvm_swap.c: manage configuration and i/o to swap space.
74  */
75 
76 /*
77  * swap space is managed in the following way:
78  *
79  * each swap partition or file is described by a "swapdev" structure.
80  * each "swapdev" structure contains a "swapent" structure which contains
81  * information that is passed up to the user (via system calls).
82  *
83  * each swap partition is assigned a "priority" (int) which controls
84  * swap partition usage.
85  *
86  * the system maintains a global data structure describing all swap
87  * partitions/files.   there is a sorted LIST of "swappri" structures
88  * which describe "swapdev"'s at that priority.   this LIST is headed
89  * by the "swap_priority" global var.    each "swappri" contains a
90  * TAILQ of "swapdev" structures at that priority.
91  *
92  * locking:
93  *  - swap_syscall_lock (krwlock_t): this lock serializes the swapctl
94  *    system call and prevents the swap priority list from changing
95  *    while we are in the middle of a system call (e.g. SWAP_STATS).
96  *  - uvm_swap_data_lock (kmutex_t): this lock protects all swap data
97  *    structures including the priority list, the swapdev structures,
98  *    and the swapmap arena.
99  *
100  * each swap device has the following info:
101  *  - swap device in use (could be disabled, preventing future use)
102  *  - swap enabled (allows new allocations on swap)
103  *  - map info in /dev/drum
104  *  - vnode pointer
105  * for swap files only:
106  *  - block size
107  *  - max byte count in buffer
108  *  - buffer
109  *
110  * userland controls and configures swap with the swapctl(2) system call.
111  * the sys_swapctl performs the following operations:
112  *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
113  *  [2] SWAP_STATS: given a pointer to an array of swapent structures
114  *	(passed in via "arg") of a size passed in via "misc" ... we load
115  *	the current swap config into the array. The actual work is done
116  *	in the uvm_swap_stats() function.
117  *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
118  *	priority in "misc", start swapping on it.
119  *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
120  *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
121  *	"misc")
122  */
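
The swapctl(2) operations listed above can also be exercised from userland.  The following is a minimal sketch (not part of this file; headers and error handling follow the swapctl(2) manual and are abbreviated) that counts the configured devices with SWAP_NSWAP and dumps per-device statistics with SWAP_STATS:

/* userland sketch: query swap configuration via swapctl(2) */
#include <sys/types.h>
#include <sys/swap.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	int i, nswap;
	struct swapent *sep;

	/* SWAP_NSWAP: how many swap devices are configured? */
	nswap = swapctl(SWAP_NSWAP, NULL, 0);
	if (nswap <= 0)
		return 0;

	/* SWAP_STATS: fill an array of swapent structures. */
	sep = calloc(nswap, sizeof(*sep));
	if (sep == NULL)
		return 1;
	nswap = swapctl(SWAP_STATS, sep, nswap);
	for (i = 0; i < nswap; i++)
		printf("%s: %d blocks, %d in use, priority %d\n",
		    sep[i].se_path, sep[i].se_nblks, sep[i].se_inuse,
		    sep[i].se_priority);
	free(sep);
	return 0;
}

The privileged operations (SWAP_ON, SWAP_OFF, SWAP_CTL) use the same calling convention, with a pathname in "arg" and the priority passed in "misc" where applicable.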
123 
124 /*
125  * swapdev: describes a single swap partition/file
126  *
127  * note the following should be true:
128  * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
129  * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
130  */
131 struct swapdev {
132 	dev_t			swd_dev;	/* device id */
133 	int			swd_flags;	/* flags:inuse/enable/fake */
134 	int			swd_priority;	/* our priority */
135 	int			swd_nblks;	/* blocks in this device */
136 	char			*swd_path;	/* saved pathname of device */
137 	int			swd_pathlen;	/* length of pathname */
138 	int			swd_npages;	/* #pages we can use */
139 	int			swd_npginuse;	/* #pages in use */
140 	int			swd_npgbad;	/* #pages bad */
141 	int			swd_drumoffset;	/* page0 offset in drum */
142 	int			swd_drumsize;	/* #pages in drum */
143 	blist_t			swd_blist;	/* blist for this swapdev */
144 	struct vnode		*swd_vp;	/* backing vnode */
145 	TAILQ_ENTRY(swapdev)	swd_next;	/* priority tailq */
146 
147 	int			swd_bsize;	/* blocksize (bytes) */
148 	int			swd_maxactive;	/* max active i/o reqs */
149 	struct bufq_state	*swd_tab;	/* buffer list */
150 	int			swd_active;	/* number of active buffers */
151 
152 	volatile uint32_t	*swd_encmap;	/* bitmap of encrypted slots */
153 	struct aesenc		swd_enckey;	/* AES key expanded for enc */
154 	struct aesdec		swd_deckey;	/* AES key expanded for dec */
155 	bool			swd_encinit;	/* true if keys initialized */
156 };
157 
158 /*
159  * swap device priority entry; the list is kept sorted on `spi_priority'.
160  */
161 struct swappri {
162 	int			spi_priority;     /* priority */
163 	TAILQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
164 	/* tailq of swapdevs at this priority */
165 	LIST_ENTRY(swappri)	spi_swappri;      /* global list of pri's */
166 };
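
As a standalone illustration of the two-level layout described in the big comment above (a LIST of priorities, each holding a TAILQ of swap devices), the sketch below builds and walks the same shape with the queue(3) macros; all names, priorities and paths are invented:

/* userland sketch of the priority-list / device-tailq layout */
#include <sys/queue.h>
#include <stdio.h>

struct dev {
	const char *path;
	TAILQ_ENTRY(dev) next;
};

struct pri {
	int priority;
	TAILQ_HEAD(, dev) devs;		/* devices at this priority */
	LIST_ENTRY(pri) link;		/* global list of priorities */
};

int
main(void)
{
	LIST_HEAD(, pri) prilist = LIST_HEAD_INITIALIZER(prilist);
	struct pri p0 = { .priority = 0 }, p1 = { .priority = 1 };
	struct dev d0 = { .path = "/dev/wd0b" }, d1 = { .path = "/swapfile" };
	struct pri *pp;
	struct dev *dp;

	TAILQ_INIT(&p0.devs);
	TAILQ_INIT(&p1.devs);
	TAILQ_INSERT_TAIL(&p0.devs, &d0, next);
	TAILQ_INSERT_TAIL(&p1.devs, &d1, next);
	/* keep the list sorted: lowest priority value first */
	LIST_INSERT_HEAD(&prilist, &p1, link);
	LIST_INSERT_HEAD(&prilist, &p0, link);

	LIST_FOREACH(pp, &prilist, link)
		TAILQ_FOREACH(dp, &pp->devs, next)
			printf("priority %d: %s\n", pp->priority, dp->path);
	return 0;
}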
167 
168 /*
169  * The following two structures are used to keep track of data transfers
170  * on swap devices associated with regular files.
171  * NOTE: this code is more or less a copy of vnd.c; we use the same
172  * structure names here to ease porting.
173  */
174 struct vndxfer {
175 	struct buf	*vx_bp;		/* Pointer to parent buffer */
176 	struct swapdev	*vx_sdp;
177 	int		vx_error;
178 	int		vx_pending;	/* # of pending aux buffers */
179 	int		vx_flags;
180 #define VX_BUSY		1
181 #define VX_DEAD		2
182 };
183 
184 struct vndbuf {
185 	struct buf	vb_buf;
186 	struct vndxfer	*vb_xfer;
187 };
188 
189 /*
190  * We keep a of pool vndbuf's and vndxfer structures.
191  * We keep a pool of vndbuf's and vndxfer structures.
192 static struct pool vndxfer_pool, vndbuf_pool;
193 
194 /*
195  * local variables
196  */
197 static vmem_t *swapmap;	/* controls the mapping of /dev/drum */
198 
199 /* list of all active swap devices [by priority] */
200 LIST_HEAD(swap_priority, swappri);
201 static struct swap_priority swap_priority;
202 
203 /* locks */
204 static kmutex_t uvm_swap_data_lock __cacheline_aligned;
205 static krwlock_t swap_syscall_lock;
206 bool uvm_swap_init_done = false;
207 
208 /* workqueue and use counter for swap to regular files */
209 static int sw_reg_count = 0;
210 static struct workqueue *sw_reg_workqueue;
211 
212 /* tuneables */
213 u_int uvm_swapisfull_factor = 99;
214 #if VMSWAP_DEFAULT_PLAINTEXT
215 bool uvm_swap_encrypt = false;
216 #else
217 bool uvm_swap_encrypt = true;
218 #endif
219 
220 /*
221  * prototypes
222  */
223 static struct swapdev	*swapdrum_getsdp(int);
224 
225 static struct swapdev	*swaplist_find(struct vnode *, bool);
226 static void		 swaplist_insert(struct swapdev *,
227 					 struct swappri *, int);
228 static void		 swaplist_trim(void);
229 
230 static int swap_on(struct lwp *, struct swapdev *);
231 static int swap_off(struct lwp *, struct swapdev *);
232 
233 static void sw_reg_strategy(struct swapdev *, struct buf *, int);
234 static void sw_reg_biodone(struct buf *);
235 static void sw_reg_iodone(struct work *wk, void *dummy);
236 static void sw_reg_start(struct swapdev *);
237 
238 static int uvm_swap_io(struct vm_page **, int, int, int);
239 
240 static void uvm_swap_genkey(struct swapdev *);
241 static void uvm_swap_encryptpage(struct swapdev *, void *, int);
242 static void uvm_swap_decryptpage(struct swapdev *, void *, int);
243 
244 static size_t
245 encmap_size(size_t npages)
246 {
247 	struct swapdev *sdp;
248 	const size_t bytesperword = sizeof(sdp->swd_encmap[0]);
249 	const size_t bitsperword = NBBY * bytesperword;
250 	const size_t nbits = npages; /* one bit for each page */
251 	const size_t nwords = howmany(nbits, bitsperword);
252 	const size_t nbytes = nwords * bytesperword;
253 
254 	return nbytes;
255 }
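
The bitmap costs one bit per swap page: for example, a hypothetical 4 GiB swap device with 4 KiB pages has 1048576 pages and therefore needs 32768 uint32_t words, i.e. 128 KiB.  A standalone sketch of the same arithmetic (NBBY and howmany() as defined in <sys/param.h>):

/* userland sketch of the encrypted-slot bitmap sizing */
#include <sys/param.h>	/* NBBY, howmany() */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* a 4 GiB device with 4 KiB pages; figures invented for scale */
	const size_t npages = (size_t)((4ULL << 30) / 4096);
	const size_t bytesperword = sizeof(uint32_t);
	const size_t bitsperword = NBBY * bytesperword;
	const size_t nbytes = howmany(npages, bitsperword) * bytesperword;

	printf("%zu pages -> %zu bytes of encryption bitmap\n",
	    npages, nbytes);
	return 0;
}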
256 
257 /*
258  * uvm_swap_init: init the swap system data structures and locks
259  *
260  * => called at boot time from init_main.c after the filesystems
261  *	are brought up (which happens after uvm_init())
262  */
263 void
264 uvm_swap_init(void)
265 {
266 	UVMHIST_FUNC(__func__);
267 
268 	UVMHIST_CALLED(pdhist);
269 	/*
270 	 * first, init the swap list, its counter, and its lock.
271 	 * then get a handle on the vnode for /dev/drum by using
272 	 * the its dev_t number ("swapdev", from MD conf.c).
273 	 */
274 
275 	LIST_INIT(&swap_priority);
276 	uvmexp.nswapdev = 0;
277 	rw_init(&swap_syscall_lock);
278 	mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);
279 
280 	if (bdevvp(swapdev, &swapdev_vp))
281 		panic("%s: can't get vnode for swap device", __func__);
282 	if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY))
283 		panic("%s: can't lock swap device", __func__);
284 	if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED))
285 		panic("%s: can't open swap device", __func__);
286 	VOP_UNLOCK(swapdev_vp);
287 
288 	/*
289 	 * create swap block resource map to map /dev/drum.   the range
290 	 * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
291 	 * that block 0 is reserved (used to indicate an allocation
292 	 * failure, or no allocation).
293 	 */
294 	swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0,
295 	    VM_NOSLEEP, IPL_NONE);
296 	if (swapmap == 0) {
297 		panic("%s: vmem_create failed", __func__);
298 	}
299 
300 	pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx",
301 	    NULL, IPL_BIO);
302 	pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd",
303 	    NULL, IPL_BIO);
304 
305 	uvm_swap_init_done = true;
306 
307 	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
308 }
309 
310 /*
311  * swaplist functions: functions that operate on the list of swap
312  * devices on the system.
313  */
314 
315 /*
316  * swaplist_insert: insert swap device "sdp" into the global list
317  *
318  * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
319  * => caller must provide a newly allocated swappri structure (we will
320  *	FREE it if we don't need it... this is to prevent allocation
321  *	blocking here while adding swap)
322  */
323 static void
324 swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
325 {
326 	struct swappri *spp, *pspp;
327 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
328 
329 	KASSERT(rw_write_held(&swap_syscall_lock));
330 	KASSERT(mutex_owned(&uvm_swap_data_lock));
331 
332 	/*
333 	 * find entry at or after which to insert the new device.
334 	 */
335 	pspp = NULL;
336 	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
337 		if (priority <= spp->spi_priority)
338 			break;
339 		pspp = spp;
340 	}
341 
342 	/*
343 	 * new priority?
344 	 */
345 	if (spp == NULL || spp->spi_priority != priority) {
346 		spp = newspp;  /* use newspp! */
347 		UVMHIST_LOG(pdhist, "created new swappri = %jd",
348 			    priority, 0, 0, 0);
349 
350 		spp->spi_priority = priority;
351 		TAILQ_INIT(&spp->spi_swapdev);
352 
353 		if (pspp)
354 			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
355 		else
356 			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
357 	} else {
358 	  	/* we don't need a new priority structure, free it */
359 		kmem_free(newspp, sizeof(*newspp));
360 	}
361 
362 	/*
363 	 * priority found (or created).   now insert on the priority's
364 	 * tailq list and bump the total number of swapdevs.
365 	 */
366 	sdp->swd_priority = priority;
367 	TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
368 	uvmexp.nswapdev++;
369 }
370 
371 /*
372  * swaplist_find: find and optionally remove a swap device from the
373  *	global list.
374  *
375  * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
376  * => we return the swapdev we found (and removed)
377  */
378 static struct swapdev *
379 swaplist_find(struct vnode *vp, bool remove)
380 {
381 	struct swapdev *sdp;
382 	struct swappri *spp;
383 
384 	KASSERT(rw_lock_held(&swap_syscall_lock));
385 	KASSERT(remove ? rw_write_held(&swap_syscall_lock) : 1);
386 	KASSERT(mutex_owned(&uvm_swap_data_lock));
387 
388 	/*
389 	 * search the lists for the requested vp
390 	 */
391 
392 	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
393 		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
394 			if (sdp->swd_vp == vp) {
395 				if (remove) {
396 					TAILQ_REMOVE(&spp->spi_swapdev,
397 					    sdp, swd_next);
398 					uvmexp.nswapdev--;
399 				}
400 				return(sdp);
401 			}
402 		}
403 	}
404 	return (NULL);
405 }
406 
407 /*
408  * swaplist_trim: scan priority list for empty priority entries and kill
409  *	them.
410  *
411  * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
412  */
413 static void
414 swaplist_trim(void)
415 {
416 	struct swappri *spp, *nextspp;
417 
418 	KASSERT(rw_write_held(&swap_syscall_lock));
419 	KASSERT(mutex_owned(&uvm_swap_data_lock));
420 
421 	LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) {
422 		if (!TAILQ_EMPTY(&spp->spi_swapdev))
423 			continue;
424 		LIST_REMOVE(spp, spi_swappri);
425 		kmem_free(spp, sizeof(*spp));
426 	}
427 }
428 
429 /*
430  * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
431  *	to the "swapdev" that maps that section of the drum.
432  *
433  * => each swapdev takes one big contig chunk of the drum
434  * => caller must hold uvm_swap_data_lock
435  */
436 static struct swapdev *
437 swapdrum_getsdp(int pgno)
438 {
439 	struct swapdev *sdp;
440 	struct swappri *spp;
441 
442 	KASSERT(mutex_owned(&uvm_swap_data_lock));
443 
444 	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
445 		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
446 			if (sdp->swd_flags & SWF_FAKE)
447 				continue;
448 			if (pgno >= sdp->swd_drumoffset &&
449 			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
450 				return sdp;
451 			}
452 		}
453 	}
454 	return NULL;
455 }
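
Each device therefore owns the half-open drum range [swd_drumoffset, swd_drumoffset + swd_drumsize), and the device-relative slot is simply pgno - swd_drumoffset, as used later by swstrategy() and uvm_swap_free().  A standalone sketch of that translation, with invented offsets:

/* userland sketch of the drum-offset translation */
#include <stdio.h>

struct drumrange {
	const char *path;
	int drumoffset;	/* first drum page owned by this device */
	int drumsize;	/* number of drum pages owned */
};

int
main(void)
{
	/* two example devices and a page number, invented for illustration */
	struct drumrange devs[] = {
		{ "/dev/wd0b", 1, 262144 },
		{ "/my/swapfile", 262145, 131072 },
	};
	int pgno = 300000;	/* hypothetical drum page number */

	for (size_t i = 0; i < sizeof(devs) / sizeof(devs[0]); i++) {
		if (pgno >= devs[i].drumoffset &&
		    pgno < devs[i].drumoffset + devs[i].drumsize) {
			printf("drum page %d -> %s page %d\n", pgno,
			    devs[i].path, pgno - devs[i].drumoffset);
			break;
		}
	}
	return 0;
}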
456 
457 /*
458  * swapdrum_sdp_is: true iff the swap device for pgno is sdp
459  *
460  * => for use in positive assertions only; result is not stable
461  */
462 static bool __debugused
463 swapdrum_sdp_is(int pgno, struct swapdev *sdp)
464 {
465 	bool result;
466 
467 	mutex_enter(&uvm_swap_data_lock);
468 	result = swapdrum_getsdp(pgno) == sdp;
469 	mutex_exit(&uvm_swap_data_lock);
470 
471 	return result;
472 }
473 
474 void swapsys_lock(krw_t op)
475 {
476 	rw_enter(&swap_syscall_lock, op);
477 }
478 
479 void swapsys_unlock(void)
480 {
481 	rw_exit(&swap_syscall_lock);
482 }
483 
484 static void
485 swapent_cvt(struct swapent *se, const struct swapdev *sdp, int inuse)
486 {
487 	se->se_dev = sdp->swd_dev;
488 	se->se_flags = sdp->swd_flags;
489 	se->se_nblks = sdp->swd_nblks;
490 	se->se_inuse = inuse;
491 	se->se_priority = sdp->swd_priority;
492 	KASSERT(sdp->swd_pathlen < sizeof(se->se_path));
493 	strcpy(se->se_path, sdp->swd_path);
494 }
495 
496 int (*uvm_swap_stats13)(const struct sys_swapctl_args *, register_t *) =
497     (void *)enosys;
498 int (*uvm_swap_stats50)(const struct sys_swapctl_args *, register_t *) =
499     (void *)enosys;
500 
501 /*
502  * sys_swapctl: main entry point for swapctl(2) system call
503  * 	[with two helper functions: swap_on and swap_off]
504  */
505 int
506 sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval)
507 {
508 	/* {
509 		syscallarg(int) cmd;
510 		syscallarg(void *) arg;
511 		syscallarg(int) misc;
512 	} */
513 	struct vnode *vp;
514 	struct nameidata nd;
515 	struct swappri *spp;
516 	struct swapdev *sdp;
517 #define SWAP_PATH_MAX (PATH_MAX + 1)
518 	char	*userpath;
519 	size_t	len = 0;
520 	int	error;
521 	int	priority;
522 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
523 
524 	/*
525 	 * we handle the non-priv NSWAP and STATS request first.
526 	 *
527 	 * SWAP_NSWAP: return number of config'd swap devices
528 	 * [can also be obtained with uvmexp sysctl]
529 	 */
530 	if (SCARG(uap, cmd) == SWAP_NSWAP) {
531 		const int nswapdev = uvmexp.nswapdev;
532 		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%jd", nswapdev,
533 		    0, 0, 0);
534 		*retval = nswapdev;
535 		return 0;
536 	}
537 
538 	userpath = kmem_alloc(SWAP_PATH_MAX, KM_SLEEP);
539 
540 	/*
541 	 * ensure serialized syscall access by grabbing the swap_syscall_lock
542 	 */
543 	rw_enter(&swap_syscall_lock, RW_WRITER);
544 
545 	/*
546 	 * SWAP_STATS: get stats on current # of configured swap devs
547 	 *
548 	 * note that the swap_priority list can't change as long
549 	 * as we are holding the swap_syscall_lock.  we don't want
550 	 * to grab the uvm_swap_data_lock because we may fault&sleep during
551 	 * copyout() and we don't want to be holding that lock then!
552 	 */
553 	switch (SCARG(uap, cmd)) {
554 	case SWAP_STATS13:
555 		error = (*uvm_swap_stats13)(uap, retval);
556 		goto out;
557 	case SWAP_STATS50:
558 		error = (*uvm_swap_stats50)(uap, retval);
559 		goto out;
560 	case SWAP_STATS:
561 		error = uvm_swap_stats(SCARG(uap, arg), SCARG(uap, misc),
562 		    NULL, sizeof(struct swapent), retval);
563 		UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
564 		goto out;
565 
566 	case SWAP_GETDUMPDEV:
567 		error = copyout(&dumpdev, SCARG(uap, arg), sizeof(dumpdev));
568 		goto out;
569 	default:
570 		break;
571 	}
572 
573 	/*
574 	 * all other requests require superuser privs.   verify.
575 	 */
576 	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL,
577 	    0, NULL, NULL, NULL)))
578 		goto out;
579 
580 	if (SCARG(uap, cmd) == SWAP_DUMPOFF) {
581 		/* drop the current dump device */
582 		dumpdev = NODEV;
583 		dumpcdev = NODEV;
584 		cpu_dumpconf();
585 		goto out;
586 	}
587 
588 	/*
589 	 * at this point we expect a path name in arg.   we will
590 	 * use namei() to gain a vnode reference (vref), and lock
591 	 * the vnode (VOP_LOCK).
592 	 *
593 	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
594 	 * miniroot)
595 	 */
596 	if (SCARG(uap, arg) == NULL) {
597 		vp = rootvp;		/* miniroot */
598 		vref(vp);
599 		if (vn_lock(vp, LK_EXCLUSIVE)) {
600 			vrele(vp);
601 			error = EBUSY;
602 			goto out;
603 		}
604 		if (SCARG(uap, cmd) == SWAP_ON &&
605 		    copystr("miniroot", userpath, SWAP_PATH_MAX, &len))
606 			panic("swapctl: miniroot copy failed");
607 	} else {
608 		struct pathbuf *pb;
609 
610 		/*
611 		 * This used to allow copying in one extra byte
612 		 * (SWAP_PATH_MAX instead of PATH_MAX) for SWAP_ON.
613 		 * This was completely pointless because if anyone
614 		 * used that extra byte namei would fail with
615 		 * ENAMETOOLONG anyway, so I've removed the excess
616 		 * logic. - dholland 20100215
617 		 */
618 
619 		error = pathbuf_copyin(SCARG(uap, arg), &pb);
620 		if (error) {
621 			goto out;
622 		}
623 		if (SCARG(uap, cmd) == SWAP_ON) {
624 			/* get a copy of the string */
625 			pathbuf_copystring(pb, userpath, SWAP_PATH_MAX);
626 			len = strlen(userpath) + 1;
627 		}
628 		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
629 		if ((error = namei(&nd))) {
630 			pathbuf_destroy(pb);
631 			goto out;
632 		}
633 		vp = nd.ni_vp;
634 		pathbuf_destroy(pb);
635 	}
636 	/* note: "vp" is referenced and locked */
637 
638 	error = 0;		/* assume no error */
639 	switch(SCARG(uap, cmd)) {
640 
641 	case SWAP_DUMPDEV:
642 		if (vp->v_type != VBLK) {
643 			error = ENOTBLK;
644 			break;
645 		}
646 		if (bdevsw_lookup(vp->v_rdev)) {
647 			dumpdev = vp->v_rdev;
648 			dumpcdev = devsw_blk2chr(dumpdev);
649 		} else
650 			dumpdev = NODEV;
651 		cpu_dumpconf();
652 		break;
653 
654 	case SWAP_CTL:
655 		/*
656 		 * get new priority, remove old entry (if any) and then
657 		 * reinsert it in the correct place.  finally, prune out
658 		 * any empty priority structures.
659 		 */
660 		priority = SCARG(uap, misc);
661 		spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
662 		mutex_enter(&uvm_swap_data_lock);
663 		if ((sdp = swaplist_find(vp, true)) == NULL) {
664 			error = ENOENT;
665 		} else {
666 			swaplist_insert(sdp, spp, priority);
667 			swaplist_trim();
668 		}
669 		mutex_exit(&uvm_swap_data_lock);
670 		if (error)
671 			kmem_free(spp, sizeof(*spp));
672 		break;
673 
674 	case SWAP_ON:
675 
676 		/*
677 		 * check for duplicates.   if none found, then insert a
678 		 * dummy entry on the list to prevent someone else from
679 		 * trying to enable this device while we are working on
680 		 * it.
681 		 */
682 
683 		priority = SCARG(uap, misc);
684 		sdp = kmem_zalloc(sizeof(*sdp), KM_SLEEP);
685 		spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
686 		sdp->swd_flags = SWF_FAKE;
687 		sdp->swd_vp = vp;
688 		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
689 		bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK);
690 		mutex_enter(&uvm_swap_data_lock);
691 		if (swaplist_find(vp, false) != NULL) {
692 			error = EBUSY;
693 			mutex_exit(&uvm_swap_data_lock);
694 			bufq_free(sdp->swd_tab);
695 			kmem_free(sdp, sizeof(*sdp));
696 			kmem_free(spp, sizeof(*spp));
697 			break;
698 		}
699 		swaplist_insert(sdp, spp, priority);
700 		mutex_exit(&uvm_swap_data_lock);
701 
702 		KASSERT(len > 0);
703 		sdp->swd_pathlen = len;
704 		sdp->swd_path = kmem_alloc(len, KM_SLEEP);
705 		if (copystr(userpath, sdp->swd_path, len, 0) != 0)
706 			panic("swapctl: copystr");
707 
708 		/*
709 		 * we've now got a FAKE placeholder in the swap list.
710 		 * now attempt to enable swap on it.  if we fail, undo
711 		 * what we've done and kill the fake entry we just inserted.
712 		 * if swap_on is a success, it will clear the SWF_FAKE flag
713 		 */
714 
715 		if ((error = swap_on(l, sdp)) != 0) {
716 			mutex_enter(&uvm_swap_data_lock);
717 			(void) swaplist_find(vp, true);  /* kill fake entry */
718 			swaplist_trim();
719 			mutex_exit(&uvm_swap_data_lock);
720 			bufq_free(sdp->swd_tab);
721 			kmem_free(sdp->swd_path, sdp->swd_pathlen);
722 			kmem_free(sdp, sizeof(*sdp));
723 			break;
724 		}
725 		break;
726 
727 	case SWAP_OFF:
728 		mutex_enter(&uvm_swap_data_lock);
729 		if ((sdp = swaplist_find(vp, false)) == NULL) {
730 			mutex_exit(&uvm_swap_data_lock);
731 			error = ENXIO;
732 			break;
733 		}
734 
735 		/*
736 		 * If a device isn't in use or enabled, we
737 		 * can't stop swapping from it (again).
738 		 */
739 		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
740 			mutex_exit(&uvm_swap_data_lock);
741 			error = EBUSY;
742 			break;
743 		}
744 
745 		/*
746 		 * do the real work.
747 		 */
748 		error = swap_off(l, sdp);
749 		break;
750 
751 	default:
752 		error = EINVAL;
753 	}
754 
755 	/*
756 	 * done!  release the ref gained by namei() and unlock.
757 	 */
758 	vput(vp);
759 out:
760 	rw_exit(&swap_syscall_lock);
761 	kmem_free(userpath, SWAP_PATH_MAX);
762 
763 	UVMHIST_LOG(pdhist, "<- done!  error=%jd", error, 0, 0, 0);
764 	return (error);
765 }
766 
767 /*
768  * uvm_swap_stats: implements swapctl(SWAP_STATS). The function is kept
769  * away from sys_swapctl() in order to allow COMPAT_* swapctl()
770  * emulation to use it directly without going through sys_swapctl().
771  * The problem with using sys_swapctl() there is that it involves
772  * copying the swapent array to the stackgap, and this array's size
773  * is not known at build time. Hence it would not be possible to
774  * ensure it would fit in the stackgap in any case.
775  */
776 int
777 uvm_swap_stats(char *ptr, int misc,
778     void (*f)(void *, const struct swapent *), size_t len,
779     register_t *retval)
780 {
781 	struct swappri *spp;
782 	struct swapdev *sdp;
783 	struct swapent sep;
784 	int count = 0;
785 	int error;
786 
787 	KASSERT(len <= sizeof(sep));
788 	if (len == 0)
789 		return ENOSYS;
790 
791 	if (misc < 0)
792 		return EINVAL;
793 
794 	if (misc == 0 || uvmexp.nswapdev == 0)
795 		return 0;
796 
797 	/* Make sure userland cannot exhaust kernel memory */
798 	if ((size_t)misc > (size_t)uvmexp.nswapdev)
799 		misc = uvmexp.nswapdev;
800 
801 	KASSERT(rw_lock_held(&swap_syscall_lock));
802 
803 	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
804 		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
805 			int inuse;
806 
807 			if (misc-- <= 0)
808 				break;
809 
810 			inuse = btodb((uint64_t)sdp->swd_npginuse <<
811 			    PAGE_SHIFT);
812 
813 			memset(&sep, 0, sizeof(sep));
814 			swapent_cvt(&sep, sdp, inuse);
815 			if (f)
816 				(*f)(&sep, &sep);
817 			if ((error = copyout(&sep, ptr, len)) != 0)
818 				return error;
819 			ptr += len;
820 			count++;
821 		}
822 	}
823 	*retval = count;
824 	return 0;
825 }
826 
827 /*
828  * swap_on: attempt to enable a swapdev for swapping.   note that the
829  *	swapdev is already on the global list, but disabled (marked
830  *	SWF_FAKE).
831  *
832  * => we avoid the start of the disk (to protect disk labels)
833  * => we also avoid the miniroot, if we are swapping to root.
834  * => caller should leave uvm_swap_data_lock unlocked, we may lock it
835  *	if needed.
836  */
837 static int
838 swap_on(struct lwp *l, struct swapdev *sdp)
839 {
840 	struct vnode *vp;
841 	int error, npages, nblocks, size;
842 	long addr;
843 	vmem_addr_t result;
844 	struct vattr va;
845 	dev_t dev;
846 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
847 
848 	/*
849 	 * we want to enable swapping on sdp.   the swd_vp contains
850 	 * the vnode we want (locked and ref'd), and the swd_dev
851 	 * contains the dev_t of the file, if it is a block device.
852 	 */
853 
854 	vp = sdp->swd_vp;
855 	dev = sdp->swd_dev;
856 
857 	/*
858 	 * open the swap file (mostly useful for block device files to
859 	 * let device driver know what is up).
860 	 *
861 	 * we skip the open/close for root on swap because the root
862 	 * has already been opened when root was mounted (mountroot).
863 	 */
864 	if (vp != rootvp) {
865 		if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred)))
866 			return (error);
867 	}
868 
869 	/* XXX this only works for block devices */
870 	UVMHIST_LOG(pdhist, "  dev=%jd, major(dev)=%jd", dev, major(dev), 0, 0);
871 
872 	/*
873 	 * we now need to determine the size of the swap area.   for
874 	 * block specials we can call the d_psize function.
875 	 * for normal files, we must stat [get attrs].
876 	 *
877 	 * we put the result in nblks.
878 	 * for normal files, we also want the filesystem block size
879 	 * (which we get with statfs).
880 	 */
881 	switch (vp->v_type) {
882 	case VBLK:
883 		if ((nblocks = bdev_size(dev)) == -1) {
884 			error = ENXIO;
885 			goto bad;
886 		}
887 		break;
888 
889 	case VREG:
890 		if ((error = VOP_GETATTR(vp, &va, l->l_cred)))
891 			goto bad;
892 		nblocks = (int)btodb(va.va_size);
893 		sdp->swd_bsize = 1 << vp->v_mount->mnt_fs_bshift;
894 		/*
895 		 * limit the max # of outstanding I/O requests we issue
896 		 * at any one time.   take it easy on NFS servers.
897 		 */
898 		if (vp->v_tag == VT_NFS)
899 			sdp->swd_maxactive = 2; /* XXX */
900 		else
901 			sdp->swd_maxactive = 8; /* XXX */
902 		break;
903 
904 	default:
905 		error = ENXIO;
906 		goto bad;
907 	}
908 
909 	/*
910 	 * save nblocks in a safe place and convert to pages.
911 	 */
912 
913 	sdp->swd_nblks = nblocks;
914 	npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT;
915 
916 	/*
917 	 * for block special files, we want to make sure that we leave
918 	 * the disklabel and bootblocks alone, so we arrange to skip
919 	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
920 	 * note that because of this the "size" can be less than the
921 	 * actual number of blocks on the device.
922 	 */
923 	if (vp->v_type == VBLK) {
924 		/* we use pages 1 to (size - 1) [inclusive] */
925 		size = npages - 1;
926 		addr = 1;
927 	} else {
928 		/* we use pages 0 to (size - 1) [inclusive] */
929 		size = npages;
930 		addr = 0;
931 	}
932 
933 	/*
934 	 * make sure we have enough blocks for a reasonably sized swap
935 	 * area.   we want at least one page.
936 	 */
937 
938 	if (size < 1) {
939 		UVMHIST_LOG(pdhist, "  size <= 1!!", 0, 0, 0, 0);
940 		error = EINVAL;
941 		goto bad;
942 	}
943 
944 	UVMHIST_LOG(pdhist, "  dev=%#jx: size=%jd addr=%jd", dev, size, addr, 0);
945 
946 	/*
947 	 * now we need to allocate an extent to manage this swap device
948 	 */
949 
950 	sdp->swd_blist = blist_create(npages);
951 	/* mark all except the `saved' region free. */
952 	blist_free(sdp->swd_blist, addr, size);
953 
954 	/*
955 	 * allocate space for swap encryption state and mark the
956 	 * keys uninitialized so we generate them lazily
957 	 */
958 	sdp->swd_encmap = kmem_zalloc(encmap_size(npages), KM_SLEEP);
959 	sdp->swd_encinit = false;
960 
961 	/*
962 	 * if the vnode we are swapping to is the root vnode
963 	 * (i.e. we are swapping to the miniroot) then we want
964 	 * to make sure we don't overwrite it.   do a statfs to
965 	 * find its size and skip over it.
966 	 */
967 	if (vp == rootvp) {
968 		struct mount *mp;
969 		struct statvfs *sp;
970 		int rootblocks, rootpages;
971 
972 		mp = rootvnode->v_mount;
973 		sp = &mp->mnt_stat;
974 		rootblocks = sp->f_blocks * btodb(sp->f_frsize);
975 		/*
976 		 * XXX: sp->f_blocks isn't the total number of
977 		 * blocks in the filesystem, it's the number of
978 		 * data blocks.  so, our rootblocks almost
979 		 * definitely underestimates the total size
980 		 * of the filesystem - how badly depends on the
981 		 * details of the filesystem type.  there isn't
982 		 * an obvious way to deal with this cleanly
983 		 * and perfectly, so for now we just pad our
984 		 * rootblocks estimate with an extra 5 percent.
985 		 */
986 		rootblocks += (rootblocks >> 5) +
987 			(rootblocks >> 6) +
988 			(rootblocks >> 7);
989 		rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
990 		if (rootpages > size)
991 			panic("swap_on: miniroot larger than swap?");
992 
993 		if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) {
994 			panic("swap_on: unable to preserve miniroot");
995 		}
996 
997 		size -= rootpages;
998 		printf("Preserved %d pages of miniroot ", rootpages);
999 		printf("leaving %d pages of swap\n", size);
1000 	}
1001 
1002 	/*
1003 	 * add a ref to vp to reflect usage as a swap device.
1004 	 */
1005 	vref(vp);
1006 
1007 	/*
1008 	 * now add the new swapdev to the drum and enable.
1009 	 */
1010 	error = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP, &result);
1011 	if (error != 0)
1012 		panic("swapdrum_add");
1013 	/*
1014 	 * If this is the first regular swap create the workqueue.
1015 	 * => Protected by swap_syscall_lock.
1016 	 */
1017 	if (vp->v_type != VBLK) {
1018 		if (sw_reg_count++ == 0) {
1019 			KASSERT(sw_reg_workqueue == NULL);
1020 			if (workqueue_create(&sw_reg_workqueue, "swapiod",
1021 			    sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0)
1022 				panic("%s: workqueue_create failed", __func__);
1023 		}
1024 	}
1025 
1026 	sdp->swd_drumoffset = (int)result;
1027 	sdp->swd_drumsize = npages;
1028 	sdp->swd_npages = size;
1029 	mutex_enter(&uvm_swap_data_lock);
1030 	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
1031 	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
1032 	uvmexp.swpages += size;
1033 	uvmexp.swpgavail += size;
1034 	mutex_exit(&uvm_swap_data_lock);
1035 	return (0);
1036 
1037 	/*
1038 	 * failure: clean up and return error.
1039 	 */
1040 
1041 bad:
1042 	if (sdp->swd_blist) {
1043 		blist_destroy(sdp->swd_blist);
1044 	}
1045 	if (vp != rootvp) {
1046 		(void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred);
1047 	}
1048 	return (error);
1049 }
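
To make the size computation in swap_on() concrete: a hypothetical 1 GiB block-device partition has 2097152 512-byte disk blocks, which convert to 262144 4 KiB pages; page 0 is withheld to protect the disklabel and bootblocks, leaving 262143 usable pages.  A standalone sketch of that arithmetic (DEV_BSIZE and PAGE_SHIFT hard-coded here purely for illustration):

/* userland sketch of the swap_on() size computation */
#include <stdio.h>

#define DEV_BSIZE	512
#define PAGE_SHIFT	12	/* 4 KiB pages, as on many ports */

int
main(void)
{
	long long nblocks = (1LL << 30) / DEV_BSIZE;	/* 1 GiB partition */
	long long npages = (nblocks * DEV_BSIZE) >> PAGE_SHIFT;
	long long usable = npages - 1;		/* VBLK: skip page 0 */

	printf("%lld blocks -> %lld pages, %lld usable (pages 1..%lld)\n",
	    nblocks, npages, usable, npages - 1);
	return 0;
}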
1050 
1051 /*
1052  * swap_off: stop swapping on swapdev
1053  *
1054  * => swap data should be locked, we will unlock.
1055  */
1056 static int
1057 swap_off(struct lwp *l, struct swapdev *sdp)
1058 {
1059 	int npages = sdp->swd_npages;
1060 	int error = 0;
1061 
1062 	UVMHIST_FUNC(__func__);
1063 	UVMHIST_CALLARGS(pdhist, "  dev=%#jx, npages=%jd", sdp->swd_dev,npages, 0, 0);
1064 
1065 	KASSERT(rw_write_held(&swap_syscall_lock));
1066 	KASSERT(mutex_owned(&uvm_swap_data_lock));
1067 
1068 	/* disable the swap area being removed */
1069 	sdp->swd_flags &= ~SWF_ENABLE;
1070 	uvmexp.swpgavail -= npages;
1071 	mutex_exit(&uvm_swap_data_lock);
1072 
1073 	/*
1074 	 * the idea is to find all the pages that are paged out to this
1075 	 * device, and page them all in.  in uvm, swap-backed pageable
1076 	 * memory can take two forms: aobjs and anons.  call the
1077 	 * swapoff hook for each subsystem to bring in pages.
1078 	 */
1079 
1080 	if (uao_swap_off(sdp->swd_drumoffset,
1081 			 sdp->swd_drumoffset + sdp->swd_drumsize) ||
1082 	    amap_swap_off(sdp->swd_drumoffset,
1083 			  sdp->swd_drumoffset + sdp->swd_drumsize)) {
1084 		error = ENOMEM;
1085 	} else if (sdp->swd_npginuse > sdp->swd_npgbad) {
1086 		error = EBUSY;
1087 	}
1088 
1089 	if (error) {
1090 		mutex_enter(&uvm_swap_data_lock);
1091 		sdp->swd_flags |= SWF_ENABLE;
1092 		uvmexp.swpgavail += npages;
1093 		mutex_exit(&uvm_swap_data_lock);
1094 
1095 		return error;
1096 	}
1097 
1098 	/*
1099 	 * If this is the last regular swap destroy the workqueue.
1100 	 * => Protected by swap_syscall_lock.
1101 	 */
1102 	if (sdp->swd_vp->v_type != VBLK) {
1103 		KASSERT(sw_reg_count > 0);
1104 		KASSERT(sw_reg_workqueue != NULL);
1105 		if (--sw_reg_count == 0) {
1106 			workqueue_destroy(sw_reg_workqueue);
1107 			sw_reg_workqueue = NULL;
1108 		}
1109 	}
1110 
1111 	/*
1112 	 * done with the vnode.
1113 	 * drop our ref on the vnode before calling VOP_CLOSE()
1114 	 * so that spec_close() can tell if this is the last close.
1115 	 */
1116 	vrele(sdp->swd_vp);
1117 	if (sdp->swd_vp != rootvp) {
1118 		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred);
1119 	}
1120 
1121 	mutex_enter(&uvm_swap_data_lock);
1122 	uvmexp.swpages -= npages;
1123 	uvmexp.swpginuse -= sdp->swd_npgbad;
1124 
1125 	if (swaplist_find(sdp->swd_vp, true) == NULL)
1126 		panic("%s: swapdev not in list", __func__);
1127 	swaplist_trim();
1128 	mutex_exit(&uvm_swap_data_lock);
1129 
1130 	/*
1131 	 * free all resources!
1132 	 */
1133 	vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize);
1134 	blist_destroy(sdp->swd_blist);
1135 	bufq_free(sdp->swd_tab);
1136 	kmem_free(__UNVOLATILE(sdp->swd_encmap),
1137 	    encmap_size(sdp->swd_drumsize));
1138 	explicit_memset(&sdp->swd_enckey, 0, sizeof sdp->swd_enckey);
1139 	explicit_memset(&sdp->swd_deckey, 0, sizeof sdp->swd_deckey);
1140 	kmem_free(sdp, sizeof(*sdp));
1141 	return (0);
1142 }
1143 
1144 void
1145 uvm_swap_shutdown(struct lwp *l)
1146 {
1147 	struct swapdev *sdp;
1148 	struct swappri *spp;
1149 	struct vnode *vp;
1150 	int error;
1151 
1152 	if (!uvm_swap_init_done || uvmexp.nswapdev == 0)
1153 		return;
1154 	printf("turning off swap...");
1155 	rw_enter(&swap_syscall_lock, RW_WRITER);
1156 	mutex_enter(&uvm_swap_data_lock);
1157 again:
1158 	LIST_FOREACH(spp, &swap_priority, spi_swappri)
1159 		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
1160 			if (sdp->swd_flags & SWF_FAKE)
1161 				continue;
1162 			if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0)
1163 				continue;
1164 #ifdef DEBUG
1165 			printf("\nturning off swap on %s...", sdp->swd_path);
1166 #endif
1167 			/* Have to lock and reference vnode for swap_off(). */
1168 			vn_lock(vp = sdp->swd_vp, LK_EXCLUSIVE|LK_RETRY);
1169 			vref(vp);
1170 			error = swap_off(l, sdp);
1171 			vput(vp);
1172 			mutex_enter(&uvm_swap_data_lock);
1173 			if (error) {
1174 				printf("stopping swap on %s failed "
1175 				    "with error %d\n", sdp->swd_path, error);
1176 				TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
1177 				uvmexp.nswapdev--;
1178 				swaplist_trim();
1179 			}
1180 			goto again;
1181 		}
1182 	printf(" done\n");
1183 	mutex_exit(&uvm_swap_data_lock);
1184 	rw_exit(&swap_syscall_lock);
1185 }
1186 
1187 
1188 /*
1189  * /dev/drum interface and i/o functions
1190  */
1191 
1192 /*
1193  * swopen: allow the initial open from uvm_swap_init() and reject all others.
1194  */
1195 
1196 static int
1197 swopen(dev_t dev, int flag, int mode, struct lwp *l)
1198 {
1199 	static bool inited = false;
1200 
1201 	if (!inited) {
1202 		inited = true;
1203 		return 0;
1204 	}
1205 	return ENODEV;
1206 }
1207 
1208 /*
1209  * swstrategy: perform I/O on the drum
1210  *
1211  * => we must map the i/o request from the drum to the correct swapdev.
1212  */
1213 static void
1214 swstrategy(struct buf *bp)
1215 {
1216 	struct swapdev *sdp;
1217 	struct vnode *vp;
1218 	int pageno, bn;
1219 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
1220 
1221 	/*
1222 	 * convert block number to swapdev.   note that swapdev can't
1223 	 * be yanked out from under us because we are holding resources
1224 	 * in it (i.e. the blocks we are doing I/O on).
1225 	 */
1226 	pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
1227 	mutex_enter(&uvm_swap_data_lock);
1228 	sdp = swapdrum_getsdp(pageno);
1229 	mutex_exit(&uvm_swap_data_lock);
1230 	if (sdp == NULL) {
1231 		bp->b_error = EINVAL;
1232 		bp->b_resid = bp->b_bcount;
1233 		biodone(bp);
1234 		UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
1235 		return;
1236 	}
1237 
1238 	/*
1239 	 * convert drum page number to block number on this swapdev.
1240 	 */
1241 
1242 	pageno -= sdp->swd_drumoffset;	/* page # on swapdev */
1243 	bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */
1244 
1245 	UVMHIST_LOG(pdhist, "  Rd/Wr (0/1) %jd: mapoff=%#jx bn=%#jx bcount=%jd",
1246 		((bp->b_flags & B_READ) == 0) ? 1 : 0,
1247 		sdp->swd_drumoffset, bn, bp->b_bcount);
1248 
1249 	/*
1250 	 * for block devices we finish up here.
1251 	 * for regular files we have to do more work which we delegate
1252 	 * to sw_reg_strategy().
1253 	 */
1254 
1255 	vp = sdp->swd_vp;		/* swapdev vnode pointer */
1256 	switch (vp->v_type) {
1257 	default:
1258 		panic("%s: vnode type 0x%x", __func__, vp->v_type);
1259 
1260 	case VBLK:
1261 
1262 		/*
1263 		 * must convert "bp" from an I/O on /dev/drum to an I/O
1264 		 * on the swapdev (sdp).
1265 		 */
1266 		bp->b_blkno = bn;		/* swapdev block number */
1267 		bp->b_dev = sdp->swd_dev;	/* swapdev dev_t */
1268 
1269 		/*
1270 		 * if we are doing a write, we have to redirect the i/o on
1271 		 * drum's v_numoutput counter to the swapdevs.
1272 		 */
1273 		if ((bp->b_flags & B_READ) == 0) {
1274 			mutex_enter(bp->b_objlock);
1275 			vwakeup(bp);	/* kills one 'v_numoutput' on drum */
1276 			mutex_exit(bp->b_objlock);
1277 			mutex_enter(vp->v_interlock);
1278 			vp->v_numoutput++;	/* put it on swapdev */
1279 			mutex_exit(vp->v_interlock);
1280 		}
1281 
1282 		/*
1283 		 * finally plug in swapdev vnode and start I/O
1284 		 */
1285 		bp->b_vp = vp;
1286 		bp->b_objlock = vp->v_interlock;
1287 		VOP_STRATEGY(vp, bp);
1288 		return;
1289 
1290 	case VREG:
1291 		/*
1292 		 * delegate to sw_reg_strategy function.
1293 		 */
1294 		sw_reg_strategy(sdp, bp, bn);
1295 		return;
1296 	}
1297 	/* NOTREACHED */
1298 }
1299 
1300 /*
1301  * swread: the read function for the drum (just a call to physio)
1302  */
1303 /*ARGSUSED*/
1304 static int
1305 swread(dev_t dev, struct uio *uio, int ioflag)
1306 {
1307 	UVMHIST_FUNC(__func__);
1308 	UVMHIST_CALLARGS(pdhist, "  dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0);
1309 
1310 	return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
1311 }
1312 
1313 /*
1314  * swwrite: the write function for the drum (just a call to physio)
1315  */
1316 /*ARGSUSED*/
1317 static int
1318 swwrite(dev_t dev, struct uio *uio, int ioflag)
1319 {
1320 	UVMHIST_FUNC(__func__);
1321 	UVMHIST_CALLARGS(pdhist, "  dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0);
1322 
1323 	return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
1324 }
1325 
1326 const struct bdevsw swap_bdevsw = {
1327 	.d_open = swopen,
1328 	.d_close = noclose,
1329 	.d_strategy = swstrategy,
1330 	.d_ioctl = noioctl,
1331 	.d_dump = nodump,
1332 	.d_psize = nosize,
1333 	.d_discard = nodiscard,
1334 	.d_flag = D_OTHER
1335 };
1336 
1337 const struct cdevsw swap_cdevsw = {
1338 	.d_open = nullopen,
1339 	.d_close = nullclose,
1340 	.d_read = swread,
1341 	.d_write = swwrite,
1342 	.d_ioctl = noioctl,
1343 	.d_stop = nostop,
1344 	.d_tty = notty,
1345 	.d_poll = nopoll,
1346 	.d_mmap = nommap,
1347 	.d_kqfilter = nokqfilter,
1348 	.d_discard = nodiscard,
1349 	.d_flag = D_OTHER,
1350 };
1351 
1352 /*
1353  * sw_reg_strategy: handle swap i/o to regular files
1354  */
1355 static void
1356 sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
1357 {
1358 	struct vnode	*vp;
1359 	struct vndxfer	*vnx;
1360 	daddr_t		nbn;
1361 	char 		*addr;
1362 	off_t		byteoff;
1363 	int		s, off, nra, error, sz, resid;
1364 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
1365 
1366 	/*
1367 	 * allocate a vndxfer head for this transfer and point it to
1368 	 * our buffer.
1369 	 */
1370 	vnx = pool_get(&vndxfer_pool, PR_WAITOK);
1371 	vnx->vx_flags = VX_BUSY;
1372 	vnx->vx_error = 0;
1373 	vnx->vx_pending = 0;
1374 	vnx->vx_bp = bp;
1375 	vnx->vx_sdp = sdp;
1376 
1377 	/*
1378 	 * setup for main loop where we read filesystem blocks into
1379 	 * our buffer.
1380 	 */
1381 	error = 0;
1382 	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
1383 	addr = bp->b_data;		/* current position in buffer */
1384 	byteoff = dbtob((uint64_t)bn);
1385 
1386 	for (resid = bp->b_resid; resid; resid -= sz) {
1387 		struct vndbuf	*nbp;
1388 
1389 		/*
1390 		 * translate byteoffset into block number.  return values:
1391 		 *   vp = vnode of underlying device
1392 		 *  nbn = new block number (on underlying vnode dev)
1393 		 *  nra = num blocks we can read-ahead (excludes requested
1394 		 *	block)
1395 		 */
1396 		nra = 0;
1397 		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
1398 				 	&vp, &nbn, &nra);
1399 
1400 		if (error == 0 && nbn == (daddr_t)-1) {
1401 			/*
1402 			 * this used to just set error, but that doesn't
1403 			 * do the right thing.  Instead, it causes random
1404 			 * memory errors.  The panic() should remain until
1405 			 * this condition doesn't destabilize the system.
1406 			 */
1407 #if 1
1408 			panic("%s: swap to sparse file", __func__);
1409 #else
1410 			error = EIO;	/* failure */
1411 #endif
1412 		}
1413 
1414 		/*
1415 		 * punt if there was an error or a hole in the file.
1416 		 * we must wait for any i/o ops we have already started
1417 		 * to finish before returning.
1418 		 *
1419 		 * XXX we could deal with holes here but it would be
1420 		 * a hassle (in the write case).
1421 		 */
1422 		if (error) {
1423 			s = splbio();
1424 			vnx->vx_error = error;	/* pass error up */
1425 			goto out;
1426 		}
1427 
1428 		/*
1429 		 * compute the size ("sz") of this transfer (in bytes).
1430 		 */
1431 		off = byteoff % sdp->swd_bsize;
1432 		sz = (1 + nra) * sdp->swd_bsize - off;
1433 		if (sz > resid)
1434 			sz = resid;
1435 
1436 		UVMHIST_LOG(pdhist, "sw_reg_strategy: "
1437 		    "vp %#jx/%#jx offset %#jx/%#jx",
1438 		    (uintptr_t)sdp->swd_vp, (uintptr_t)vp, byteoff, nbn);
1439 
1440 		/*
1441 		 * now get a buf structure.   note that the vb_buf is
1442 		 * at the front of the nbp structure so that you can
1443 		 * cast pointers between the two structures easily.
1444 		 */
1445 		nbp = pool_get(&vndbuf_pool, PR_WAITOK);
1446 		buf_init(&nbp->vb_buf);
1447 		nbp->vb_buf.b_flags    = bp->b_flags;
1448 		nbp->vb_buf.b_cflags   = bp->b_cflags;
1449 		nbp->vb_buf.b_oflags   = bp->b_oflags;
1450 		nbp->vb_buf.b_bcount   = sz;
1451 		nbp->vb_buf.b_bufsize  = sz;
1452 		nbp->vb_buf.b_error    = 0;
1453 		nbp->vb_buf.b_data     = addr;
1454 		nbp->vb_buf.b_lblkno   = 0;
1455 		nbp->vb_buf.b_blkno    = nbn + btodb(off);
1456 		nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
1457 		nbp->vb_buf.b_iodone   = sw_reg_biodone;
1458 		nbp->vb_buf.b_vp       = vp;
1459 		nbp->vb_buf.b_objlock  = vp->v_interlock;
1460 		if (vp->v_type == VBLK) {
1461 			nbp->vb_buf.b_dev = vp->v_rdev;
1462 		}
1463 
1464 		nbp->vb_xfer = vnx;	/* patch it back in to vnx */
1465 
1466 		/*
1467 		 * Just sort by block number
1468 		 */
1469 		s = splbio();
1470 		if (vnx->vx_error != 0) {
1471 			buf_destroy(&nbp->vb_buf);
1472 			pool_put(&vndbuf_pool, nbp);
1473 			goto out;
1474 		}
1475 		vnx->vx_pending++;
1476 
1477 		/* sort it in and start I/O if we are not over our limit */
1478 		/* XXXAD locking */
1479 		bufq_put(sdp->swd_tab, &nbp->vb_buf);
1480 		sw_reg_start(sdp);
1481 		splx(s);
1482 
1483 		/*
1484 		 * advance to the next I/O
1485 		 */
1486 		byteoff += sz;
1487 		addr += sz;
1488 	}
1489 
1490 	s = splbio();
1491 
1492 out: /* Arrive here at splbio */
1493 	vnx->vx_flags &= ~VX_BUSY;
1494 	if (vnx->vx_pending == 0) {
1495 		error = vnx->vx_error;
1496 		pool_put(&vndxfer_pool, vnx);
1497 		bp->b_error = error;
1498 		biodone(bp);
1499 	}
1500 	splx(s);
1501 }
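
The chunking performed by the loop above can be shown in isolation: each pass transfers up to the end of the current filesystem block plus any read-ahead blocks reported by VOP_BMAP, so off = byteoff % swd_bsize and sz = (1 + nra) * swd_bsize - off, capped at the bytes still outstanding.  A standalone sketch with invented numbers and nra fixed at 0:

/* userland sketch of sw_reg_strategy()'s transfer chunking */
#include <stdio.h>

int
main(void)
{
	const long bsize = 8192;	/* swd_bsize, invented */
	long byteoff = 20480;		/* starting byte offset, invented */
	long resid = 16384;		/* bytes left in the request */
	int nra = 0;			/* no read-ahead blocks */

	while (resid > 0) {
		long off = byteoff % bsize;
		long sz = (1 + nra) * bsize - off;
		if (sz > resid)
			sz = resid;
		printf("chunk: file offset %ld, %ld bytes\n", byteoff, sz);
		byteoff += sz;
		resid -= sz;
	}
	return 0;
}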
1502 
1503 /*
1504  * sw_reg_start: start an I/O request on the requested swapdev
1505  *
1506  * => reqs are sorted by b_rawblkno (above)
1507  */
1508 static void
1509 sw_reg_start(struct swapdev *sdp)
1510 {
1511 	struct buf	*bp;
1512 	struct vnode	*vp;
1513 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
1514 
1515 	/* recursion control */
1516 	if ((sdp->swd_flags & SWF_BUSY) != 0)
1517 		return;
1518 
1519 	sdp->swd_flags |= SWF_BUSY;
1520 
1521 	while (sdp->swd_active < sdp->swd_maxactive) {
1522 		bp = bufq_get(sdp->swd_tab);
1523 		if (bp == NULL)
1524 			break;
1525 		sdp->swd_active++;
1526 
1527 		UVMHIST_LOG(pdhist,
1528 		    "sw_reg_start:  bp %#jx vp %#jx blkno %#jx cnt %#jx",
1529 		    (uintptr_t)bp, (uintptr_t)bp->b_vp, (uintptr_t)bp->b_blkno,
1530 		    bp->b_bcount);
1531 		vp = bp->b_vp;
1532 		KASSERT(bp->b_objlock == vp->v_interlock);
1533 		if ((bp->b_flags & B_READ) == 0) {
1534 			mutex_enter(vp->v_interlock);
1535 			vp->v_numoutput++;
1536 			mutex_exit(vp->v_interlock);
1537 		}
1538 		VOP_STRATEGY(vp, bp);
1539 	}
1540 	sdp->swd_flags &= ~SWF_BUSY;
1541 }
1542 
1543 /*
1544  * sw_reg_biodone: one of our i/o's has completed
1545  */
1546 static void
1547 sw_reg_biodone(struct buf *bp)
1548 {
1549 	workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL);
1550 }
1551 
1552 /*
1553  * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
1554  *
1555  * => note that we can recover the vndbuf struct by casting the buf ptr
1556  */
1557 static void
1558 sw_reg_iodone(struct work *wk, void *dummy)
1559 {
1560 	struct vndbuf *vbp = (void *)wk;
1561 	struct vndxfer *vnx = vbp->vb_xfer;
1562 	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
1563 	struct swapdev	*sdp = vnx->vx_sdp;
1564 	int s, resid, error;
1565 	KASSERT(&vbp->vb_buf.b_work == wk);
1566 	UVMHIST_FUNC(__func__);
1567 	UVMHIST_CALLARGS(pdhist, "  vbp=%#jx vp=%#jx blkno=%#jx addr=%#jx",
1568 	    (uintptr_t)vbp, (uintptr_t)vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno,
1569 	    (uintptr_t)vbp->vb_buf.b_data);
1570 	UVMHIST_LOG(pdhist, "  cnt=%#jx resid=%#jx",
1571 	    vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);
1572 
1573 	/*
1574 	 * protect vbp at splbio and update.
1575 	 */
1576 
1577 	s = splbio();
1578 	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
1579 	pbp->b_resid -= resid;
1580 	vnx->vx_pending--;
1581 
1582 	if (vbp->vb_buf.b_error != 0) {
1583 		/* pass error upward */
1584 		error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO;
1585 		UVMHIST_LOG(pdhist, "  got error=%jd !", error, 0, 0, 0);
1586 		vnx->vx_error = error;
1587 	}
1588 
1589 	/*
1590 	 * kill vbp structure
1591 	 */
1592 	buf_destroy(&vbp->vb_buf);
1593 	pool_put(&vndbuf_pool, vbp);
1594 
1595 	/*
1596 	 * wrap up this transaction if it has run to completion or, in
1597 	 * case of an error, when all auxiliary buffers have returned.
1598 	 */
1599 	if (vnx->vx_error != 0) {
1600 		/* pass error upward */
1601 		error = vnx->vx_error;
1602 		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
1603 			pbp->b_error = error;
1604 			biodone(pbp);
1605 			pool_put(&vndxfer_pool, vnx);
1606 		}
1607 	} else if (pbp->b_resid == 0) {
1608 		KASSERT(vnx->vx_pending == 0);
1609 		if ((vnx->vx_flags & VX_BUSY) == 0) {
1610 			UVMHIST_LOG(pdhist, "  iodone, pbp=%#jx error=%jd !",
1611 			    (uintptr_t)pbp, vnx->vx_error, 0, 0);
1612 			biodone(pbp);
1613 			pool_put(&vndxfer_pool, vnx);
1614 		}
1615 	}
1616 
1617 	/*
1618 	 * done!   start next swapdev I/O if one is pending
1619 	 */
1620 	sdp->swd_active--;
1621 	sw_reg_start(sdp);
1622 	splx(s);
1623 }
1624 
1625 
1626 /*
1627  * uvm_swap_alloc: allocate space on swap
1628  *
1629  * => allocation is done "round robin" down the priority list, as we
1630  *	allocate in a priority we "rotate" the circle queue.
1631  * => space can be freed with uvm_swap_free
1632  * => we return the page slot number in /dev/drum (0 == invalid slot)
1633  * => we lock uvm_swap_data_lock
1634  * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
1635  */
1636 int
1637 uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok)
1638 {
1639 	struct swapdev *sdp;
1640 	struct swappri *spp;
1641 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
1642 
1643 	/*
1644 	 * no swap devices configured yet?   definite failure.
1645 	 */
1646 	if (uvmexp.nswapdev < 1)
1647 		return 0;
1648 
1649 	/*
1650 	 * XXXJAK: BEGIN HACK
1651 	 *
1652 	 * blist_alloc() in subr_blist.c will panic if we try to allocate
1653 	 * too many slots.
1654 	 */
1655 	if (*nslots > BLIST_MAX_ALLOC) {
1656 		if (__predict_false(lessok == false))
1657 			return 0;
1658 		*nslots = BLIST_MAX_ALLOC;
1659 	}
1660 	/* XXXJAK: END HACK */
1661 
1662 	/*
1663 	 * lock data lock, convert slots into blocks, and enter loop
1664 	 */
1665 	mutex_enter(&uvm_swap_data_lock);
1666 
1667 ReTry:	/* XXXMRG */
1668 	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
1669 		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
1670 			uint64_t result;
1671 
1672 			/* if it's not enabled, then we can't swap from it */
1673 			if ((sdp->swd_flags & SWF_ENABLE) == 0)
1674 				continue;
1675 			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
1676 				continue;
1677 			result = blist_alloc(sdp->swd_blist, *nslots);
1678 			if (result == BLIST_NONE) {
1679 				continue;
1680 			}
1681 			KASSERT(result < sdp->swd_drumsize);
1682 
1683 			/*
1684 			 * successful allocation!  now rotate the tailq.
1685 			 */
1686 			TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
1687 			TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
1688 			sdp->swd_npginuse += *nslots;
1689 			uvmexp.swpginuse += *nslots;
1690 			mutex_exit(&uvm_swap_data_lock);
1691 			/* done!  return drum slot number */
1692 			UVMHIST_LOG(pdhist,
1693 			    "success!  returning %jd slots starting at %jd",
1694 			    *nslots, result + sdp->swd_drumoffset, 0, 0);
1695 			return (result + sdp->swd_drumoffset);
1696 		}
1697 	}
1698 
1699 	/* XXXMRG: BEGIN HACK */
1700 	if (*nslots > 1 && lessok) {
1701 		*nslots = 1;
1702 		/* XXXMRG: ugh!  blist should support this for us */
1703 		goto ReTry;
1704 	}
1705 	/* XXXMRG: END HACK */
1706 
1707 	mutex_exit(&uvm_swap_data_lock);
1708 	return 0;
1709 }
1710 
1711 /*
1712  * uvm_swapisfull: return true if most of available swap is allocated
1713  * and in use.  we don't count some small portion as it may be inaccessible
1714  * to us at any given moment, for example if there is lock contention or if
1715  * pages are busy.
1716  */
1717 bool
1718 uvm_swapisfull(void)
1719 {
1720 	int swpgonly;
1721 	bool rv;
1722 
1723 	if (uvmexp.swpages == 0) {
1724 		return true;
1725 	}
1726 
1727 	mutex_enter(&uvm_swap_data_lock);
1728 	KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
1729 	swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 /
1730 	    uvm_swapisfull_factor);
1731 	rv = (swpgonly >= uvmexp.swpgavail);
1732 	mutex_exit(&uvm_swap_data_lock);
1733 
1734 	return (rv);
1735 }
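
With the default uvm_swapisfull_factor of 99, the computation above reports swap as full once swpgonly * 100 / 99 reaches swpgavail, i.e. once roughly 99% of the available swap pages hold the only copy of their data.  A standalone sketch of the threshold with invented page counts:

/* userland sketch of the uvm_swapisfull() threshold */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

int
main(void)
{
	const unsigned factor = 99;		/* uvm_swapisfull_factor */
	const uint64_t swpgavail = 262144;	/* available swap pages */
	uint64_t swpgonly;

	for (swpgonly = 259000; swpgonly <= 260000; swpgonly += 500) {
		bool full = (swpgonly * 100 / factor) >= swpgavail;
		printf("swpgonly=%llu -> %s\n",
		    (unsigned long long)swpgonly, full ? "full" : "not full");
	}
	return 0;
}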
1736 
1737 /*
1738  * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
1739  *
1740  * => we lock uvm_swap_data_lock
1741  */
1742 void
1743 uvm_swap_markbad(int startslot, int nslots)
1744 {
1745 	struct swapdev *sdp;
1746 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
1747 
1748 	mutex_enter(&uvm_swap_data_lock);
1749 	sdp = swapdrum_getsdp(startslot);
1750 	KASSERT(sdp != NULL);
1751 
1752 	/*
1753 	 * we just keep track of how many pages have been marked bad
1754 	 * in this device, to make everything add up in swap_off().
1755 	 * we assume here that the range of slots will all be within
1756 	 * one swap device.
1757 	 */
1758 
1759 	KASSERT(uvmexp.swpgonly >= nslots);
1760 	atomic_add_int(&uvmexp.swpgonly, -nslots);
1761 	sdp->swd_npgbad += nslots;
1762 	UVMHIST_LOG(pdhist, "now %jd bad", sdp->swd_npgbad, 0,0,0);
1763 	mutex_exit(&uvm_swap_data_lock);
1764 }
1765 
1766 /*
1767  * uvm_swap_free: free swap slots
1768  *
1769  * => this can be all or part of an allocation made by uvm_swap_alloc
1770  * => we lock uvm_swap_data_lock
1771  */
1772 void
1773 uvm_swap_free(int startslot, int nslots)
1774 {
1775 	struct swapdev *sdp;
1776 	UVMHIST_FUNC(__func__);
1777 	UVMHIST_CALLARGS(pdhist, "freeing %jd slots starting at %jd", nslots,
1778 	    startslot, 0, 0);
1779 
1780 	/*
1781 	 * ignore attempts to free the "bad" slot.
1782 	 */
1783 
1784 	if (startslot == SWSLOT_BAD) {
1785 		return;
1786 	}
1787 
1788 	/*
1789 	 * convert drum slot offset back to sdp, free the blocks
1790 	 * in the extent, and return.   must hold pri lock to do
1791 	 * lookup and access the extent.
1792 	 */
1793 
1794 	mutex_enter(&uvm_swap_data_lock);
1795 	sdp = swapdrum_getsdp(startslot);
1796 	KASSERT(uvmexp.nswapdev >= 1);
1797 	KASSERT(sdp != NULL);
1798 	KASSERT(sdp->swd_npginuse >= nslots);
1799 	blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots);
1800 	sdp->swd_npginuse -= nslots;
1801 	uvmexp.swpginuse -= nslots;
1802 	mutex_exit(&uvm_swap_data_lock);
1803 }
1804 
1805 /*
1806  * uvm_swap_put: put any number of pages into a contig place on swap
1807  *
1808  * => can be sync or async
1809  */
1810 
1811 int
1812 uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
1813 {
1814 	int error;
1815 
1816 	error = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
1817 	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
1818 	return error;
1819 }
1820 
1821 /*
1822  * uvm_swap_get: get a single page from swap
1823  *
1824  * => usually a sync op (from fault)
1825  */
1826 
1827 int
1828 uvm_swap_get(struct vm_page *page, int swslot, int flags)
1829 {
1830 	int error;
1831 
1832 	atomic_inc_uint(&uvmexp.nswget);
1833 	KASSERT(flags & PGO_SYNCIO);
1834 	if (swslot == SWSLOT_BAD) {
1835 		return EIO;
1836 	}
1837 
1838 	error = uvm_swap_io(&page, swslot, 1, B_READ |
1839 	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
1840 	if (error == 0) {
1841 
1842 		/*
1843 		 * this page is no longer only in swap.
1844 		 */
1845 
1846 		KASSERT(uvmexp.swpgonly > 0);
1847 		atomic_dec_uint(&uvmexp.swpgonly);
1848 	}
1849 	return error;
1850 }
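/*
 * Illustrative sketch (not part of the source): reading a page back in
 * at fault time.  uvm_swap_get() is synchronous only, so PGO_SYNCIO is
 * mandatory; on success the page is no longer "swap only" and
 * uvmexp.swpgonly has already been decremented by uvm_swap_get() itself.
 * The helper name is hypothetical.
 */
static int
example_swap_in_one(struct vm_page *pg, int slot)
{
	/* slot == SWSLOT_BAD is handled inside and returns EIO. */
	return uvm_swap_get(pg, slot, PGO_SYNCIO);
}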
1851 
1852 /*
1853  * uvm_swap_io: do an i/o operation to swap
1854  */
1855 
1856 static int
1857 uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
1858 {
1859 	daddr_t startblk;
1860 	struct	buf *bp;
1861 	vaddr_t kva;
1862 	int	error, mapinflags;
1863 	bool write, async, swap_encrypt;
1864 	UVMHIST_FUNC(__func__);
1865 	UVMHIST_CALLARGS(pdhist, "<- called, startslot=%jd, npages=%jd, flags=%#jx",
1866 	    startslot, npages, flags, 0);
1867 
1868 	write = (flags & B_READ) == 0;
1869 	async = (flags & B_ASYNC) != 0;
1870 	swap_encrypt = atomic_load_relaxed(&uvm_swap_encrypt);
1871 
1872 	/*
1873 	 * allocate a buf for the i/o.
1874 	 */
1875 
1876 	KASSERT(curlwp != uvm.pagedaemon_lwp || write);
1877 	KASSERT(curlwp != uvm.pagedaemon_lwp || async);
1878 	bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp);
1879 	if (bp == NULL) {
1880 		uvm_aio_aiodone_pages(pps, npages, true, ENOMEM);
1881 		return ENOMEM;
1882 	}
1883 
1884 	/*
1885 	 * convert starting drum slot to block number
1886 	 */
1887 
1888 	startblk = btodb((uint64_t)startslot << PAGE_SHIFT);
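	/*
	 * Worked example (illustrative): with PAGE_SIZE = 4096 and
	 * DEV_BSIZE = 512, slot 3 becomes byte offset 3 << 12 = 12288,
	 * and btodb(12288) = 24, so the transfer starts at drum block 24.
	 */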
1889 
1890 	/*
1891 	 * first, map the pages into the kernel.
1892 	 */
1893 
1894 	mapinflags = !write ?
1895 		UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ :
1896 		UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE;
1897 	if (write && swap_encrypt)	/* need to encrypt in-place */
1898 		mapinflags |= UVMPAGER_MAPIN_READ;
1899 	kva = uvm_pagermapin(pps, npages, mapinflags);
1900 
1901 	/*
1902 	 * encrypt writes in place if requested
1903 	 */
1904 
1905 	if (write) do {
1906 		struct swapdev *sdp;
1907 		int i;
1908 
1909 		/*
1910 		 * Get the swapdev so we can discriminate on the
1911 		 * encryption state.  There may or may not be an
1912 		 * encryption key generated; we may or may not be asked
1913 		 * to encrypt swap.
1914 		 *
1915 		 * 1. NO KEY, NO ENCRYPTION: Nothing to do.
1916 		 *
1917 		 * 2. NO KEY, BUT ENCRYPTION: Generate a key, encrypt,
1918 		 *    and mark the slots encrypted.
1919 		 *
1920 		 * 3. KEY, BUT NO ENCRYPTION: The slots may already be
1921 		 *    marked encrypted from a past life.  Mark them not
1922 		 *    encrypted.
1923 		 *
1924 		 * 4. KEY, ENCRYPTION: Encrypt and mark the slots
1925 		 *    encrypted.
1926 		 */
1927 		mutex_enter(&uvm_swap_data_lock);
1928 		sdp = swapdrum_getsdp(startslot);
1929 		if (!sdp->swd_encinit) {
1930 			if (!swap_encrypt) {
1931 				mutex_exit(&uvm_swap_data_lock);
1932 				break;
1933 			}
1934 			uvm_swap_genkey(sdp);
1935 		}
1936 		KASSERT(sdp->swd_encinit);
1937 		mutex_exit(&uvm_swap_data_lock);
1938 
1939 		for (i = 0; i < npages; i++) {
1940 			int s = startslot + i;
1941 			KDASSERT(swapdrum_sdp_is(s, sdp));
1942 			KASSERT(s >= sdp->swd_drumoffset);
1943 			s -= sdp->swd_drumoffset;
1944 			KASSERT(s < sdp->swd_drumsize);
1945 
1946 			if (swap_encrypt) {
1947 				uvm_swap_encryptpage(sdp,
1948 				    (void *)(kva + (vsize_t)i*PAGE_SIZE), s);
1949 				atomic_or_32(&sdp->swd_encmap[s/32],
1950 				    __BIT(s%32));
1951 			} else {
1952 				atomic_and_32(&sdp->swd_encmap[s/32],
1953 				    ~__BIT(s%32));
1954 			}
1955 		}
1956 	} while (0);
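	/*
	 * Illustration (not part of the source): swd_encmap above holds
	 * one bit per device-local slot, 32 slots per uint32_t word.
	 * E.g. device-local slot s = 70 is tracked by bit 70 % 32 = 6
	 * of word swd_encmap[70 / 32] = swd_encmap[2].
	 */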
1957 
1958 	/*
1959 	 * fill in the bp.   we currently route our i/o through
1960 	 * /dev/drum's vnode [swapdev_vp].
1961 	 */
1962 
1963 	bp->b_cflags = BC_BUSY | BC_NOCACHE;
1964 	bp->b_flags = (flags & (B_READ|B_ASYNC));
1965 	bp->b_proc = &proc0;	/* XXX */
1966 	bp->b_vnbufs.le_next = NOLIST;
1967 	bp->b_data = (void *)kva;
1968 	bp->b_blkno = startblk;
1969 	bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;
1970 
1971 	/*
1972 	 * bump v_numoutput (counter of number of active outputs).
1973 	 */
1974 
1975 	if (write) {
1976 		mutex_enter(swapdev_vp->v_interlock);
1977 		swapdev_vp->v_numoutput++;
1978 		mutex_exit(swapdev_vp->v_interlock);
1979 	}
1980 
1981 	/*
1982 	 * for async ops we must set up the iodone handler.
1983 	 */
1984 
1985 	if (async) {
1986 		bp->b_iodone = uvm_aio_aiodone;
1987 		UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
1988 		if (curlwp == uvm.pagedaemon_lwp)
1989 			BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
1990 		else
1991 			BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
1992 	} else {
1993 		bp->b_iodone = NULL;
1994 		BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
1995 	}
1996 	UVMHIST_LOG(pdhist,
1997 	    "about to start io: data = %#jx blkno = %#jx, bcount = %jd",
1998 	    (uintptr_t)bp->b_data, bp->b_blkno, bp->b_bcount, 0);
1999 
2000 	/*
2001 	 * now we start the I/O, and if async, return.
2002 	 */
2003 
2004 	VOP_STRATEGY(swapdev_vp, bp);
2005 	if (async) {
2006 		/*
2007 		 * Reads are always synchronous; if this changes, we
2008 		 * need to add an asynchronous path for decryption.
2009 		 */
2010 		KASSERT(write);
2011 		return 0;
2012 	}
2013 
2014 	/*
2015 	 * must be sync i/o.   wait for it to finish
2016 	 */
2017 
2018 	error = biowait(bp);
2019 	if (error)
2020 		goto out;
2021 
2022 	/*
2023 	 * decrypt reads in place if needed
2024 	 */
2025 
2026 	if (!write) do {
2027 		struct swapdev *sdp;
2028 		bool encinit;
2029 		int i;
2030 
2031 		/*
2032 		 * Get the sdp.  Everything about it except the encinit
2033 		 * bit, saying whether the encryption key is
2034 		 * initialized or not, and the encrypted bit for each
2035 		 * page, is stable until all swap pages have been
2036 		 * released and the device is removed.
2037 		 */
2038 		mutex_enter(&uvm_swap_data_lock);
2039 		sdp = swapdrum_getsdp(startslot);
2040 		encinit = sdp->swd_encinit;
2041 		mutex_exit(&uvm_swap_data_lock);
2042 
2043 		if (!encinit)
2044 			/*
2045 			 * If there's no encryption key, there's no way
2046 			 * any of these slots can be encrypted, so
2047 			 * nothing to do here.
2048 			 */
2049 			break;
2050 		for (i = 0; i < npages; i++) {
2051 			int s = startslot + i;
2052 			KDASSERT(swapdrum_sdp_is(s, sdp));
2053 			KASSERT(s >= sdp->swd_drumoffset);
2054 			s -= sdp->swd_drumoffset;
2055 			KASSERT(s < sdp->swd_drumsize);
2056 			if ((atomic_load_relaxed(&sdp->swd_encmap[s/32]) &
2057 				__BIT(s%32)) == 0)
2058 				continue;
2059 			uvm_swap_decryptpage(sdp,
2060 			    (void *)(kva + (vsize_t)i*PAGE_SIZE), s);
2061 		}
2062 	} while (0);
2063 out:
2064 	/*
2065 	 * kill the pager mapping
2066 	 */
2067 
2068 	uvm_pagermapout(kva, npages);
2069 
2070 	/*
2071 	 * now dispose of the buf and we're done.
2072 	 */
2073 
2074 	if (write) {
2075 		mutex_enter(swapdev_vp->v_interlock);
2076 		vwakeup(bp);
2077 		mutex_exit(swapdev_vp->v_interlock);
2078 	}
2079 	putiobuf(bp);
2080 	UVMHIST_LOG(pdhist, "<- done (sync)  error=%jd", error, 0, 0, 0);
2081 
2082 	return (error);
2083 }
2084 
2085 /*
2086  * uvm_swap_genkey(sdp)
2087  *
2088  *	Generate a key for swap encryption.
2089  */
2090 static void
2091 uvm_swap_genkey(struct swapdev *sdp)
2092 {
2093 	uint8_t key[32];
2094 
2095 	KASSERT(!sdp->swd_encinit);
2096 
2097 	cprng_strong(kern_cprng, key, sizeof key, 0);
2098 	aes_setenckey256(&sdp->swd_enckey, key);
2099 	aes_setdeckey256(&sdp->swd_deckey, key);
2100 	explicit_memset(key, 0, sizeof key);
2101 
2102 	sdp->swd_encinit = true;
2103 }
2104 
2105 /*
2106  * uvm_swap_encryptpage(sdp, kva, slot)
2107  *
2108  *	Encrypt one page of data at kva for the specified slot number
2109  *	in the swap device.
2110  */
2111 static void
2112 uvm_swap_encryptpage(struct swapdev *sdp, void *kva, int slot)
2113 {
2114 	uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16);
2115 
2116 	/* iv := AES_k(le32enc(slot) || 0^96) */
2117 	le32enc(preiv, slot);
2118 	aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS);
2119 
2120 	/* *kva := AES-CBC_k(iv, *kva) */
2121 	aes_cbc_enc(&sdp->swd_enckey, kva, kva, PAGE_SIZE, iv,
2122 	    AES_256_NROUNDS);
2123 
2124 	explicit_memset(&iv, 0, sizeof iv);
2125 }
2126 
2127 /*
2128  * uvm_swap_decryptpage(sdp, kva, slot)
2129  *
2130  *	Decrypt one page of data at kva for the specified slot number
2131  *	in the swap device.
2132  */
2133 static void
2134 uvm_swap_decryptpage(struct swapdev *sdp, void *kva, int slot)
2135 {
2136 	uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16);
2137 
2138 	/* iv := AES_k(le32enc(slot) || 0^96) */
2139 	le32enc(preiv, slot);
2140 	aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS);
2141 
2142 	/* *kva := AES-CBC^{-1}_k(iv, *kva) */
2143 	aes_cbc_dec(&sdp->swd_deckey, kva, kva, PAGE_SIZE, iv,
2144 	    AES_256_NROUNDS);
2145 
2146 	explicit_memset(&iv, 0, sizeof iv);
2147 }
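/*
 * Illustrative sketch (not part of the source): the IV depends only on
 * the slot number and the key, so encrypting and then decrypting a
 * buffer for the same slot is the identity.  A hypothetical self-test
 * could look like this:
 */
static void
example_swap_crypt_roundtrip(struct swapdev *sdp, void *buf, int slot)
{
	KASSERT(sdp->swd_encinit);

	uvm_swap_encryptpage(sdp, buf, slot);	/* buf := AES-CBC_k(iv, buf) */
	uvm_swap_decryptpage(sdp, buf, slot);	/* buf := original contents */
}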
2148 
2149 SYSCTL_SETUP(sysctl_uvmswap_setup, "sysctl uvmswap setup")
2150 {
2151 
2152 	sysctl_createv(clog, 0, NULL, NULL,
2153 	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "swap_encrypt",
2154 	    SYSCTL_DESCR("Encrypt data when swapped out to disk"),
2155 	    NULL, 0, &uvm_swap_encrypt, 0,
2156 	    CTL_VM, CTL_CREATE, CTL_EOL);
2157 }
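/*
 * Usage note (illustrative): the node created above appears as
 * vm.swap_encrypt and can be toggled at run time, e.g.
 *
 *	# sysctl -w vm.swap_encrypt=1
 *
 * Only pages written out after the toggle are affected; pages already
 * on the drum keep their recorded per-slot encryption state.
 */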
2158