1 /* $NetBSD: uvm_swap.c,v 1.208 2023/04/09 09:00:56 riastradh Exp $ */
2
3 /*
4 * Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
21 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
23 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
29 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
30 */
31
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.208 2023/04/09 09:00:56 riastradh Exp $");
34
35 #include "opt_uvmhist.h"
36 #include "opt_compat_netbsd.h"
37 #include "opt_ddb.h"
38 #include "opt_vmswap.h"
39
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/atomic.h>
43 #include <sys/buf.h>
44 #include <sys/bufq.h>
45 #include <sys/conf.h>
46 #include <sys/cprng.h>
47 #include <sys/proc.h>
48 #include <sys/namei.h>
49 #include <sys/disklabel.h>
50 #include <sys/errno.h>
51 #include <sys/kernel.h>
52 #include <sys/vnode.h>
53 #include <sys/file.h>
54 #include <sys/vmem.h>
55 #include <sys/blist.h>
56 #include <sys/mount.h>
57 #include <sys/pool.h>
58 #include <sys/kmem.h>
59 #include <sys/syscallargs.h>
60 #include <sys/swap.h>
61 #include <sys/kauth.h>
62 #include <sys/sysctl.h>
63 #include <sys/workqueue.h>
64
65 #include <uvm/uvm.h>
66
67 #include <miscfs/specfs/specdev.h>
68
69 #include <crypto/aes/aes.h>
70 #include <crypto/aes/aes_cbc.h>
71
72 /*
73 * uvm_swap.c: manage configuration and i/o to swap space.
74 */
75
76 /*
77 * swap space is managed in the following way:
78 *
79 * each swap partition or file is described by a "swapdev" structure.
80 * each "swapdev" structure contains a "swapent" structure which contains
81 * information that is passed up to the user (via system calls).
82 *
83 * each swap partition is assigned a "priority" (int) which controls
84 * swap partition usage.
85 *
86 * the system maintains a global data structure describing all swap
87 * partitions/files. there is a sorted LIST of "swappri" structures
88 * which describe "swapdev"'s at that priority. this LIST is headed
89 * by the "swap_priority" global var. each "swappri" contains a
90 * TAILQ of "swapdev" structures at that priority.
91 *
92 * locking:
93 * - swap_syscall_lock (krwlock_t): this lock serializes the swapctl
94 * system call and prevents the swap priority list from changing
95 * while we are in the middle of a system call (e.g. SWAP_STATS).
96 * - uvm_swap_data_lock (kmutex_t): this lock protects all swap data
97 * structures including the priority list, the swapdev structures,
98 * and the swapmap arena.
99 *
100 * each swap device has the following info:
101 * - swap device in use (could be disabled, preventing future use)
102 * - swap enabled (allows new allocations on swap)
103 * - map info in /dev/drum
104 * - vnode pointer
105 * for swap files only:
106 * - block size
107 * - max byte count in buffer
108 * - buffer
109 *
110 * userland controls and configures swap with the swapctl(2) system call.
111 * the sys_swapctl performs the following operations:
112 * [1] SWAP_NSWAP: returns the number of swap devices currently configured
113 * [2] SWAP_STATS: given a pointer to an array of swapent structures
114 * (passed in via "arg") of a size passed in via "misc" ... we load
115 * the current swap config into the array. The actual work is done
116 * in the uvm_swap_stats() function.
117 * [3] SWAP_ON: given a pathname in arg (could be device or file) and a
118 * priority in "misc", start swapping on it.
119 * [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
120 * [5] SWAP_CTL: changes the priority of a swap device (new priority in
121 * "misc")
122 */
123
124 /*
125 * swapdev: describes a single swap partition/file
126 *
127 * note the following should be true:
128 * swd_inuse <= swd_nblks [number of blocks in use is <= total blocks]
129 * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
130 */
struct swapdev {
	dev_t			swd_dev;	/* device id */
	int			swd_flags;	/* flags:inuse/enable/fake */
	int			swd_priority;	/* our priority */
	int			swd_nblks;	/* blocks in this device */
	char			*swd_path;	/* saved pathname of device */
	int			swd_pathlen;	/* length of pathname */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_npgbad;	/* #pages bad */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	blist_t			swd_blist;	/* blist for this swapdev */
	struct vnode		*swd_vp;	/* backing vnode */
	TAILQ_ENTRY(swapdev)	swd_next;	/* priority tailq */

	/* the following fields are used only for swap to regular files */
	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	struct bufq_state	*swd_tab;	/* buffer list */
	int			swd_active;	/* number of active buffers */

	/* swap encryption state; keys are generated lazily (swd_encinit) */
	volatile uint32_t	*swd_encmap;	/* bitmap of encrypted slots */
	struct aesenc		swd_enckey;	/* AES key expanded for enc */
	struct aesdec		swd_deckey;	/* AES key expanded for dec */
	bool			swd_encinit;	/* true if keys initialized */
};
157
158 /*
159 * swap device priority entry; the list is kept sorted on `spi_priority'.
160 */
struct swappri {
	int			spi_priority;	/* priority */
	TAILQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
	/* tailq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;	/* global list of pri's */
	/* NB: the global list is kept sorted ascending by spi_priority;
	 * see swaplist_insert(). */
};
167
168 /*
169 * The following two structures are used to keep track of data transfers
170 * on swap devices associated with regular files.
171 * NOTE: this code is more or less a copy of vnd.c; we use the same
172 * structure names here to ease porting..
173 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;	/* swapdev this transfer belongs to */
	int		vx_error;	/* error status of the transfer */
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define VX_BUSY		1
#define VX_DEAD		2
};
183
struct vndbuf {
	struct buf	vb_buf;		/* buffer for one piece of the i/o */
	struct vndxfer	*vb_xfer;	/* parent transfer (vx_pending counts us) */
};
188
189 /*
 * We keep a pool of vndbuf's and vndxfer structures.
191 */
static struct pool vndxfer_pool, vndbuf_pool;

/*
 * local variables
 */
static vmem_t *swapmap;	/* controls the mapping of /dev/drum */

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;

/* locks */
static kmutex_t uvm_swap_data_lock __cacheline_aligned;
static krwlock_t swap_syscall_lock;
/* set true at the end of uvm_swap_init() */
bool uvm_swap_init_done = false;

/* workqueue and use counter for swap to regular files */
static int sw_reg_count = 0;
static struct workqueue *sw_reg_workqueue;

/* tuneables */
u_int uvm_swapisfull_factor = 99;
/* default encryption policy is chosen at build time via opt_vmswap.h */
#if VMSWAP_DEFAULT_PLAINTEXT
bool uvm_swap_encrypt = false;
#else
bool uvm_swap_encrypt = true;
#endif

/*
 * prototypes
 */
static struct swapdev *swapdrum_getsdp(int);

static struct swapdev *swaplist_find(struct vnode *, bool);
static void swaplist_insert(struct swapdev *,
	struct swappri *, int);
static void swaplist_trim(void);

static int swap_on(struct lwp *, struct swapdev *);
static int swap_off(struct lwp *, struct swapdev *);

static void sw_reg_strategy(struct swapdev *, struct buf *, int);
static void sw_reg_biodone(struct buf *);
static void sw_reg_iodone(struct work *wk, void *dummy);
static void sw_reg_start(struct swapdev *);

static int uvm_swap_io(struct vm_page **, int, int, int);

static void uvm_swap_genkey(struct swapdev *);
static void uvm_swap_encryptpage(struct swapdev *, void *, int);
static void uvm_swap_decryptpage(struct swapdev *, void *, int);
243
244 static size_t
encmap_size(size_t npages)245 encmap_size(size_t npages)
246 {
247 struct swapdev *sdp;
248 const size_t bytesperword = sizeof(sdp->swd_encmap[0]);
249 const size_t bitsperword = NBBY * bytesperword;
250 const size_t nbits = npages; /* one bit for each page */
251 const size_t nwords = howmany(nbits, bitsperword);
252 const size_t nbytes = nwords * bytesperword;
253
254 return nbytes;
255 }
256
257 /*
258 * uvm_swap_init: init the swap system data structures and locks
259 *
260 * => called at boot time from init_main.c after the filesystems
261 * are brought up (which happens after uvm_init())
262 */
263 void
uvm_swap_init(void)264 uvm_swap_init(void)
265 {
266 UVMHIST_FUNC(__func__);
267
268 UVMHIST_CALLED(pdhist);
269 /*
270 * first, init the swap list, its counter, and its lock.
271 * then get a handle on the vnode for /dev/drum by using
272 * the its dev_t number ("swapdev", from MD conf.c).
273 */
274
275 LIST_INIT(&swap_priority);
276 uvmexp.nswapdev = 0;
277 rw_init(&swap_syscall_lock);
278 mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);
279
280 if (bdevvp(swapdev, &swapdev_vp))
281 panic("%s: can't get vnode for swap device", __func__);
282 if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY))
283 panic("%s: can't lock swap device", __func__);
284 if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED))
285 panic("%s: can't open swap device", __func__);
286 VOP_UNLOCK(swapdev_vp);
287
288 /*
289 * create swap block resource map to map /dev/drum. the range
290 * from 1 to INT_MAX allows 2 gigablocks of swap space. note
291 * that block 0 is reserved (used to indicate an allocation
292 * failure, or no allocation).
293 */
294 swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0,
295 VM_NOSLEEP, IPL_NONE);
296 if (swapmap == 0) {
297 panic("%s: vmem_create failed", __func__);
298 }
299
300 pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx",
301 NULL, IPL_BIO);
302 pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd",
303 NULL, IPL_BIO);
304
305 uvm_swap_init_done = true;
306
307 UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
308 }
309
310 /*
311 * swaplist functions: functions that operate on the list of swap
312 * devices on the system.
313 */
314
315 /*
316 * swaplist_insert: insert swap device "sdp" into the global list
317 *
318 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
319 * => caller must provide a newly allocated swappri structure (we will
 *	FREE it if we don't need it... this is to prevent allocation
321 * blocking here while adding swap)
322 */
323 static void
swaplist_insert(struct swapdev * sdp,struct swappri * newspp,int priority)324 swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
325 {
326 struct swappri *spp, *pspp;
327 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
328
329 KASSERT(rw_write_held(&swap_syscall_lock));
330 KASSERT(mutex_owned(&uvm_swap_data_lock));
331
332 /*
333 * find entry at or after which to insert the new device.
334 */
335 pspp = NULL;
336 LIST_FOREACH(spp, &swap_priority, spi_swappri) {
337 if (priority <= spp->spi_priority)
338 break;
339 pspp = spp;
340 }
341
342 /*
343 * new priority?
344 */
345 if (spp == NULL || spp->spi_priority != priority) {
346 spp = newspp; /* use newspp! */
347 UVMHIST_LOG(pdhist, "created new swappri = %jd",
348 priority, 0, 0, 0);
349
350 spp->spi_priority = priority;
351 TAILQ_INIT(&spp->spi_swapdev);
352
353 if (pspp)
354 LIST_INSERT_AFTER(pspp, spp, spi_swappri);
355 else
356 LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
357 } else {
358 /* we don't need a new priority structure, free it */
359 kmem_free(newspp, sizeof(*newspp));
360 }
361
362 /*
363 * priority found (or created). now insert on the priority's
364 * tailq list and bump the total number of swapdevs.
365 */
366 sdp->swd_priority = priority;
367 TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
368 uvmexp.nswapdev++;
369 }
370
371 /*
372 * swaplist_find: find and optionally remove a swap device from the
373 * global list.
374 *
375 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
376 * => we return the swapdev we found (and removed)
377 */
378 static struct swapdev *
swaplist_find(struct vnode * vp,bool remove)379 swaplist_find(struct vnode *vp, bool remove)
380 {
381 struct swapdev *sdp;
382 struct swappri *spp;
383
384 KASSERT(rw_lock_held(&swap_syscall_lock));
385 KASSERT(remove ? rw_write_held(&swap_syscall_lock) : 1);
386 KASSERT(mutex_owned(&uvm_swap_data_lock));
387
388 /*
389 * search the lists for the requested vp
390 */
391
392 LIST_FOREACH(spp, &swap_priority, spi_swappri) {
393 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
394 if (sdp->swd_vp == vp) {
395 if (remove) {
396 TAILQ_REMOVE(&spp->spi_swapdev,
397 sdp, swd_next);
398 uvmexp.nswapdev--;
399 }
400 return(sdp);
401 }
402 }
403 }
404 return (NULL);
405 }
406
407 /*
408 * swaplist_trim: scan priority list for empty priority entries and kill
409 * them.
410 *
411 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
412 */
413 static void
swaplist_trim(void)414 swaplist_trim(void)
415 {
416 struct swappri *spp, *nextspp;
417
418 KASSERT(rw_write_held(&swap_syscall_lock));
419 KASSERT(mutex_owned(&uvm_swap_data_lock));
420
421 LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) {
422 if (!TAILQ_EMPTY(&spp->spi_swapdev))
423 continue;
424 LIST_REMOVE(spp, spi_swappri);
425 kmem_free(spp, sizeof(*spp));
426 }
427 }
428
429 /*
430 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
431 * to the "swapdev" that maps that section of the drum.
432 *
433 * => each swapdev takes one big contig chunk of the drum
434 * => caller must hold uvm_swap_data_lock
435 */
436 static struct swapdev *
swapdrum_getsdp(int pgno)437 swapdrum_getsdp(int pgno)
438 {
439 struct swapdev *sdp;
440 struct swappri *spp;
441
442 KASSERT(mutex_owned(&uvm_swap_data_lock));
443
444 LIST_FOREACH(spp, &swap_priority, spi_swappri) {
445 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
446 if (sdp->swd_flags & SWF_FAKE)
447 continue;
448 if (pgno >= sdp->swd_drumoffset &&
449 pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
450 return sdp;
451 }
452 }
453 }
454 return NULL;
455 }
456
457 /*
458 * swapdrum_sdp_is: true iff the swap device for pgno is sdp
459 *
460 * => for use in positive assertions only; result is not stable
461 */
462 static bool __debugused
swapdrum_sdp_is(int pgno,struct swapdev * sdp)463 swapdrum_sdp_is(int pgno, struct swapdev *sdp)
464 {
465 bool result;
466
467 mutex_enter(&uvm_swap_data_lock);
468 result = swapdrum_getsdp(pgno) == sdp;
469 mutex_exit(&uvm_swap_data_lock);
470
471 return result;
472 }
473
/*
 * swapsys_lock: serialize against swap configuration changes
 * (takes swap_syscall_lock as reader or writer per "op").
 */
void
swapsys_lock(krw_t op)
{

	rw_enter(&swap_syscall_lock, op);
}
478
swapsys_unlock(void)479 void swapsys_unlock(void)
480 {
481 rw_exit(&swap_syscall_lock);
482 }
483
484 static void
swapent_cvt(struct swapent * se,const struct swapdev * sdp,int inuse)485 swapent_cvt(struct swapent *se, const struct swapdev *sdp, int inuse)
486 {
487 se->se_dev = sdp->swd_dev;
488 se->se_flags = sdp->swd_flags;
489 se->se_nblks = sdp->swd_nblks;
490 se->se_inuse = inuse;
491 se->se_priority = sdp->swd_priority;
492 KASSERT(sdp->swd_pathlen < sizeof(se->se_path));
493 strcpy(se->se_path, sdp->swd_path);
494 }
495
/*
 * Hooks for the old SWAP_STATS13/SWAP_STATS50 layouts.  They default to
 * enosys; presumably the COMPAT_* modules repoint them when loaded
 * (see opt_compat_netbsd.h) -- confirm against the compat code.
 */
int (*uvm_swap_stats13)(const struct sys_swapctl_args *, register_t *) =
    (void *)enosys;
int (*uvm_swap_stats50)(const struct sys_swapctl_args *, register_t *) =
    (void *)enosys;
500
501 /*
502 * sys_swapctl: main entry point for swapctl(2) system call
503 * [with two helper functions: swap_on and swap_off]
504 */
int
sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) cmd;
		syscallarg(void *) arg;
		syscallarg(int) misc;
	} */
	struct vnode *vp;
	struct nameidata nd;
	struct swappri *spp;
	struct swapdev *sdp;
#define SWAP_PATH_MAX (PATH_MAX + 1)
	char *userpath;
	size_t len = 0;
	int error;
	int priority;
	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);

	/*
	 * we handle the non-priv NSWAP and STATS request first.
	 *
	 * SWAP_NSWAP: return number of config'd swap devices
	 * [can also be obtained with uvmexp sysctl]
	 */
	if (SCARG(uap, cmd) == SWAP_NSWAP) {
		const int nswapdev = uvmexp.nswapdev;
		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%jd", nswapdev,
		    0, 0, 0);
		*retval = nswapdev;
		return 0;
	}

	userpath = kmem_alloc(SWAP_PATH_MAX, KM_SLEEP);

	/*
	 * ensure serialized syscall access by grabbing the swap_syscall_lock
	 */
	rw_enter(&swap_syscall_lock, RW_WRITER);

	/*
	 * SWAP_STATS: get stats on current # of configured swap devs
	 *
	 * note that the swap_priority list can't change as long
	 * as we are holding the swap_syscall_lock. we don't want
	 * to grab the uvm_swap_data_lock because we may fault&sleep during
	 * copyout() and we don't want to be holding that lock then!
	 */
	switch (SCARG(uap, cmd)) {
	case SWAP_STATS13:
		error = (*uvm_swap_stats13)(uap, retval);
		goto out;
	case SWAP_STATS50:
		error = (*uvm_swap_stats50)(uap, retval);
		goto out;
	case SWAP_STATS:
		error = uvm_swap_stats(SCARG(uap, arg), SCARG(uap, misc),
		    NULL, sizeof(struct swapent), retval);
		UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
		goto out;

	case SWAP_GETDUMPDEV:
		error = copyout(&dumpdev, SCARG(uap, arg), sizeof(dumpdev));
		goto out;
	default:
		break;
	}

	/*
	 * all other requests require superuser privs. verify.
	 */
	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL,
	    0, NULL, NULL, NULL)))
		goto out;

	if (SCARG(uap, cmd) == SWAP_DUMPOFF) {
		/* drop the current dump device */
		dumpdev = NODEV;
		dumpcdev = NODEV;
		cpu_dumpconf();
		goto out;
	}

	/*
	 * at this point we expect a path name in arg. we will
	 * use namei() to gain a vnode reference (vref), and lock
	 * the vnode (VOP_LOCK).
	 *
	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
	 * miniroot)
	 */
	if (SCARG(uap, arg) == NULL) {
		vp = rootvp;		/* miniroot */
		vref(vp);
		if (vn_lock(vp, LK_EXCLUSIVE)) {
			vrele(vp);
			error = EBUSY;
			goto out;
		}
		if (SCARG(uap, cmd) == SWAP_ON &&
		    copystr("miniroot", userpath, SWAP_PATH_MAX, &len))
			panic("swapctl: miniroot copy failed");
	} else {
		struct pathbuf *pb;

		/*
		 * This used to allow copying in one extra byte
		 * (SWAP_PATH_MAX instead of PATH_MAX) for SWAP_ON.
		 * This was completely pointless because if anyone
		 * used that extra byte namei would fail with
		 * ENAMETOOLONG anyway, so I've removed the excess
		 * logic. - dholland 20100215
		 */

		error = pathbuf_copyin(SCARG(uap, arg), &pb);
		if (error) {
			goto out;
		}
		if (SCARG(uap, cmd) == SWAP_ON) {
			/* get a copy of the string */
			pathbuf_copystring(pb, userpath, SWAP_PATH_MAX);
			len = strlen(userpath) + 1;
		}
		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
		if ((error = namei(&nd))) {
			pathbuf_destroy(pb);
			goto out;
		}
		vp = nd.ni_vp;
		pathbuf_destroy(pb);
	}
	/* note: "vp" is referenced and locked */

	error = 0;		/* assume no error */
	switch(SCARG(uap, cmd)) {

	case SWAP_DUMPDEV:
		if (vp->v_type != VBLK) {
			error = ENOTBLK;
			break;
		}
		if (bdevsw_lookup(vp->v_rdev)) {
			dumpdev = vp->v_rdev;
			dumpcdev = devsw_blk2chr(dumpdev);
		} else
			dumpdev = NODEV;
		cpu_dumpconf();
		break;

	case SWAP_CTL:
		/*
		 * get new priority, remove old entry (if any) and then
		 * reinsert it in the correct place. finally, prune out
		 * any empty priority structures.
		 *
		 * note: on success swaplist_insert() consumes (or frees)
		 * spp; we only free it ourselves on the ENOENT path.
		 */
		priority = SCARG(uap, misc);
		spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
		mutex_enter(&uvm_swap_data_lock);
		if ((sdp = swaplist_find(vp, true)) == NULL) {
			error = ENOENT;
		} else {
			swaplist_insert(sdp, spp, priority);
			swaplist_trim();
		}
		mutex_exit(&uvm_swap_data_lock);
		if (error)
			kmem_free(spp, sizeof(*spp));
		break;

	case SWAP_ON:

		/*
		 * check for duplicates. if none found, then insert a
		 * dummy entry on the list to prevent someone else from
		 * trying to enable this device while we are working on
		 * it.
		 */

		priority = SCARG(uap, misc);
		sdp = kmem_zalloc(sizeof(*sdp), KM_SLEEP);
		spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
		sdp->swd_flags = SWF_FAKE;
		sdp->swd_vp = vp;
		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
		bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK);
		mutex_enter(&uvm_swap_data_lock);
		if (swaplist_find(vp, false) != NULL) {
			error = EBUSY;
			mutex_exit(&uvm_swap_data_lock);
			bufq_free(sdp->swd_tab);
			kmem_free(sdp, sizeof(*sdp));
			kmem_free(spp, sizeof(*spp));
			break;
		}
		swaplist_insert(sdp, spp, priority);
		mutex_exit(&uvm_swap_data_lock);

		/* len was set above by the SWAP_ON copystr/copystring */
		KASSERT(len > 0);
		sdp->swd_pathlen = len;
		sdp->swd_path = kmem_alloc(len, KM_SLEEP);
		if (copystr(userpath, sdp->swd_path, len, 0) != 0)
			panic("swapctl: copystr");

		/*
		 * we've now got a FAKE placeholder in the swap list.
		 * now attempt to enable swap on it. if we fail, undo
		 * what we've done and kill the fake entry we just inserted.
		 * if swap_on is a success, it will clear the SWF_FAKE flag
		 */

		if ((error = swap_on(l, sdp)) != 0) {
			mutex_enter(&uvm_swap_data_lock);
			(void) swaplist_find(vp, true);  /* kill fake entry */
			swaplist_trim();
			mutex_exit(&uvm_swap_data_lock);
			bufq_free(sdp->swd_tab);
			kmem_free(sdp->swd_path, sdp->swd_pathlen);
			kmem_free(sdp, sizeof(*sdp));
			break;
		}
		break;

	case SWAP_OFF:
		mutex_enter(&uvm_swap_data_lock);
		if ((sdp = swaplist_find(vp, false)) == NULL) {
			mutex_exit(&uvm_swap_data_lock);
			error = ENXIO;
			break;
		}

		/*
		 * If a device isn't in use or enabled, we
		 * can't stop swapping from it (again).
		 */
		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
			mutex_exit(&uvm_swap_data_lock);
			error = EBUSY;
			break;
		}

		/*
		 * do the real work.
		 * note: swap_off() drops uvm_swap_data_lock for us.
		 */
		error = swap_off(l, sdp);
		break;

	default:
		error = EINVAL;
	}

	/*
	 * done! release the ref gained by namei() and unlock.
	 */
	vput(vp);
out:
	rw_exit(&swap_syscall_lock);
	kmem_free(userpath, SWAP_PATH_MAX);

	UVMHIST_LOG(pdhist, "<- done! error=%jd", error, 0, 0, 0);
	return (error);
}
766
767 /*
768 * uvm_swap_stats: implements swapctl(SWAP_STATS). The function is kept
769 * away from sys_swapctl() in order to allow COMPAT_* swapctl()
770 * emulation to use it directly without going through sys_swapctl().
771 * The problem with using sys_swapctl() there is that it involves
772 * copying the swapent array to the stackgap, and this array's size
773 * is not known at build time. Hence it would not be possible to
774 * ensure it would fit in the stackgap in any case.
775 */
776 int
uvm_swap_stats(char * ptr,int misc,void (* f)(void *,const struct swapent *),size_t len,register_t * retval)777 uvm_swap_stats(char *ptr, int misc,
778 void (*f)(void *, const struct swapent *), size_t len,
779 register_t *retval)
780 {
781 struct swappri *spp;
782 struct swapdev *sdp;
783 struct swapent sep;
784 int count = 0;
785 int error;
786
787 KASSERT(len <= sizeof(sep));
788 if (len == 0)
789 return ENOSYS;
790
791 if (misc < 0)
792 return EINVAL;
793
794 if (misc == 0 || uvmexp.nswapdev == 0)
795 return 0;
796
797 /* Make sure userland cannot exhaust kernel memory */
798 if ((size_t)misc > (size_t)uvmexp.nswapdev)
799 misc = uvmexp.nswapdev;
800
801 KASSERT(rw_lock_held(&swap_syscall_lock));
802
803 LIST_FOREACH(spp, &swap_priority, spi_swappri) {
804 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
805 int inuse;
806
807 if (misc-- <= 0)
808 break;
809
810 inuse = btodb((uint64_t)sdp->swd_npginuse <<
811 PAGE_SHIFT);
812
813 memset(&sep, 0, sizeof(sep));
814 swapent_cvt(&sep, sdp, inuse);
815 if (f)
816 (*f)(&sep, &sep);
817 if ((error = copyout(&sep, ptr, len)) != 0)
818 return error;
819 ptr += len;
820 count++;
821 }
822 }
823 *retval = count;
824 return 0;
825 }
826
827 /*
828 * swap_on: attempt to enable a swapdev for swapping. note that the
829 * swapdev is already on the global list, but disabled (marked
830 * SWF_FAKE).
831 *
832 * => we avoid the start of the disk (to protect disk labels)
833 * => we also avoid the miniroot, if we are swapping to root.
834 * => caller should leave uvm_swap_data_lock unlocked, we may lock it
835 * if needed.
836 */
static int
swap_on(struct lwp *l, struct swapdev *sdp)
{
	struct vnode *vp;
	int error, npages, nblocks, size;
	long addr;
	vmem_addr_t result;
	struct vattr va;
	dev_t dev;
	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);

	/*
	 * we want to enable swapping on sdp. the swd_vp contains
	 * the vnode we want (locked and ref'd), and the swd_dev
	 * contains the dev_t of the file, if it is a block device.
	 */

	vp = sdp->swd_vp;
	dev = sdp->swd_dev;

	/*
	 * open the swap file (mostly useful for block device files to
	 * let device driver know what is up).
	 *
	 * we skip the open/close for root on swap because the root
	 * has already been opened when root was mounted (mountroot).
	 */
	if (vp != rootvp) {
		if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred)))
			return (error);
	}

	/* XXX this only works for block devices */
	UVMHIST_LOG(pdhist, "  dev=%jd, major(dev)=%jd", dev, major(dev), 0, 0);

	/*
	 * we now need to determine the size of the swap area. for
	 * block specials we can call the d_psize function.
	 * for normal files, we must stat [get attrs].
	 *
	 * we put the result in nblks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		if ((nblocks = bdev_size(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

	case VREG:
		if ((error = VOP_GETATTR(vp, &va, l->l_cred)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		sdp->swd_bsize = 1 << vp->v_mount->mnt_fs_bshift;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time.   take it easy on NFS servers.
		 */
		if (vp->v_tag == VT_NFS)
			sdp->swd_maxactive = 2; /* XXX */
		else
			sdp->swd_maxactive = 8; /* XXX */
		break;

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
	 */

	sdp->swd_nblks = nblocks;
	npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT;

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}

	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area. we want at least one page.
	 */

	if (size < 1) {
		UVMHIST_LOG(pdhist, "  size <= 1!!", 0, 0, 0, 0);
		error = EINVAL;
		goto bad;
	}

	UVMHIST_LOG(pdhist, "  dev=%#jx: size=%jd addr=%jd", dev, size, addr, 0);

	/*
	 * now we need to allocate an extent to manage this swap device
	 */

	sdp->swd_blist = blist_create(npages);
	/* mark all except the `saved' region free. */
	blist_free(sdp->swd_blist, addr, size);

	/*
	 * allocate space to for swap encryption state and mark the
	 * keys uninitialized so we generate them lazily
	 */
	sdp->swd_encmap = kmem_zalloc(encmap_size(npages), KM_SLEEP);
	sdp->swd_encinit = false;

	/*
	 * if the vnode we are swapping to is the root vnode
	 * (i.e. we are swapping to the miniroot) then we want
	 * to make sure we don't overwrite it.   do a statfs to
	 * find its size and skip over it.
	 */
	if (vp == rootvp) {
		struct mount *mp;
		struct statvfs *sp;
		int rootblocks, rootpages;

		mp = rootvnode->v_mount;
		sp = &mp->mnt_stat;
		rootblocks = sp->f_blocks * btodb(sp->f_frsize);
		/*
		 * XXX: sp->f_blocks isn't the total number of
		 * blocks in the filesystem, it's the number of
		 * data blocks.  so, our rootblocks almost
		 * definitely underestimates the total size
		 * of the filesystem - how badly depends on the
		 * details of the filesystem type.  there isn't
		 * an obvious way to deal with this cleanly
		 * and perfectly, so for now we just pad our
		 * rootblocks estimate with an extra 5 percent.
		 */
		rootblocks += (rootblocks >> 5) +
			(rootblocks >> 6) +
			(rootblocks >> 7);
		rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
		if (rootpages > size)
			panic("swap_on: miniroot larger than swap?");

		/* reserve the miniroot's pages so we never allocate them */
		if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) {
			panic("swap_on: unable to preserve miniroot");
		}

		size -= rootpages;
		printf("Preserved %d pages of miniroot ", rootpages);
		printf("leaving %d pages of swap\n", size);
	}

	/*
	 * add a ref to vp to reflect usage as a swap device.
	 */
	vref(vp);

	/*
	 * now add the new swapdev to the drum and enable.
	 */
	error = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP, &result);
	if (error != 0)
		panic("swapdrum_add");
	/*
	 * If this is the first regular swap create the workqueue.
	 * => Protected by swap_syscall_lock.
	 */
	if (vp->v_type != VBLK) {
		if (sw_reg_count++ == 0) {
			KASSERT(sw_reg_workqueue == NULL);
			if (workqueue_create(&sw_reg_workqueue, "swapiod",
			    sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0)
				panic("%s: workqueue_create failed", __func__);
		}
	}

	sdp->swd_drumoffset = (int)result;
	sdp->swd_drumsize = npages;
	sdp->swd_npages = size;
	mutex_enter(&uvm_swap_data_lock);
	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
	uvmexp.swpages += size;
	uvmexp.swpgavail += size;
	mutex_exit(&uvm_swap_data_lock);
	return (0);

	/*
	 * failure: clean up and return error.
	 * note: no error paths reach here after swd_encmap is allocated,
	 * so only the blist (if any) and the open need undoing.
	 */

bad:
	if (sdp->swd_blist) {
		blist_destroy(sdp->swd_blist);
	}
	if (vp != rootvp) {
		(void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred);
	}
	return (error);
}
1050
1051 /*
1052 * swap_off: stop swapping on swapdev
1053 *
1054 * => swap data should be locked, we will unlock.
1055 */
1056 static int
swap_off(struct lwp * l,struct swapdev * sdp)1057 swap_off(struct lwp *l, struct swapdev *sdp)
1058 {
1059 int npages = sdp->swd_npages;
1060 int error = 0;
1061
1062 UVMHIST_FUNC(__func__);
1063 UVMHIST_CALLARGS(pdhist, " dev=%#jx, npages=%jd", sdp->swd_dev,npages, 0, 0);
1064
1065 KASSERT(rw_write_held(&swap_syscall_lock));
1066 KASSERT(mutex_owned(&uvm_swap_data_lock));
1067
1068 /* disable the swap area being removed */
1069 sdp->swd_flags &= ~SWF_ENABLE;
1070 uvmexp.swpgavail -= npages;
1071 mutex_exit(&uvm_swap_data_lock);
1072
1073 /*
1074 * the idea is to find all the pages that are paged out to this
1075 * device, and page them all in. in uvm, swap-backed pageable
1076 * memory can take two forms: aobjs and anons. call the
1077 * swapoff hook for each subsystem to bring in pages.
1078 */
1079
1080 if (uao_swap_off(sdp->swd_drumoffset,
1081 sdp->swd_drumoffset + sdp->swd_drumsize) ||
1082 amap_swap_off(sdp->swd_drumoffset,
1083 sdp->swd_drumoffset + sdp->swd_drumsize)) {
1084 error = ENOMEM;
1085 } else if (sdp->swd_npginuse > sdp->swd_npgbad) {
1086 error = EBUSY;
1087 }
1088
1089 if (error) {
1090 mutex_enter(&uvm_swap_data_lock);
1091 sdp->swd_flags |= SWF_ENABLE;
1092 uvmexp.swpgavail += npages;
1093 mutex_exit(&uvm_swap_data_lock);
1094
1095 return error;
1096 }
1097
1098 /*
1099 * If this is the last regular swap destroy the workqueue.
1100 * => Protected by swap_syscall_lock.
1101 */
1102 if (sdp->swd_vp->v_type != VBLK) {
1103 KASSERT(sw_reg_count > 0);
1104 KASSERT(sw_reg_workqueue != NULL);
1105 if (--sw_reg_count == 0) {
1106 workqueue_destroy(sw_reg_workqueue);
1107 sw_reg_workqueue = NULL;
1108 }
1109 }
1110
1111 /*
1112 * done with the vnode.
1113 * drop our ref on the vnode before calling VOP_CLOSE()
1114 * so that spec_close() can tell if this is the last close.
1115 */
1116 vrele(sdp->swd_vp);
1117 if (sdp->swd_vp != rootvp) {
1118 (void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred);
1119 }
1120
1121 mutex_enter(&uvm_swap_data_lock);
1122 uvmexp.swpages -= npages;
1123 uvmexp.swpginuse -= sdp->swd_npgbad;
1124
1125 if (swaplist_find(sdp->swd_vp, true) == NULL)
1126 panic("%s: swapdev not in list", __func__);
1127 swaplist_trim();
1128 mutex_exit(&uvm_swap_data_lock);
1129
1130 /*
1131 * free all resources!
1132 */
1133 vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize);
1134 blist_destroy(sdp->swd_blist);
1135 bufq_free(sdp->swd_tab);
1136 kmem_free(__UNVOLATILE(sdp->swd_encmap),
1137 encmap_size(sdp->swd_drumsize));
1138 explicit_memset(&sdp->swd_enckey, 0, sizeof sdp->swd_enckey);
1139 explicit_memset(&sdp->swd_deckey, 0, sizeof sdp->swd_deckey);
1140 kmem_free(sdp, sizeof(*sdp));
1141 return (0);
1142 }
1143
/*
 * uvm_swap_shutdown: turn off all swap devices at system shutdown.
 */
void
uvm_swap_shutdown(struct lwp *l)
{
	struct swapdev *sdp;
	struct swappri *spp;
	struct vnode *vp;
	int error;

	/* nothing to do if swap was never initialized or none configured */
	if (!uvm_swap_init_done || uvmexp.nswapdev == 0)
		return;
	printf("turning off swap...");
	rw_enter(&swap_syscall_lock, RW_WRITER);
	mutex_enter(&uvm_swap_data_lock);
again:
	/*
	 * restart the scan from the top after every swap_off() call:
	 * swap_off() drops uvm_swap_data_lock and (on success) frees
	 * sdp, so the list iterators can't be trusted afterwards.
	 */
	LIST_FOREACH(spp, &swap_priority, spi_swappri)
		TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_flags & SWF_FAKE)
				continue;
			if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0)
				continue;
#ifdef DEBUG
			printf("\nturning off swap on %s...", sdp->swd_path);
#endif
			/* Have to lock and reference vnode for swap_off(). */
			vn_lock(vp = sdp->swd_vp, LK_EXCLUSIVE|LK_RETRY);
			vref(vp);
			error = swap_off(l, sdp);	/* unlocks data lock */
			vput(vp);
			mutex_enter(&uvm_swap_data_lock);
			if (error) {
				/*
				 * couldn't stop it; unlink the entry anyway
				 * so the restart below can't loop forever on
				 * the same device.
				 */
				printf("stopping swap on %s failed "
				    "with error %d\n", sdp->swd_path, error);
				TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
				uvmexp.nswapdev--;
				swaplist_trim();
			}
			goto again;
		}
	printf(" done\n");
	mutex_exit(&uvm_swap_data_lock);
	rw_exit(&swap_syscall_lock);
}
1186
1187
1188 /*
1189 * /dev/drum interface and i/o functions
1190 */
1191
1192 /*
1193 * swopen: allow the initial open from uvm_swap_init() and reject all others.
1194 */
1195
static int
swopen(dev_t dev, int flag, int mode, struct lwp *l)
{
	/* set on the first (and only permitted) open */
	static bool swopen_done = false;

	if (swopen_done)
		return ENODEV;
	swopen_done = true;
	return 0;
}
1207
1208 /*
1209 * swstrategy: perform I/O on the drum
1210 *
1211 * => we must map the i/o request from the drum to the correct swapdev.
1212 */
1213 static void
swstrategy(struct buf * bp)1214 swstrategy(struct buf *bp)
1215 {
1216 struct swapdev *sdp;
1217 struct vnode *vp;
1218 int pageno, bn;
1219 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
1220
1221 /*
1222 * convert block number to swapdev. note that swapdev can't
1223 * be yanked out from under us because we are holding resources
1224 * in it (i.e. the blocks we are doing I/O on).
1225 */
1226 pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
1227 mutex_enter(&uvm_swap_data_lock);
1228 sdp = swapdrum_getsdp(pageno);
1229 mutex_exit(&uvm_swap_data_lock);
1230 if (sdp == NULL) {
1231 bp->b_error = EINVAL;
1232 bp->b_resid = bp->b_bcount;
1233 biodone(bp);
1234 UVMHIST_LOG(pdhist, " failed to get swap device", 0, 0, 0, 0);
1235 return;
1236 }
1237
1238 /*
1239 * convert drum page number to block number on this swapdev.
1240 */
1241
1242 pageno -= sdp->swd_drumoffset; /* page # on swapdev */
1243 bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */
1244
1245 UVMHIST_LOG(pdhist, " Rd/Wr (0/1) %jd: mapoff=%#jx bn=%#jx bcount=%jd",
1246 ((bp->b_flags & B_READ) == 0) ? 1 : 0,
1247 sdp->swd_drumoffset, bn, bp->b_bcount);
1248
1249 /*
1250 * for block devices we finish up here.
1251 * for regular files we have to do more work which we delegate
1252 * to sw_reg_strategy().
1253 */
1254
1255 vp = sdp->swd_vp; /* swapdev vnode pointer */
1256 switch (vp->v_type) {
1257 default:
1258 panic("%s: vnode type 0x%x", __func__, vp->v_type);
1259
1260 case VBLK:
1261
1262 /*
1263 * must convert "bp" from an I/O on /dev/drum to an I/O
1264 * on the swapdev (sdp).
1265 */
1266 bp->b_blkno = bn; /* swapdev block number */
1267 bp->b_dev = sdp->swd_dev; /* swapdev dev_t */
1268
1269 /*
1270 * if we are doing a write, we have to redirect the i/o on
1271 * drum's v_numoutput counter to the swapdevs.
1272 */
1273 if ((bp->b_flags & B_READ) == 0) {
1274 mutex_enter(bp->b_objlock);
1275 vwakeup(bp); /* kills one 'v_numoutput' on drum */
1276 mutex_exit(bp->b_objlock);
1277 mutex_enter(vp->v_interlock);
1278 vp->v_numoutput++; /* put it on swapdev */
1279 mutex_exit(vp->v_interlock);
1280 }
1281
1282 /*
1283 * finally plug in swapdev vnode and start I/O
1284 */
1285 bp->b_vp = vp;
1286 bp->b_objlock = vp->v_interlock;
1287 VOP_STRATEGY(vp, bp);
1288 return;
1289
1290 case VREG:
1291 /*
1292 * delegate to sw_reg_strategy function.
1293 */
1294 sw_reg_strategy(sdp, bp, bn);
1295 return;
1296 }
1297 /* NOTREACHED */
1298 }
1299
1300 /*
1301 * swread: the read function for the drum (just a call to physio)
1302 */
1303 /*ARGSUSED*/
1304 static int
swread(dev_t dev,struct uio * uio,int ioflag)1305 swread(dev_t dev, struct uio *uio, int ioflag)
1306 {
1307 UVMHIST_FUNC(__func__);
1308 UVMHIST_CALLARGS(pdhist, " dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0);
1309
1310 return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
1311 }
1312
1313 /*
1314 * swwrite: the write function for the drum (just a call to physio)
1315 */
1316 /*ARGSUSED*/
1317 static int
swwrite(dev_t dev,struct uio * uio,int ioflag)1318 swwrite(dev_t dev, struct uio *uio, int ioflag)
1319 {
1320 UVMHIST_FUNC(__func__);
1321 UVMHIST_CALLARGS(pdhist, " dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0);
1322
1323 return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
1324 }
1325
/*
 * Block device switch for the drum.  Only open and strategy do real
 * work: swopen() admits just the single initial open (see above) and
 * swstrategy() routes requests to the right swapdev; everything else
 * is a stub.
 */
const struct bdevsw swap_bdevsw = {
	.d_open = swopen,
	.d_close = noclose,
	.d_strategy = swstrategy,
	.d_ioctl = noioctl,
	.d_dump = nodump,
	.d_psize = nosize,
	.d_discard = nodiscard,
	.d_flag = D_OTHER
};
1336
/*
 * Character device switch for the drum.  Reads and writes go through
 * physio via swread()/swwrite(); all other operations are stubs.
 */
const struct cdevsw swap_cdevsw = {
	.d_open = nullopen,
	.d_close = nullclose,
	.d_read = swread,
	.d_write = swwrite,
	.d_ioctl = noioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_OTHER,
};
1351
1352 /*
1353 * sw_reg_strategy: handle swap i/o to regular files
1354 */
1355 static void
sw_reg_strategy(struct swapdev * sdp,struct buf * bp,int bn)1356 sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
1357 {
1358 struct vnode *vp;
1359 struct vndxfer *vnx;
1360 daddr_t nbn;
1361 char *addr;
1362 off_t byteoff;
1363 int s, off, nra, error, sz, resid;
1364 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
1365
1366 /*
1367 * allocate a vndxfer head for this transfer and point it to
1368 * our buffer.
1369 */
1370 vnx = pool_get(&vndxfer_pool, PR_WAITOK);
1371 vnx->vx_flags = VX_BUSY;
1372 vnx->vx_error = 0;
1373 vnx->vx_pending = 0;
1374 vnx->vx_bp = bp;
1375 vnx->vx_sdp = sdp;
1376
1377 /*
1378 * setup for main loop where we read filesystem blocks into
1379 * our buffer.
1380 */
1381 error = 0;
1382 bp->b_resid = bp->b_bcount; /* nothing transferred yet! */
1383 addr = bp->b_data; /* current position in buffer */
1384 byteoff = dbtob((uint64_t)bn);
1385
1386 for (resid = bp->b_resid; resid; resid -= sz) {
1387 struct vndbuf *nbp;
1388
1389 /*
1390 * translate byteoffset into block number. return values:
1391 * vp = vnode of underlying device
1392 * nbn = new block number (on underlying vnode dev)
1393 * nra = num blocks we can read-ahead (excludes requested
1394 * block)
1395 */
1396 nra = 0;
1397 error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
1398 &vp, &nbn, &nra);
1399
1400 if (error == 0 && nbn == (daddr_t)-1) {
1401 /*
1402 * this used to just set error, but that doesn't
1403 * do the right thing. Instead, it causes random
1404 * memory errors. The panic() should remain until
1405 * this condition doesn't destabilize the system.
1406 */
1407 #if 1
1408 panic("%s: swap to sparse file", __func__);
1409 #else
1410 error = EIO; /* failure */
1411 #endif
1412 }
1413
1414 /*
1415 * punt if there was an error or a hole in the file.
1416 * we must wait for any i/o ops we have already started
1417 * to finish before returning.
1418 *
1419 * XXX we could deal with holes here but it would be
1420 * a hassle (in the write case).
1421 */
1422 if (error) {
1423 s = splbio();
1424 vnx->vx_error = error; /* pass error up */
1425 goto out;
1426 }
1427
1428 /*
1429 * compute the size ("sz") of this transfer (in bytes).
1430 */
1431 off = byteoff % sdp->swd_bsize;
1432 sz = (1 + nra) * sdp->swd_bsize - off;
1433 if (sz > resid)
1434 sz = resid;
1435
1436 UVMHIST_LOG(pdhist, "sw_reg_strategy: "
1437 "vp %#jx/%#jx offset %#jx/%#jx",
1438 (uintptr_t)sdp->swd_vp, (uintptr_t)vp, byteoff, nbn);
1439
1440 /*
1441 * now get a buf structure. note that the vb_buf is
1442 * at the front of the nbp structure so that you can
1443 * cast pointers between the two structure easily.
1444 */
1445 nbp = pool_get(&vndbuf_pool, PR_WAITOK);
1446 buf_init(&nbp->vb_buf);
1447 nbp->vb_buf.b_flags = bp->b_flags;
1448 nbp->vb_buf.b_cflags = bp->b_cflags;
1449 nbp->vb_buf.b_oflags = bp->b_oflags;
1450 nbp->vb_buf.b_bcount = sz;
1451 nbp->vb_buf.b_bufsize = sz;
1452 nbp->vb_buf.b_error = 0;
1453 nbp->vb_buf.b_data = addr;
1454 nbp->vb_buf.b_lblkno = 0;
1455 nbp->vb_buf.b_blkno = nbn + btodb(off);
1456 nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
1457 nbp->vb_buf.b_iodone = sw_reg_biodone;
1458 nbp->vb_buf.b_vp = vp;
1459 nbp->vb_buf.b_objlock = vp->v_interlock;
1460 if (vp->v_type == VBLK) {
1461 nbp->vb_buf.b_dev = vp->v_rdev;
1462 }
1463
1464 nbp->vb_xfer = vnx; /* patch it back in to vnx */
1465
1466 /*
1467 * Just sort by block number
1468 */
1469 s = splbio();
1470 if (vnx->vx_error != 0) {
1471 buf_destroy(&nbp->vb_buf);
1472 pool_put(&vndbuf_pool, nbp);
1473 goto out;
1474 }
1475 vnx->vx_pending++;
1476
1477 /* sort it in and start I/O if we are not over our limit */
1478 /* XXXAD locking */
1479 bufq_put(sdp->swd_tab, &nbp->vb_buf);
1480 sw_reg_start(sdp);
1481 splx(s);
1482
1483 /*
1484 * advance to the next I/O
1485 */
1486 byteoff += sz;
1487 addr += sz;
1488 }
1489
1490 s = splbio();
1491
1492 out: /* Arrive here at splbio */
1493 vnx->vx_flags &= ~VX_BUSY;
1494 if (vnx->vx_pending == 0) {
1495 error = vnx->vx_error;
1496 pool_put(&vndxfer_pool, vnx);
1497 bp->b_error = error;
1498 biodone(bp);
1499 }
1500 splx(s);
1501 }
1502
1503 /*
1504 * sw_reg_start: start an I/O request on the requested swapdev
1505 *
1506 * => reqs are sorted by b_rawblkno (above)
1507 */
1508 static void
sw_reg_start(struct swapdev * sdp)1509 sw_reg_start(struct swapdev *sdp)
1510 {
1511 struct buf *bp;
1512 struct vnode *vp;
1513 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
1514
1515 /* recursion control */
1516 if ((sdp->swd_flags & SWF_BUSY) != 0)
1517 return;
1518
1519 sdp->swd_flags |= SWF_BUSY;
1520
1521 while (sdp->swd_active < sdp->swd_maxactive) {
1522 bp = bufq_get(sdp->swd_tab);
1523 if (bp == NULL)
1524 break;
1525 sdp->swd_active++;
1526
1527 UVMHIST_LOG(pdhist,
1528 "sw_reg_start: bp %#jx vp %#jx blkno %#jx cnt %#jx",
1529 (uintptr_t)bp, (uintptr_t)bp->b_vp, (uintptr_t)bp->b_blkno,
1530 bp->b_bcount);
1531 vp = bp->b_vp;
1532 KASSERT(bp->b_objlock == vp->v_interlock);
1533 if ((bp->b_flags & B_READ) == 0) {
1534 mutex_enter(vp->v_interlock);
1535 vp->v_numoutput++;
1536 mutex_exit(vp->v_interlock);
1537 }
1538 VOP_STRATEGY(vp, bp);
1539 }
1540 sdp->swd_flags &= ~SWF_BUSY;
1541 }
1542
1543 /*
1544 * sw_reg_biodone: one of our i/o's has completed
1545 */
1546 static void
sw_reg_biodone(struct buf * bp)1547 sw_reg_biodone(struct buf *bp)
1548 {
1549 workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL);
1550 }
1551
1552 /*
1553 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
1554 *
1555 * => note that we can recover the vndbuf struct by casting the buf ptr
1556 */
1557 static void
sw_reg_iodone(struct work * wk,void * dummy)1558 sw_reg_iodone(struct work *wk, void *dummy)
1559 {
1560 struct vndbuf *vbp = (void *)wk;
1561 struct vndxfer *vnx = vbp->vb_xfer;
1562 struct buf *pbp = vnx->vx_bp; /* parent buffer */
1563 struct swapdev *sdp = vnx->vx_sdp;
1564 int s, resid, error;
1565 KASSERT(&vbp->vb_buf.b_work == wk);
1566 UVMHIST_FUNC(__func__);
1567 UVMHIST_CALLARGS(pdhist, " vbp=%#jx vp=%#jx blkno=%#jx addr=%#jx",
1568 (uintptr_t)vbp, (uintptr_t)vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno,
1569 (uintptr_t)vbp->vb_buf.b_data);
1570 UVMHIST_LOG(pdhist, " cnt=%#jx resid=%#jx",
1571 vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);
1572
1573 /*
1574 * protect vbp at splbio and update.
1575 */
1576
1577 s = splbio();
1578 resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
1579 pbp->b_resid -= resid;
1580 vnx->vx_pending--;
1581
1582 if (vbp->vb_buf.b_error != 0) {
1583 /* pass error upward */
1584 error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO;
1585 UVMHIST_LOG(pdhist, " got error=%jd !", error, 0, 0, 0);
1586 vnx->vx_error = error;
1587 }
1588
1589 /*
1590 * kill vbp structure
1591 */
1592 buf_destroy(&vbp->vb_buf);
1593 pool_put(&vndbuf_pool, vbp);
1594
1595 /*
1596 * wrap up this transaction if it has run to completion or, in
1597 * case of an error, when all auxiliary buffers have returned.
1598 */
1599 if (vnx->vx_error != 0) {
1600 /* pass error upward */
1601 error = vnx->vx_error;
1602 if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
1603 pbp->b_error = error;
1604 biodone(pbp);
1605 pool_put(&vndxfer_pool, vnx);
1606 }
1607 } else if (pbp->b_resid == 0) {
1608 KASSERT(vnx->vx_pending == 0);
1609 if ((vnx->vx_flags & VX_BUSY) == 0) {
1610 UVMHIST_LOG(pdhist, " iodone, pbp=%#jx error=%jd !",
1611 (uintptr_t)pbp, vnx->vx_error, 0, 0);
1612 biodone(pbp);
1613 pool_put(&vndxfer_pool, vnx);
1614 }
1615 }
1616
1617 /*
1618 * done! start next swapdev I/O if one is pending
1619 */
1620 sdp->swd_active--;
1621 sw_reg_start(sdp);
1622 splx(s);
1623 }
1624
1625
1626 /*
1627 * uvm_swap_alloc: allocate space on swap
1628 *
1629 * => allocation is done "round robin" down the priority list, as we
1630 * allocate in a priority we "rotate" the circle queue.
1631 * => space can be freed with uvm_swap_free
1632 * => we return the page slot number in /dev/drum (0 == invalid slot)
1633 * => we lock uvm_swap_data_lock
1634 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
1635 */
1636 int
uvm_swap_alloc(int * nslots,bool lessok)1637 uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok)
1638 {
1639 struct swapdev *sdp;
1640 struct swappri *spp;
1641 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
1642
1643 /*
1644 * no swap devices configured yet? definite failure.
1645 */
1646 if (uvmexp.nswapdev < 1)
1647 return 0;
1648
1649 /*
1650 * XXXJAK: BEGIN HACK
1651 *
1652 * blist_alloc() in subr_blist.c will panic if we try to allocate
1653 * too many slots.
1654 */
1655 if (*nslots > BLIST_MAX_ALLOC) {
1656 if (__predict_false(lessok == false))
1657 return 0;
1658 *nslots = BLIST_MAX_ALLOC;
1659 }
1660 /* XXXJAK: END HACK */
1661
1662 /*
1663 * lock data lock, convert slots into blocks, and enter loop
1664 */
1665 mutex_enter(&uvm_swap_data_lock);
1666
1667 ReTry: /* XXXMRG */
1668 LIST_FOREACH(spp, &swap_priority, spi_swappri) {
1669 TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
1670 uint64_t result;
1671
1672 /* if it's not enabled, then we can't swap from it */
1673 if ((sdp->swd_flags & SWF_ENABLE) == 0)
1674 continue;
1675 if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
1676 continue;
1677 result = blist_alloc(sdp->swd_blist, *nslots);
1678 if (result == BLIST_NONE) {
1679 continue;
1680 }
1681 KASSERT(result < sdp->swd_drumsize);
1682
1683 /*
1684 * successful allocation! now rotate the tailq.
1685 */
1686 TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
1687 TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
1688 sdp->swd_npginuse += *nslots;
1689 uvmexp.swpginuse += *nslots;
1690 mutex_exit(&uvm_swap_data_lock);
1691 /* done! return drum slot number */
1692 UVMHIST_LOG(pdhist,
1693 "success! returning %jd slots starting at %jd",
1694 *nslots, result + sdp->swd_drumoffset, 0, 0);
1695 return (result + sdp->swd_drumoffset);
1696 }
1697 }
1698
1699 /* XXXMRG: BEGIN HACK */
1700 if (*nslots > 1 && lessok) {
1701 *nslots = 1;
1702 /* XXXMRG: ugh! blist should support this for us */
1703 goto ReTry;
1704 }
1705 /* XXXMRG: END HACK */
1706
1707 mutex_exit(&uvm_swap_data_lock);
1708 return 0;
1709 }
1710
1711 /*
1712 * uvm_swapisfull: return true if most of available swap is allocated
1713 * and in use. we don't count some small portion as it may be inaccessible
1714 * to us at any given moment, for example if there is lock contention or if
1715 * pages are busy.
1716 */
1717 bool
uvm_swapisfull(void)1718 uvm_swapisfull(void)
1719 {
1720 int swpgonly;
1721 bool rv;
1722
1723 if (uvmexp.swpages == 0) {
1724 return true;
1725 }
1726
1727 mutex_enter(&uvm_swap_data_lock);
1728 KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
1729 swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 /
1730 uvm_swapisfull_factor);
1731 rv = (swpgonly >= uvmexp.swpgavail);
1732 mutex_exit(&uvm_swap_data_lock);
1733
1734 return (rv);
1735 }
1736
1737 /*
1738 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
1739 *
1740 * => we lock uvm_swap_data_lock
1741 */
1742 void
uvm_swap_markbad(int startslot,int nslots)1743 uvm_swap_markbad(int startslot, int nslots)
1744 {
1745 struct swapdev *sdp;
1746 UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
1747
1748 mutex_enter(&uvm_swap_data_lock);
1749 sdp = swapdrum_getsdp(startslot);
1750 KASSERT(sdp != NULL);
1751
1752 /*
1753 * we just keep track of how many pages have been marked bad
1754 * in this device, to make everything add up in swap_off().
1755 * we assume here that the range of slots will all be within
1756 * one swap device.
1757 */
1758
1759 KASSERT(uvmexp.swpgonly >= nslots);
1760 atomic_add_int(&uvmexp.swpgonly, -nslots);
1761 sdp->swd_npgbad += nslots;
1762 UVMHIST_LOG(pdhist, "now %jd bad", sdp->swd_npgbad, 0,0,0);
1763 mutex_exit(&uvm_swap_data_lock);
1764 }
1765
1766 /*
1767 * uvm_swap_free: free swap slots
1768 *
1769 * => this can be all or part of an allocation made by uvm_swap_alloc
1770 * => we lock uvm_swap_data_lock
1771 */
1772 void
uvm_swap_free(int startslot,int nslots)1773 uvm_swap_free(int startslot, int nslots)
1774 {
1775 struct swapdev *sdp;
1776 UVMHIST_FUNC(__func__);
1777 UVMHIST_CALLARGS(pdhist, "freeing %jd slots starting at %jd", nslots,
1778 startslot, 0, 0);
1779
1780 /*
1781 * ignore attempts to free the "bad" slot.
1782 */
1783
1784 if (startslot == SWSLOT_BAD) {
1785 return;
1786 }
1787
1788 /*
1789 * convert drum slot offset back to sdp, free the blocks
1790 * in the extent, and return. must hold pri lock to do
1791 * lookup and access the extent.
1792 */
1793
1794 mutex_enter(&uvm_swap_data_lock);
1795 sdp = swapdrum_getsdp(startslot);
1796 KASSERT(uvmexp.nswapdev >= 1);
1797 KASSERT(sdp != NULL);
1798 KASSERT(sdp->swd_npginuse >= nslots);
1799 blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots);
1800 sdp->swd_npginuse -= nslots;
1801 uvmexp.swpginuse -= nslots;
1802 mutex_exit(&uvm_swap_data_lock);
1803 }
1804
1805 /*
1806 * uvm_swap_put: put any number of pages into a contig place on swap
1807 *
1808 * => can be sync or async
1809 */
1810
1811 int
uvm_swap_put(int swslot,struct vm_page ** ppsp,int npages,int flags)1812 uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
1813 {
1814 int error;
1815
1816 error = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
1817 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
1818 return error;
1819 }
1820
1821 /*
1822 * uvm_swap_get: get a single page from swap
1823 *
1824 * => usually a sync op (from fault)
1825 */
1826
1827 int
uvm_swap_get(struct vm_page * page,int swslot,int flags)1828 uvm_swap_get(struct vm_page *page, int swslot, int flags)
1829 {
1830 int error;
1831
1832 atomic_inc_uint(&uvmexp.nswget);
1833 KASSERT(flags & PGO_SYNCIO);
1834 if (swslot == SWSLOT_BAD) {
1835 return EIO;
1836 }
1837
1838 error = uvm_swap_io(&page, swslot, 1, B_READ |
1839 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
1840 if (error == 0) {
1841
1842 /*
1843 * this page is no longer only in swap.
1844 */
1845
1846 KASSERT(uvmexp.swpgonly > 0);
1847 atomic_dec_uint(&uvmexp.swpgonly);
1848 }
1849 return error;
1850 }
1851
1852 /*
1853 * uvm_swap_io: do an i/o operation to swap
1854 */
1855
1856 static int
uvm_swap_io(struct vm_page ** pps,int startslot,int npages,int flags)1857 uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
1858 {
1859 daddr_t startblk;
1860 struct buf *bp;
1861 vaddr_t kva;
1862 int error, mapinflags;
1863 bool write, async, swap_encrypt;
1864 UVMHIST_FUNC(__func__);
1865 UVMHIST_CALLARGS(pdhist, "<- called, startslot=%jd, npages=%jd, flags=%#jx",
1866 startslot, npages, flags, 0);
1867
1868 write = (flags & B_READ) == 0;
1869 async = (flags & B_ASYNC) != 0;
1870 swap_encrypt = atomic_load_relaxed(&uvm_swap_encrypt);
1871
1872 /*
1873 * allocate a buf for the i/o.
1874 */
1875
1876 KASSERT(curlwp != uvm.pagedaemon_lwp || write);
1877 KASSERT(curlwp != uvm.pagedaemon_lwp || async);
1878 bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp);
1879 if (bp == NULL) {
1880 uvm_aio_aiodone_pages(pps, npages, true, ENOMEM);
1881 return ENOMEM;
1882 }
1883
1884 /*
1885 * convert starting drum slot to block number
1886 */
1887
1888 startblk = btodb((uint64_t)startslot << PAGE_SHIFT);
1889
1890 /*
1891 * first, map the pages into the kernel.
1892 */
1893
1894 mapinflags = !write ?
1895 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ :
1896 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE;
1897 if (write && swap_encrypt) /* need to encrypt in-place */
1898 mapinflags |= UVMPAGER_MAPIN_READ;
1899 kva = uvm_pagermapin(pps, npages, mapinflags);
1900
1901 /*
1902 * encrypt writes in place if requested
1903 */
1904
1905 if (write) do {
1906 struct swapdev *sdp;
1907 int i;
1908
1909 /*
1910 * Get the swapdev so we can discriminate on the
1911 * encryption state. There may or may not be an
1912 * encryption key generated; we may or may not be asked
1913 * to encrypt swap.
1914 *
1915 * 1. NO KEY, NO ENCRYPTION: Nothing to do.
1916 *
1917 * 2. NO KEY, BUT ENCRYPTION: Generate a key, encrypt,
1918 * and mark the slots encrypted.
1919 *
1920 * 3. KEY, BUT NO ENCRYPTION: The slots may already be
1921 * marked encrypted from a past life. Mark them not
1922 * encrypted.
1923 *
1924 * 4. KEY, ENCRYPTION: Encrypt and mark the slots
1925 * encrypted.
1926 */
1927 mutex_enter(&uvm_swap_data_lock);
1928 sdp = swapdrum_getsdp(startslot);
1929 if (!sdp->swd_encinit) {
1930 if (!swap_encrypt) {
1931 mutex_exit(&uvm_swap_data_lock);
1932 break;
1933 }
1934 uvm_swap_genkey(sdp);
1935 }
1936 KASSERT(sdp->swd_encinit);
1937 mutex_exit(&uvm_swap_data_lock);
1938
1939 for (i = 0; i < npages; i++) {
1940 int s = startslot + i;
1941 KDASSERT(swapdrum_sdp_is(s, sdp));
1942 KASSERT(s >= sdp->swd_drumoffset);
1943 s -= sdp->swd_drumoffset;
1944 KASSERT(s < sdp->swd_drumsize);
1945
1946 if (swap_encrypt) {
1947 uvm_swap_encryptpage(sdp,
1948 (void *)(kva + (vsize_t)i*PAGE_SIZE), s);
1949 atomic_or_32(&sdp->swd_encmap[s/32],
1950 __BIT(s%32));
1951 } else {
1952 atomic_and_32(&sdp->swd_encmap[s/32],
1953 ~__BIT(s%32));
1954 }
1955 }
1956 } while (0);
1957
1958 /*
1959 * fill in the bp/sbp. we currently route our i/o through
1960 * /dev/drum's vnode [swapdev_vp].
1961 */
1962
1963 bp->b_cflags = BC_BUSY | BC_NOCACHE;
1964 bp->b_flags = (flags & (B_READ|B_ASYNC));
1965 bp->b_proc = &proc0; /* XXX */
1966 bp->b_vnbufs.le_next = NOLIST;
1967 bp->b_data = (void *)kva;
1968 bp->b_blkno = startblk;
1969 bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;
1970
1971 /*
1972 * bump v_numoutput (counter of number of active outputs).
1973 */
1974
1975 if (write) {
1976 mutex_enter(swapdev_vp->v_interlock);
1977 swapdev_vp->v_numoutput++;
1978 mutex_exit(swapdev_vp->v_interlock);
1979 }
1980
1981 /*
1982 * for async ops we must set up the iodone handler.
1983 */
1984
1985 if (async) {
1986 bp->b_iodone = uvm_aio_aiodone;
1987 UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
1988 if (curlwp == uvm.pagedaemon_lwp)
1989 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
1990 else
1991 BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
1992 } else {
1993 bp->b_iodone = NULL;
1994 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
1995 }
1996 UVMHIST_LOG(pdhist,
1997 "about to start io: data = %#jx blkno = %#jx, bcount = %jd",
1998 (uintptr_t)bp->b_data, bp->b_blkno, bp->b_bcount, 0);
1999
2000 /*
2001 * now we start the I/O, and if async, return.
2002 */
2003
2004 VOP_STRATEGY(swapdev_vp, bp);
2005 if (async) {
2006 /*
2007 * Reads are always synchronous; if this changes, we
2008 * need to add an asynchronous path for decryption.
2009 */
2010 KASSERT(write);
2011 return 0;
2012 }
2013
2014 /*
2015 * must be sync i/o. wait for it to finish
2016 */
2017
2018 error = biowait(bp);
2019 if (error)
2020 goto out;
2021
2022 /*
2023 * decrypt reads in place if needed
2024 */
2025
2026 if (!write) do {
2027 struct swapdev *sdp;
2028 bool encinit;
2029 int i;
2030
2031 /*
2032 * Get the sdp. Everything about it except the encinit
2033 * bit, saying whether the encryption key is
2034 * initialized or not, and the encrypted bit for each
2035 * page, is stable until all swap pages have been
2036 * released and the device is removed.
2037 */
2038 mutex_enter(&uvm_swap_data_lock);
2039 sdp = swapdrum_getsdp(startslot);
2040 encinit = sdp->swd_encinit;
2041 mutex_exit(&uvm_swap_data_lock);
2042
2043 if (!encinit)
2044 /*
2045 * If there's no encryption key, there's no way
2046 * any of these slots can be encrypted, so
2047 * nothing to do here.
2048 */
2049 break;
2050 for (i = 0; i < npages; i++) {
2051 int s = startslot + i;
2052 KDASSERT(swapdrum_sdp_is(s, sdp));
2053 KASSERT(s >= sdp->swd_drumoffset);
2054 s -= sdp->swd_drumoffset;
2055 KASSERT(s < sdp->swd_drumsize);
2056 if ((atomic_load_relaxed(&sdp->swd_encmap[s/32]) &
2057 __BIT(s%32)) == 0)
2058 continue;
2059 uvm_swap_decryptpage(sdp,
2060 (void *)(kva + (vsize_t)i*PAGE_SIZE), s);
2061 }
2062 } while (0);
2063 out:
2064 /*
2065 * kill the pager mapping
2066 */
2067
2068 uvm_pagermapout(kva, npages);
2069
2070 /*
2071 * now dispose of the buf and we're done.
2072 */
2073
2074 if (write) {
2075 mutex_enter(swapdev_vp->v_interlock);
2076 vwakeup(bp);
2077 mutex_exit(swapdev_vp->v_interlock);
2078 }
2079 putiobuf(bp);
2080 UVMHIST_LOG(pdhist, "<- done (sync) error=%jd", error, 0, 0, 0);
2081
2082 return (error);
2083 }
2084
2085 /*
2086 * uvm_swap_genkey(sdp)
2087 *
2088 * Generate a key for swap encryption.
2089 */
2090 static void
uvm_swap_genkey(struct swapdev * sdp)2091 uvm_swap_genkey(struct swapdev *sdp)
2092 {
2093 uint8_t key[32];
2094
2095 KASSERT(!sdp->swd_encinit);
2096
2097 cprng_strong(kern_cprng, key, sizeof key, 0);
2098 aes_setenckey256(&sdp->swd_enckey, key);
2099 aes_setdeckey256(&sdp->swd_deckey, key);
2100 explicit_memset(key, 0, sizeof key);
2101
2102 sdp->swd_encinit = true;
2103 }
2104
2105 /*
2106 * uvm_swap_encryptpage(sdp, kva, slot)
2107 *
2108 * Encrypt one page of data at kva for the specified slot number
2109 * in the swap device.
2110 */
2111 static void
uvm_swap_encryptpage(struct swapdev * sdp,void * kva,int slot)2112 uvm_swap_encryptpage(struct swapdev *sdp, void *kva, int slot)
2113 {
2114 uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16);
2115
2116 /* iv := AES_k(le32enc(slot) || 0^96) */
2117 le32enc(preiv, slot);
2118 aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS);
2119
2120 /* *kva := AES-CBC_k(iv, *kva) */
2121 aes_cbc_enc(&sdp->swd_enckey, kva, kva, PAGE_SIZE, iv,
2122 AES_256_NROUNDS);
2123
2124 explicit_memset(&iv, 0, sizeof iv);
2125 }
2126
2127 /*
2128 * uvm_swap_decryptpage(sdp, kva, slot)
2129 *
2130 * Decrypt one page of data at kva for the specified slot number
2131 * in the swap device.
2132 */
2133 static void
uvm_swap_decryptpage(struct swapdev * sdp,void * kva,int slot)2134 uvm_swap_decryptpage(struct swapdev *sdp, void *kva, int slot)
2135 {
2136 uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16);
2137
2138 /* iv := AES_k(le32enc(slot) || 0^96) */
2139 le32enc(preiv, slot);
2140 aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS);
2141
2142 /* *kva := AES-CBC^{-1}_k(iv, *kva) */
2143 aes_cbc_dec(&sdp->swd_deckey, kva, kva, PAGE_SIZE, iv,
2144 AES_256_NROUNDS);
2145
2146 explicit_memset(&iv, 0, sizeof iv);
2147 }
2148
/*
 * Register the vm.swap_encrypt sysctl knob: a read-write boolean
 * backed by uvm_swap_encrypt, consulted per-I/O in uvm_swap_io().
 */
SYSCTL_SETUP(sysctl_uvmswap_setup, "sysctl uvmswap setup")
{

	sysctl_createv(clog, 0, NULL, NULL,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "swap_encrypt",
	    SYSCTL_DESCR("Encrypt data when swapped out to disk"),
	    NULL, 0, &uvm_swap_encrypt, 0,
	    CTL_VM, CTL_CREATE, CTL_EOL);
}
2158