1 /* $NetBSD: uvm_swap.c,v 1.153 2010/11/19 06:44:47 dholland Exp $ */ 2 3 /* 4 * Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 21 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 23 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 
27 * 28 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp 29 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp 30 */ 31 32 #include <sys/cdefs.h> 33 __KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.153 2010/11/19 06:44:47 dholland Exp $"); 34 35 #include "opt_uvmhist.h" 36 #include "opt_compat_netbsd.h" 37 #include "opt_ddb.h" 38 39 #include <sys/param.h> 40 #include <sys/systm.h> 41 #include <sys/buf.h> 42 #include <sys/bufq.h> 43 #include <sys/conf.h> 44 #include <sys/proc.h> 45 #include <sys/namei.h> 46 #include <sys/disklabel.h> 47 #include <sys/errno.h> 48 #include <sys/kernel.h> 49 #include <sys/malloc.h> 50 #include <sys/vnode.h> 51 #include <sys/file.h> 52 #include <sys/vmem.h> 53 #include <sys/blist.h> 54 #include <sys/mount.h> 55 #include <sys/pool.h> 56 #include <sys/syscallargs.h> 57 #include <sys/swap.h> 58 #include <sys/kauth.h> 59 #include <sys/sysctl.h> 60 #include <sys/workqueue.h> 61 62 #include <uvm/uvm.h> 63 64 #include <miscfs/specfs/specdev.h> 65 66 /* 67 * uvm_swap.c: manage configuration and i/o to swap space. 68 */ 69 70 /* 71 * swap space is managed in the following way: 72 * 73 * each swap partition or file is described by a "swapdev" structure. 74 * each "swapdev" structure contains a "swapent" structure which contains 75 * information that is passed up to the user (via system calls). 76 * 77 * each swap partition is assigned a "priority" (int) which controls 78 * swap parition usage. 79 * 80 * the system maintains a global data structure describing all swap 81 * partitions/files. there is a sorted LIST of "swappri" structures 82 * which describe "swapdev"'s at that priority. this LIST is headed 83 * by the "swap_priority" global var. each "swappri" contains a 84 * CIRCLEQ of "swapdev" structures at that priority. 
85 * 86 * locking: 87 * - swap_syscall_lock (krwlock_t): this lock serializes the swapctl 88 * system call and prevents the swap priority list from changing 89 * while we are in the middle of a system call (e.g. SWAP_STATS). 90 * - uvm_swap_data_lock (kmutex_t): this lock protects all swap data 91 * structures including the priority list, the swapdev structures, 92 * and the swapmap arena. 93 * 94 * each swap device has the following info: 95 * - swap device in use (could be disabled, preventing future use) 96 * - swap enabled (allows new allocations on swap) 97 * - map info in /dev/drum 98 * - vnode pointer 99 * for swap files only: 100 * - block size 101 * - max byte count in buffer 102 * - buffer 103 * 104 * userland controls and configures swap with the swapctl(2) system call. 105 * the sys_swapctl performs the following operations: 106 * [1] SWAP_NSWAP: returns the number of swap devices currently configured 107 * [2] SWAP_STATS: given a pointer to an array of swapent structures 108 * (passed in via "arg") of a size passed in via "misc" ... we load 109 * the current swap config into the array. The actual work is done 110 * in the uvm_swap_stats(9) function. 111 * [3] SWAP_ON: given a pathname in arg (could be device or file) and a 112 * priority in "misc", start swapping on it. 
*  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */

/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 * swd_npages <= swd_drumsize	[swap_on() reserves swd_drumsize pages of
 *				 drum for the whole device, but only offers
 *				 swd_npages of them: the first page of a
 *				 block device and any preserved miniroot
 *				 are excluded]
 * swd_npginuse <= swd_npages	[presumably; the page allocator that
 *				 maintains swd_npginuse is not part of
 *				 this section -- TODO confirm]
 */
struct swapdev {
	dev_t			swd_dev;	/* device id (NODEV unless VBLK) */
	int			swd_flags;	/* flags:inuse/enable/fake */
	int			swd_priority;	/* our priority */
	int			swd_nblks;	/* blocks in this device */
	char			*swd_path;	/* saved pathname of device */
	int			swd_pathlen;	/* bytes allocated for swd_path,
						   including the NUL */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_npgbad;	/* #pages bad */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	blist_t			swd_blist;	/* blist for this swapdev */
	struct vnode		*swd_vp;	/* backing vnode */
	CIRCLEQ_ENTRY(swapdev)	swd_next;	/* priority circleq */

	/* swd_bsize and swd_maxactive are only set for regular files */
	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	struct bufq_state	*swd_tab;	/* buffer list */
	int			swd_active;	/* number of active buffers */
};

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'
 * (ascending: swaplist_insert() places a new entry before the first
 * entry whose priority is greater than or equal to its own).
 */
struct swappri {
	int			spi_priority;     /* priority */
	CIRCLEQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
	/* circleq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;      /* global list of pri's */
};

/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting.
*/
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;	/* presumably the swapdev the transfer
					   targets -- users are outside this
					   section, TODO confirm */
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define VX_BUSY		1
#define VX_DEAD		2
};

/*
 * vndbuf: a struct buf paired with a pointer to the vndxfer it is
 * associated with.
 */
struct vndbuf {
	struct buf	vb_buf;
	struct vndxfer	*vb_xfer;
};

/*
 * NetBSD 1.3 swapctl(SWAP_STATS, ...) swapent structure; uses 32 bit
 * dev_t and has no se_path[] member.
 */
struct swapent13 {
	int32_t	se13_dev;		/* device id */
	int	se13_flags;		/* flags */
	int	se13_nblks;		/* total blocks */
	int	se13_inuse;		/* blocks in use */
	int	se13_priority;		/* priority of this device */
};

/*
 * NetBSD 5.0 swapctl(SWAP_STATS, ...) swapent structure; uses 32 bit
 * dev_t.
 */
struct swapent50 {
	int32_t	se50_dev;		/* device id */
	int	se50_flags;		/* flags */
	int	se50_nblks;		/* total blocks */
	int	se50_inuse;		/* blocks in use */
	int	se50_priority;		/* priority of this device */
	char	se50_path[PATH_MAX+1];	/* path name */
};

/*
 * We keep a pool of vndbuf and vndxfer structures.
205 */ 206 static struct pool vndxfer_pool, vndbuf_pool; 207 208 /* 209 * local variables 210 */ 211 MALLOC_DEFINE(M_VMSWAP, "VM swap", "VM swap structures"); 212 static vmem_t *swapmap; /* controls the mapping of /dev/drum */ 213 214 /* list of all active swap devices [by priority] */ 215 LIST_HEAD(swap_priority, swappri); 216 static struct swap_priority swap_priority; 217 218 /* locks */ 219 static krwlock_t swap_syscall_lock; 220 221 /* workqueue and use counter for swap to regular files */ 222 static int sw_reg_count = 0; 223 static struct workqueue *sw_reg_workqueue; 224 225 /* tuneables */ 226 u_int uvm_swapisfull_factor = 99; 227 228 /* 229 * prototypes 230 */ 231 static struct swapdev *swapdrum_getsdp(int); 232 233 static struct swapdev *swaplist_find(struct vnode *, bool); 234 static void swaplist_insert(struct swapdev *, 235 struct swappri *, int); 236 static void swaplist_trim(void); 237 238 static int swap_on(struct lwp *, struct swapdev *); 239 static int swap_off(struct lwp *, struct swapdev *); 240 241 static void uvm_swap_stats_locked(int, struct swapent *, int, register_t *); 242 243 static void sw_reg_strategy(struct swapdev *, struct buf *, int); 244 static void sw_reg_biodone(struct buf *); 245 static void sw_reg_iodone(struct work *wk, void *dummy); 246 static void sw_reg_start(struct swapdev *); 247 248 static int uvm_swap_io(struct vm_page **, int, int, int); 249 250 /* 251 * uvm_swap_init: init the swap system data structures and locks 252 * 253 * => called at boot time from init_main.c after the filesystems 254 * are brought up (which happens after uvm_init()) 255 */ 256 void 257 uvm_swap_init(void) 258 { 259 UVMHIST_FUNC("uvm_swap_init"); 260 261 UVMHIST_CALLED(pdhist); 262 /* 263 * first, init the swap list, its counter, and its lock. 264 * then get a handle on the vnode for /dev/drum by using 265 * the its dev_t number ("swapdev", from MD conf.c). 
266 */ 267 268 LIST_INIT(&swap_priority); 269 uvmexp.nswapdev = 0; 270 rw_init(&swap_syscall_lock); 271 mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE); 272 273 if (bdevvp(swapdev, &swapdev_vp)) 274 panic("%s: can't get vnode for swap device", __func__); 275 if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY)) 276 panic("%s: can't lock swap device", __func__); 277 if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED)) 278 panic("%s: can't open swap device", __func__); 279 VOP_UNLOCK(swapdev_vp); 280 281 /* 282 * create swap block resource map to map /dev/drum. the range 283 * from 1 to INT_MAX allows 2 gigablocks of swap space. note 284 * that block 0 is reserved (used to indicate an allocation 285 * failure, or no allocation). 286 */ 287 swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0, 288 VM_NOSLEEP, IPL_NONE); 289 if (swapmap == 0) { 290 panic("%s: vmem_create failed", __func__); 291 } 292 293 pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx", 294 NULL, IPL_BIO); 295 pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd", 296 NULL, IPL_BIO); 297 298 UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0); 299 } 300 301 /* 302 * swaplist functions: functions that operate on the list of swap 303 * devices on the system. 304 */ 305 306 /* 307 * swaplist_insert: insert swap device "sdp" into the global list 308 * 309 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock 310 * => caller must provide a newly malloc'd swappri structure (we will 311 * FREE it if we don't need it... this it to prevent malloc blocking 312 * here while adding swap) 313 */ 314 static void 315 swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority) 316 { 317 struct swappri *spp, *pspp; 318 UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist); 319 320 /* 321 * find entry at or after which to insert the new device. 
322 */ 323 pspp = NULL; 324 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 325 if (priority <= spp->spi_priority) 326 break; 327 pspp = spp; 328 } 329 330 /* 331 * new priority? 332 */ 333 if (spp == NULL || spp->spi_priority != priority) { 334 spp = newspp; /* use newspp! */ 335 UVMHIST_LOG(pdhist, "created new swappri = %d", 336 priority, 0, 0, 0); 337 338 spp->spi_priority = priority; 339 CIRCLEQ_INIT(&spp->spi_swapdev); 340 341 if (pspp) 342 LIST_INSERT_AFTER(pspp, spp, spi_swappri); 343 else 344 LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri); 345 } else { 346 /* we don't need a new priority structure, free it */ 347 free(newspp, M_VMSWAP); 348 } 349 350 /* 351 * priority found (or created). now insert on the priority's 352 * circleq list and bump the total number of swapdevs. 353 */ 354 sdp->swd_priority = priority; 355 CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next); 356 uvmexp.nswapdev++; 357 } 358 359 /* 360 * swaplist_find: find and optionally remove a swap device from the 361 * global list. 362 * 363 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock 364 * => we return the swapdev we found (and removed) 365 */ 366 static struct swapdev * 367 swaplist_find(struct vnode *vp, bool remove) 368 { 369 struct swapdev *sdp; 370 struct swappri *spp; 371 372 /* 373 * search the lists for the requested vp 374 */ 375 376 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 377 CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 378 if (sdp->swd_vp == vp) { 379 if (remove) { 380 CIRCLEQ_REMOVE(&spp->spi_swapdev, 381 sdp, swd_next); 382 uvmexp.nswapdev--; 383 } 384 return(sdp); 385 } 386 } 387 } 388 return (NULL); 389 } 390 391 /* 392 * swaplist_trim: scan priority list for empty priority entries and kill 393 * them. 
394 * 395 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock 396 */ 397 static void 398 swaplist_trim(void) 399 { 400 struct swappri *spp, *nextspp; 401 402 for (spp = LIST_FIRST(&swap_priority); spp != NULL; spp = nextspp) { 403 nextspp = LIST_NEXT(spp, spi_swappri); 404 if (CIRCLEQ_FIRST(&spp->spi_swapdev) != 405 (void *)&spp->spi_swapdev) 406 continue; 407 LIST_REMOVE(spp, spi_swappri); 408 free(spp, M_VMSWAP); 409 } 410 } 411 412 /* 413 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back 414 * to the "swapdev" that maps that section of the drum. 415 * 416 * => each swapdev takes one big contig chunk of the drum 417 * => caller must hold uvm_swap_data_lock 418 */ 419 static struct swapdev * 420 swapdrum_getsdp(int pgno) 421 { 422 struct swapdev *sdp; 423 struct swappri *spp; 424 425 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 426 CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 427 if (sdp->swd_flags & SWF_FAKE) 428 continue; 429 if (pgno >= sdp->swd_drumoffset && 430 pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) { 431 return sdp; 432 } 433 } 434 } 435 return NULL; 436 } 437 438 439 /* 440 * sys_swapctl: main entry point for swapctl(2) system call 441 * [with two helper functions: swap_on and swap_off] 442 */ 443 int 444 sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval) 445 { 446 /* { 447 syscallarg(int) cmd; 448 syscallarg(void *) arg; 449 syscallarg(int) misc; 450 } */ 451 struct vnode *vp; 452 struct nameidata nd; 453 struct swappri *spp; 454 struct swapdev *sdp; 455 struct swapent *sep; 456 #define SWAP_PATH_MAX (PATH_MAX + 1) 457 char *userpath; 458 size_t len; 459 int error, misc; 460 int priority; 461 UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist); 462 463 misc = SCARG(uap, misc); 464 465 /* 466 * ensure serialized syscall access by grabbing the swap_syscall_lock 467 */ 468 rw_enter(&swap_syscall_lock, RW_WRITER); 469 470 userpath = malloc(SWAP_PATH_MAX, M_TEMP, M_WAITOK); 
471 /* 472 * we handle the non-priv NSWAP and STATS request first. 473 * 474 * SWAP_NSWAP: return number of config'd swap devices 475 * [can also be obtained with uvmexp sysctl] 476 */ 477 if (SCARG(uap, cmd) == SWAP_NSWAP) { 478 UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d", uvmexp.nswapdev, 479 0, 0, 0); 480 *retval = uvmexp.nswapdev; 481 error = 0; 482 goto out; 483 } 484 485 /* 486 * SWAP_STATS: get stats on current # of configured swap devs 487 * 488 * note that the swap_priority list can't change as long 489 * as we are holding the swap_syscall_lock. we don't want 490 * to grab the uvm_swap_data_lock because we may fault&sleep during 491 * copyout() and we don't want to be holding that lock then! 492 */ 493 if (SCARG(uap, cmd) == SWAP_STATS 494 #if defined(COMPAT_50) 495 || SCARG(uap, cmd) == SWAP_STATS50 496 #endif 497 #if defined(COMPAT_13) 498 || SCARG(uap, cmd) == SWAP_STATS13 499 #endif 500 ) { 501 if ((size_t)misc > (size_t)uvmexp.nswapdev) 502 misc = uvmexp.nswapdev; 503 #if defined(COMPAT_13) 504 if (SCARG(uap, cmd) == SWAP_STATS13) 505 len = sizeof(struct swapent13) * misc; 506 else 507 #endif 508 #if defined(COMPAT_50) 509 if (SCARG(uap, cmd) == SWAP_STATS50) 510 len = sizeof(struct swapent50) * misc; 511 else 512 #endif 513 len = sizeof(struct swapent) * misc; 514 sep = (struct swapent *)malloc(len, M_TEMP, M_WAITOK); 515 516 uvm_swap_stats_locked(SCARG(uap, cmd), sep, misc, retval); 517 error = copyout(sep, SCARG(uap, arg), len); 518 519 free(sep, M_TEMP); 520 UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0); 521 goto out; 522 } 523 if (SCARG(uap, cmd) == SWAP_GETDUMPDEV) { 524 dev_t *devp = (dev_t *)SCARG(uap, arg); 525 526 error = copyout(&dumpdev, devp, sizeof(dumpdev)); 527 goto out; 528 } 529 530 /* 531 * all other requests require superuser privs. verify. 
532 */ 533 if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL, 534 0, NULL, NULL, NULL))) 535 goto out; 536 537 if (SCARG(uap, cmd) == SWAP_DUMPOFF) { 538 /* drop the current dump device */ 539 dumpdev = NODEV; 540 dumpcdev = NODEV; 541 cpu_dumpconf(); 542 goto out; 543 } 544 545 /* 546 * at this point we expect a path name in arg. we will 547 * use namei() to gain a vnode reference (vref), and lock 548 * the vnode (VOP_LOCK). 549 * 550 * XXX: a NULL arg means use the root vnode pointer (e.g. for 551 * miniroot) 552 */ 553 if (SCARG(uap, arg) == NULL) { 554 vp = rootvp; /* miniroot */ 555 vref(vp); 556 if (vn_lock(vp, LK_EXCLUSIVE)) { 557 vrele(vp); 558 error = EBUSY; 559 goto out; 560 } 561 if (SCARG(uap, cmd) == SWAP_ON && 562 copystr("miniroot", userpath, SWAP_PATH_MAX, &len)) 563 panic("swapctl: miniroot copy failed"); 564 } else { 565 struct pathbuf *pb; 566 567 /* 568 * This used to allow copying in one extra byte 569 * (SWAP_PATH_MAX instead of PATH_MAX) for SWAP_ON. 570 * This was completely pointless because if anyone 571 * used that extra byte namei would fail with 572 * ENAMETOOLONG anyway, so I've removed the excess 573 * logic. 
- dholland 20100215 574 */ 575 576 error = pathbuf_copyin(SCARG(uap, arg), &pb); 577 if (error) { 578 goto out; 579 } 580 if (SCARG(uap, cmd) == SWAP_ON) { 581 /* get a copy of the string */ 582 pathbuf_copystring(pb, userpath, SWAP_PATH_MAX); 583 len = strlen(userpath) + 1; 584 } 585 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb); 586 if ((error = namei(&nd))) { 587 pathbuf_destroy(pb); 588 goto out; 589 } 590 vp = nd.ni_vp; 591 pathbuf_destroy(pb); 592 } 593 /* note: "vp" is referenced and locked */ 594 595 error = 0; /* assume no error */ 596 switch(SCARG(uap, cmd)) { 597 598 case SWAP_DUMPDEV: 599 if (vp->v_type != VBLK) { 600 error = ENOTBLK; 601 break; 602 } 603 if (bdevsw_lookup(vp->v_rdev)) { 604 dumpdev = vp->v_rdev; 605 dumpcdev = devsw_blk2chr(dumpdev); 606 } else 607 dumpdev = NODEV; 608 cpu_dumpconf(); 609 break; 610 611 case SWAP_CTL: 612 /* 613 * get new priority, remove old entry (if any) and then 614 * reinsert it in the correct place. finally, prune out 615 * any empty priority structures. 616 */ 617 priority = SCARG(uap, misc); 618 spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK); 619 mutex_enter(&uvm_swap_data_lock); 620 if ((sdp = swaplist_find(vp, true)) == NULL) { 621 error = ENOENT; 622 } else { 623 swaplist_insert(sdp, spp, priority); 624 swaplist_trim(); 625 } 626 mutex_exit(&uvm_swap_data_lock); 627 if (error) 628 free(spp, M_VMSWAP); 629 break; 630 631 case SWAP_ON: 632 633 /* 634 * check for duplicates. if none found, then insert a 635 * dummy entry on the list to prevent someone else from 636 * trying to enable this device while we are working on 637 * it. 638 */ 639 640 priority = SCARG(uap, misc); 641 sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK); 642 spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK); 643 memset(sdp, 0, sizeof(*sdp)); 644 sdp->swd_flags = SWF_FAKE; 645 sdp->swd_vp = vp; 646 sdp->swd_dev = (vp->v_type == VBLK) ? 
vp->v_rdev : NODEV; 647 bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK); 648 mutex_enter(&uvm_swap_data_lock); 649 if (swaplist_find(vp, false) != NULL) { 650 error = EBUSY; 651 mutex_exit(&uvm_swap_data_lock); 652 bufq_free(sdp->swd_tab); 653 free(sdp, M_VMSWAP); 654 free(spp, M_VMSWAP); 655 break; 656 } 657 swaplist_insert(sdp, spp, priority); 658 mutex_exit(&uvm_swap_data_lock); 659 660 sdp->swd_pathlen = len; 661 sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK); 662 if (copystr(userpath, sdp->swd_path, sdp->swd_pathlen, 0) != 0) 663 panic("swapctl: copystr"); 664 665 /* 666 * we've now got a FAKE placeholder in the swap list. 667 * now attempt to enable swap on it. if we fail, undo 668 * what we've done and kill the fake entry we just inserted. 669 * if swap_on is a success, it will clear the SWF_FAKE flag 670 */ 671 672 if ((error = swap_on(l, sdp)) != 0) { 673 mutex_enter(&uvm_swap_data_lock); 674 (void) swaplist_find(vp, true); /* kill fake entry */ 675 swaplist_trim(); 676 mutex_exit(&uvm_swap_data_lock); 677 bufq_free(sdp->swd_tab); 678 free(sdp->swd_path, M_VMSWAP); 679 free(sdp, M_VMSWAP); 680 break; 681 } 682 break; 683 684 case SWAP_OFF: 685 mutex_enter(&uvm_swap_data_lock); 686 if ((sdp = swaplist_find(vp, false)) == NULL) { 687 mutex_exit(&uvm_swap_data_lock); 688 error = ENXIO; 689 break; 690 } 691 692 /* 693 * If a device isn't in use or enabled, we 694 * can't stop swapping from it (again). 695 */ 696 if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) { 697 mutex_exit(&uvm_swap_data_lock); 698 error = EBUSY; 699 break; 700 } 701 702 /* 703 * do the real work. 704 */ 705 error = swap_off(l, sdp); 706 break; 707 708 default: 709 error = EINVAL; 710 } 711 712 /* 713 * done! release the ref gained by namei() and unlock. 714 */ 715 vput(vp); 716 717 out: 718 free(userpath, M_TEMP); 719 rw_exit(&swap_syscall_lock); 720 721 UVMHIST_LOG(pdhist, "<- done! 
error=%d", error, 0, 0, 0); 722 return (error); 723 } 724 725 /* 726 * swap_stats: implements swapctl(SWAP_STATS). The function is kept 727 * away from sys_swapctl() in order to allow COMPAT_* swapctl() 728 * emulation to use it directly without going through sys_swapctl(). 729 * The problem with using sys_swapctl() there is that it involves 730 * copying the swapent array to the stackgap, and this array's size 731 * is not known at build time. Hence it would not be possible to 732 * ensure it would fit in the stackgap in any case. 733 */ 734 void 735 uvm_swap_stats(int cmd, struct swapent *sep, int sec, register_t *retval) 736 { 737 738 rw_enter(&swap_syscall_lock, RW_READER); 739 uvm_swap_stats_locked(cmd, sep, sec, retval); 740 rw_exit(&swap_syscall_lock); 741 } 742 743 static void 744 uvm_swap_stats_locked(int cmd, struct swapent *sep, int sec, register_t *retval) 745 { 746 struct swappri *spp; 747 struct swapdev *sdp; 748 int count = 0; 749 750 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 751 for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev); 752 sdp != (void *)&spp->spi_swapdev && sec-- > 0; 753 sdp = CIRCLEQ_NEXT(sdp, swd_next)) { 754 int inuse; 755 756 /* 757 * backwards compatibility for system call. 758 * For NetBSD 1.3 and 5.0, we have to use 759 * the 32 bit dev_t. For 5.0 and -current 760 * we have to add the path. 
761 */ 762 inuse = btodb((uint64_t)sdp->swd_npginuse << 763 PAGE_SHIFT); 764 765 #if defined(COMPAT_13) || defined(COMPAT_50) 766 if (cmd == SWAP_STATS) { 767 #endif 768 sep->se_dev = sdp->swd_dev; 769 sep->se_flags = sdp->swd_flags; 770 sep->se_nblks = sdp->swd_nblks; 771 sep->se_inuse = inuse; 772 sep->se_priority = sdp->swd_priority; 773 memcpy(&sep->se_path, sdp->swd_path, 774 sizeof sep->se_path); 775 sep++; 776 #if defined(COMPAT_13) 777 } else if (cmd == SWAP_STATS13) { 778 struct swapent13 *sep13 = 779 (struct swapent13 *)sep; 780 781 sep13->se13_dev = sdp->swd_dev; 782 sep13->se13_flags = sdp->swd_flags; 783 sep13->se13_nblks = sdp->swd_nblks; 784 sep13->se13_inuse = inuse; 785 sep13->se13_priority = sdp->swd_priority; 786 sep = (struct swapent *)(sep13 + 1); 787 #endif 788 #if defined(COMPAT_50) 789 } else if (cmd == SWAP_STATS50) { 790 struct swapent50 *sep50 = 791 (struct swapent50 *)sep; 792 793 sep50->se50_dev = sdp->swd_dev; 794 sep50->se50_flags = sdp->swd_flags; 795 sep50->se50_nblks = sdp->swd_nblks; 796 sep50->se50_inuse = inuse; 797 sep50->se50_priority = sdp->swd_priority; 798 memcpy(&sep50->se50_path, sdp->swd_path, 799 sizeof sep50->se50_path); 800 sep = (struct swapent *)(sep50 + 1); 801 #endif 802 #if defined(COMPAT_13) || defined(COMPAT_50) 803 } 804 #endif 805 count++; 806 } 807 } 808 809 *retval = count; 810 return; 811 } 812 813 /* 814 * swap_on: attempt to enable a swapdev for swapping. note that the 815 * swapdev is already on the global list, but disabled (marked 816 * SWF_FAKE). 817 * 818 * => we avoid the start of the disk (to protect disk labels) 819 * => we also avoid the miniroot, if we are swapping to root. 820 * => caller should leave uvm_swap_data_lock unlocked, we may lock it 821 * if needed. 
822 */ 823 static int 824 swap_on(struct lwp *l, struct swapdev *sdp) 825 { 826 struct vnode *vp; 827 int error, npages, nblocks, size; 828 long addr; 829 u_long result; 830 struct vattr va; 831 const struct bdevsw *bdev; 832 dev_t dev; 833 UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist); 834 835 /* 836 * we want to enable swapping on sdp. the swd_vp contains 837 * the vnode we want (locked and ref'd), and the swd_dev 838 * contains the dev_t of the file, if it a block device. 839 */ 840 841 vp = sdp->swd_vp; 842 dev = sdp->swd_dev; 843 844 /* 845 * open the swap file (mostly useful for block device files to 846 * let device driver know what is up). 847 * 848 * we skip the open/close for root on swap because the root 849 * has already been opened when root was mounted (mountroot). 850 */ 851 if (vp != rootvp) { 852 if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred))) 853 return (error); 854 } 855 856 /* XXX this only works for block devices */ 857 UVMHIST_LOG(pdhist, " dev=%d, major(dev)=%d", dev, major(dev), 0,0); 858 859 /* 860 * we now need to determine the size of the swap area. for 861 * block specials we can call the d_psize function. 862 * for normal files, we must stat [get attrs]. 863 * 864 * we put the result in nblks. 865 * for normal files, we also want the filesystem block size 866 * (which we get with statfs). 867 */ 868 switch (vp->v_type) { 869 case VBLK: 870 bdev = bdevsw_lookup(dev); 871 if (bdev == NULL || bdev->d_psize == NULL || 872 (nblocks = (*bdev->d_psize)(dev)) == -1) { 873 error = ENXIO; 874 goto bad; 875 } 876 break; 877 878 case VREG: 879 if ((error = VOP_GETATTR(vp, &va, l->l_cred))) 880 goto bad; 881 nblocks = (int)btodb(va.va_size); 882 sdp->swd_bsize = 1 << vp->v_mount->mnt_fs_bshift; 883 /* 884 * limit the max # of outstanding I/O requests we issue 885 * at any one time. take it easy on NFS servers. 
886 */ 887 if (vp->v_tag == VT_NFS) 888 sdp->swd_maxactive = 2; /* XXX */ 889 else 890 sdp->swd_maxactive = 8; /* XXX */ 891 break; 892 893 default: 894 error = ENXIO; 895 goto bad; 896 } 897 898 /* 899 * save nblocks in a safe place and convert to pages. 900 */ 901 902 sdp->swd_nblks = nblocks; 903 npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT; 904 905 /* 906 * for block special files, we want to make sure that leave 907 * the disklabel and bootblocks alone, so we arrange to skip 908 * over them (arbitrarily choosing to skip PAGE_SIZE bytes). 909 * note that because of this the "size" can be less than the 910 * actual number of blocks on the device. 911 */ 912 if (vp->v_type == VBLK) { 913 /* we use pages 1 to (size - 1) [inclusive] */ 914 size = npages - 1; 915 addr = 1; 916 } else { 917 /* we use pages 0 to (size - 1) [inclusive] */ 918 size = npages; 919 addr = 0; 920 } 921 922 /* 923 * make sure we have enough blocks for a reasonable sized swap 924 * area. we want at least one page. 925 */ 926 927 if (size < 1) { 928 UVMHIST_LOG(pdhist, " size <= 1!!", 0, 0, 0, 0); 929 error = EINVAL; 930 goto bad; 931 } 932 933 UVMHIST_LOG(pdhist, " dev=%x: size=%d addr=%ld\n", dev, size, addr, 0); 934 935 /* 936 * now we need to allocate an extent to manage this swap device 937 */ 938 939 sdp->swd_blist = blist_create(npages); 940 /* mark all expect the `saved' region free. */ 941 blist_free(sdp->swd_blist, addr, size); 942 943 /* 944 * if the vnode we are swapping to is the root vnode 945 * (i.e. we are swapping to the miniroot) then we want 946 * to make sure we don't overwrite it. do a statfs to 947 * find its size and skip over it. 
948 */ 949 if (vp == rootvp) { 950 struct mount *mp; 951 struct statvfs *sp; 952 int rootblocks, rootpages; 953 954 mp = rootvnode->v_mount; 955 sp = &mp->mnt_stat; 956 rootblocks = sp->f_blocks * btodb(sp->f_frsize); 957 /* 958 * XXX: sp->f_blocks isn't the total number of 959 * blocks in the filesystem, it's the number of 960 * data blocks. so, our rootblocks almost 961 * definitely underestimates the total size 962 * of the filesystem - how badly depends on the 963 * details of the filesystem type. there isn't 964 * an obvious way to deal with this cleanly 965 * and perfectly, so for now we just pad our 966 * rootblocks estimate with an extra 5 percent. 967 */ 968 rootblocks += (rootblocks >> 5) + 969 (rootblocks >> 6) + 970 (rootblocks >> 7); 971 rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT; 972 if (rootpages > size) 973 panic("swap_on: miniroot larger than swap?"); 974 975 if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) { 976 panic("swap_on: unable to preserve miniroot"); 977 } 978 979 size -= rootpages; 980 printf("Preserved %d pages of miniroot ", rootpages); 981 printf("leaving %d pages of swap\n", size); 982 } 983 984 /* 985 * add a ref to vp to reflect usage as a swap device. 986 */ 987 vref(vp); 988 989 /* 990 * now add the new swapdev to the drum and enable. 991 */ 992 result = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP); 993 if (result == 0) 994 panic("swapdrum_add"); 995 /* 996 * If this is the first regular swap create the workqueue. 997 * => Protected by swap_syscall_lock. 
998 */ 999 if (vp->v_type != VBLK) { 1000 if (sw_reg_count++ == 0) { 1001 KASSERT(sw_reg_workqueue == NULL); 1002 if (workqueue_create(&sw_reg_workqueue, "swapiod", 1003 sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0) 1004 panic("%s: workqueue_create failed", __func__); 1005 } 1006 } 1007 1008 sdp->swd_drumoffset = (int)result; 1009 sdp->swd_drumsize = npages; 1010 sdp->swd_npages = size; 1011 mutex_enter(&uvm_swap_data_lock); 1012 sdp->swd_flags &= ~SWF_FAKE; /* going live */ 1013 sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE); 1014 uvmexp.swpages += size; 1015 uvmexp.swpgavail += size; 1016 mutex_exit(&uvm_swap_data_lock); 1017 return (0); 1018 1019 /* 1020 * failure: clean up and return error. 1021 */ 1022 1023 bad: 1024 if (sdp->swd_blist) { 1025 blist_destroy(sdp->swd_blist); 1026 } 1027 if (vp != rootvp) { 1028 (void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred); 1029 } 1030 return (error); 1031 } 1032 1033 /* 1034 * swap_off: stop swapping on swapdev 1035 * 1036 * => swap data should be locked, we will unlock. 1037 */ 1038 static int 1039 swap_off(struct lwp *l, struct swapdev *sdp) 1040 { 1041 int npages = sdp->swd_npages; 1042 int error = 0; 1043 1044 UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist); 1045 UVMHIST_LOG(pdhist, " dev=%x, npages=%d", sdp->swd_dev,npages,0,0); 1046 1047 /* disable the swap area being removed */ 1048 sdp->swd_flags &= ~SWF_ENABLE; 1049 uvmexp.swpgavail -= npages; 1050 mutex_exit(&uvm_swap_data_lock); 1051 1052 /* 1053 * the idea is to find all the pages that are paged out to this 1054 * device, and page them all in. in uvm, swap-backed pageable 1055 * memory can take two forms: aobjs and anons. call the 1056 * swapoff hook for each subsystem to bring in pages. 
1057 */ 1058 1059 if (uao_swap_off(sdp->swd_drumoffset, 1060 sdp->swd_drumoffset + sdp->swd_drumsize) || 1061 amap_swap_off(sdp->swd_drumoffset, 1062 sdp->swd_drumoffset + sdp->swd_drumsize)) { 1063 error = ENOMEM; 1064 } else if (sdp->swd_npginuse > sdp->swd_npgbad) { 1065 error = EBUSY; 1066 } 1067 1068 if (error) { 1069 mutex_enter(&uvm_swap_data_lock); 1070 sdp->swd_flags |= SWF_ENABLE; 1071 uvmexp.swpgavail += npages; 1072 mutex_exit(&uvm_swap_data_lock); 1073 1074 return error; 1075 } 1076 1077 /* 1078 * If this is the last regular swap destroy the workqueue. 1079 * => Protected by swap_syscall_lock. 1080 */ 1081 if (sdp->swd_vp->v_type != VBLK) { 1082 KASSERT(sw_reg_count > 0); 1083 KASSERT(sw_reg_workqueue != NULL); 1084 if (--sw_reg_count == 0) { 1085 workqueue_destroy(sw_reg_workqueue); 1086 sw_reg_workqueue = NULL; 1087 } 1088 } 1089 1090 /* 1091 * done with the vnode. 1092 * drop our ref on the vnode before calling VOP_CLOSE() 1093 * so that spec_close() can tell if this is the last close. 1094 */ 1095 vrele(sdp->swd_vp); 1096 if (sdp->swd_vp != rootvp) { 1097 (void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred); 1098 } 1099 1100 mutex_enter(&uvm_swap_data_lock); 1101 uvmexp.swpages -= npages; 1102 uvmexp.swpginuse -= sdp->swd_npgbad; 1103 1104 if (swaplist_find(sdp->swd_vp, true) == NULL) 1105 panic("%s: swapdev not in list", __func__); 1106 swaplist_trim(); 1107 mutex_exit(&uvm_swap_data_lock); 1108 1109 /* 1110 * free all resources! 1111 */ 1112 vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize); 1113 blist_destroy(sdp->swd_blist); 1114 bufq_free(sdp->swd_tab); 1115 free(sdp, M_VMSWAP); 1116 return (0); 1117 } 1118 1119 /* 1120 * /dev/drum interface and i/o functions 1121 */ 1122 1123 /* 1124 * swstrategy: perform I/O on the drum 1125 * 1126 * => we must map the i/o request from the drum to the correct swapdev. 
1127 */ 1128 static void 1129 swstrategy(struct buf *bp) 1130 { 1131 struct swapdev *sdp; 1132 struct vnode *vp; 1133 int pageno, bn; 1134 UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist); 1135 1136 /* 1137 * convert block number to swapdev. note that swapdev can't 1138 * be yanked out from under us because we are holding resources 1139 * in it (i.e. the blocks we are doing I/O on). 1140 */ 1141 pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT; 1142 mutex_enter(&uvm_swap_data_lock); 1143 sdp = swapdrum_getsdp(pageno); 1144 mutex_exit(&uvm_swap_data_lock); 1145 if (sdp == NULL) { 1146 bp->b_error = EINVAL; 1147 biodone(bp); 1148 UVMHIST_LOG(pdhist, " failed to get swap device", 0, 0, 0, 0); 1149 return; 1150 } 1151 1152 /* 1153 * convert drum page number to block number on this swapdev. 1154 */ 1155 1156 pageno -= sdp->swd_drumoffset; /* page # on swapdev */ 1157 bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */ 1158 1159 UVMHIST_LOG(pdhist, " %s: mapoff=%x bn=%x bcount=%ld", 1160 ((bp->b_flags & B_READ) == 0) ? "write" : "read", 1161 sdp->swd_drumoffset, bn, bp->b_bcount); 1162 1163 /* 1164 * for block devices we finish up here. 1165 * for regular files we have to do more work which we delegate 1166 * to sw_reg_strategy(). 1167 */ 1168 1169 vp = sdp->swd_vp; /* swapdev vnode pointer */ 1170 switch (vp->v_type) { 1171 default: 1172 panic("%s: vnode type 0x%x", __func__, vp->v_type); 1173 1174 case VBLK: 1175 1176 /* 1177 * must convert "bp" from an I/O on /dev/drum to an I/O 1178 * on the swapdev (sdp). 1179 */ 1180 bp->b_blkno = bn; /* swapdev block number */ 1181 bp->b_dev = sdp->swd_dev; /* swapdev dev_t */ 1182 1183 /* 1184 * if we are doing a write, we have to redirect the i/o on 1185 * drum's v_numoutput counter to the swapdevs. 
1186 */ 1187 if ((bp->b_flags & B_READ) == 0) { 1188 mutex_enter(bp->b_objlock); 1189 vwakeup(bp); /* kills one 'v_numoutput' on drum */ 1190 mutex_exit(bp->b_objlock); 1191 mutex_enter(&vp->v_interlock); 1192 vp->v_numoutput++; /* put it on swapdev */ 1193 mutex_exit(&vp->v_interlock); 1194 } 1195 1196 /* 1197 * finally plug in swapdev vnode and start I/O 1198 */ 1199 bp->b_vp = vp; 1200 bp->b_objlock = &vp->v_interlock; 1201 VOP_STRATEGY(vp, bp); 1202 return; 1203 1204 case VREG: 1205 /* 1206 * delegate to sw_reg_strategy function. 1207 */ 1208 sw_reg_strategy(sdp, bp, bn); 1209 return; 1210 } 1211 /* NOTREACHED */ 1212 } 1213 1214 /* 1215 * swread: the read function for the drum (just a call to physio) 1216 */ 1217 /*ARGSUSED*/ 1218 static int 1219 swread(dev_t dev, struct uio *uio, int ioflag) 1220 { 1221 UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist); 1222 1223 UVMHIST_LOG(pdhist, " dev=%x offset=%qx", dev, uio->uio_offset, 0, 0); 1224 return (physio(swstrategy, NULL, dev, B_READ, minphys, uio)); 1225 } 1226 1227 /* 1228 * swwrite: the write function for the drum (just a call to physio) 1229 */ 1230 /*ARGSUSED*/ 1231 static int 1232 swwrite(dev_t dev, struct uio *uio, int ioflag) 1233 { 1234 UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist); 1235 1236 UVMHIST_LOG(pdhist, " dev=%x offset=%qx", dev, uio->uio_offset, 0, 0); 1237 return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio)); 1238 } 1239 1240 const struct bdevsw swap_bdevsw = { 1241 nullopen, nullclose, swstrategy, noioctl, nodump, nosize, D_OTHER, 1242 }; 1243 1244 const struct cdevsw swap_cdevsw = { 1245 nullopen, nullclose, swread, swwrite, noioctl, 1246 nostop, notty, nopoll, nommap, nokqfilter, D_OTHER, 1247 }; 1248 1249 /* 1250 * sw_reg_strategy: handle swap i/o to regular files 1251 */ 1252 static void 1253 sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn) 1254 { 1255 struct vnode *vp; 1256 struct vndxfer *vnx; 1257 daddr_t nbn; 1258 char *addr; 1259 off_t byteoff; 1260 int s, off, 
nra, error, sz, resid; 1261 UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist); 1262 1263 /* 1264 * allocate a vndxfer head for this transfer and point it to 1265 * our buffer. 1266 */ 1267 vnx = pool_get(&vndxfer_pool, PR_WAITOK); 1268 vnx->vx_flags = VX_BUSY; 1269 vnx->vx_error = 0; 1270 vnx->vx_pending = 0; 1271 vnx->vx_bp = bp; 1272 vnx->vx_sdp = sdp; 1273 1274 /* 1275 * setup for main loop where we read filesystem blocks into 1276 * our buffer. 1277 */ 1278 error = 0; 1279 bp->b_resid = bp->b_bcount; /* nothing transfered yet! */ 1280 addr = bp->b_data; /* current position in buffer */ 1281 byteoff = dbtob((uint64_t)bn); 1282 1283 for (resid = bp->b_resid; resid; resid -= sz) { 1284 struct vndbuf *nbp; 1285 1286 /* 1287 * translate byteoffset into block number. return values: 1288 * vp = vnode of underlying device 1289 * nbn = new block number (on underlying vnode dev) 1290 * nra = num blocks we can read-ahead (excludes requested 1291 * block) 1292 */ 1293 nra = 0; 1294 error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize, 1295 &vp, &nbn, &nra); 1296 1297 if (error == 0 && nbn == (daddr_t)-1) { 1298 /* 1299 * this used to just set error, but that doesn't 1300 * do the right thing. Instead, it causes random 1301 * memory errors. The panic() should remain until 1302 * this condition doesn't destabilize the system. 1303 */ 1304 #if 1 1305 panic("%s: swap to sparse file", __func__); 1306 #else 1307 error = EIO; /* failure */ 1308 #endif 1309 } 1310 1311 /* 1312 * punt if there was an error or a hole in the file. 1313 * we must wait for any i/o ops we have already started 1314 * to finish before returning. 1315 * 1316 * XXX we could deal with holes here but it would be 1317 * a hassle (in the write case). 1318 */ 1319 if (error) { 1320 s = splbio(); 1321 vnx->vx_error = error; /* pass error up */ 1322 goto out; 1323 } 1324 1325 /* 1326 * compute the size ("sz") of this transfer (in bytes). 
1327 */ 1328 off = byteoff % sdp->swd_bsize; 1329 sz = (1 + nra) * sdp->swd_bsize - off; 1330 if (sz > resid) 1331 sz = resid; 1332 1333 UVMHIST_LOG(pdhist, "sw_reg_strategy: " 1334 "vp %p/%p offset 0x%x/0x%x", 1335 sdp->swd_vp, vp, byteoff, nbn); 1336 1337 /* 1338 * now get a buf structure. note that the vb_buf is 1339 * at the front of the nbp structure so that you can 1340 * cast pointers between the two structure easily. 1341 */ 1342 nbp = pool_get(&vndbuf_pool, PR_WAITOK); 1343 buf_init(&nbp->vb_buf); 1344 nbp->vb_buf.b_flags = bp->b_flags; 1345 nbp->vb_buf.b_cflags = bp->b_cflags; 1346 nbp->vb_buf.b_oflags = bp->b_oflags; 1347 nbp->vb_buf.b_bcount = sz; 1348 nbp->vb_buf.b_bufsize = sz; 1349 nbp->vb_buf.b_error = 0; 1350 nbp->vb_buf.b_data = addr; 1351 nbp->vb_buf.b_lblkno = 0; 1352 nbp->vb_buf.b_blkno = nbn + btodb(off); 1353 nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno; 1354 nbp->vb_buf.b_iodone = sw_reg_biodone; 1355 nbp->vb_buf.b_vp = vp; 1356 nbp->vb_buf.b_objlock = &vp->v_interlock; 1357 if (vp->v_type == VBLK) { 1358 nbp->vb_buf.b_dev = vp->v_rdev; 1359 } 1360 1361 nbp->vb_xfer = vnx; /* patch it back in to vnx */ 1362 1363 /* 1364 * Just sort by block number 1365 */ 1366 s = splbio(); 1367 if (vnx->vx_error != 0) { 1368 buf_destroy(&nbp->vb_buf); 1369 pool_put(&vndbuf_pool, nbp); 1370 goto out; 1371 } 1372 vnx->vx_pending++; 1373 1374 /* sort it in and start I/O if we are not over our limit */ 1375 /* XXXAD locking */ 1376 bufq_put(sdp->swd_tab, &nbp->vb_buf); 1377 sw_reg_start(sdp); 1378 splx(s); 1379 1380 /* 1381 * advance to the next I/O 1382 */ 1383 byteoff += sz; 1384 addr += sz; 1385 } 1386 1387 s = splbio(); 1388 1389 out: /* Arrive here at splbio */ 1390 vnx->vx_flags &= ~VX_BUSY; 1391 if (vnx->vx_pending == 0) { 1392 error = vnx->vx_error; 1393 pool_put(&vndxfer_pool, vnx); 1394 bp->b_error = error; 1395 biodone(bp); 1396 } 1397 splx(s); 1398 } 1399 1400 /* 1401 * sw_reg_start: start an I/O request on the requested swapdev 1402 * 1403 * => 
reqs are sorted by b_rawblkno (above) 1404 */ 1405 static void 1406 sw_reg_start(struct swapdev *sdp) 1407 { 1408 struct buf *bp; 1409 struct vnode *vp; 1410 UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist); 1411 1412 /* recursion control */ 1413 if ((sdp->swd_flags & SWF_BUSY) != 0) 1414 return; 1415 1416 sdp->swd_flags |= SWF_BUSY; 1417 1418 while (sdp->swd_active < sdp->swd_maxactive) { 1419 bp = bufq_get(sdp->swd_tab); 1420 if (bp == NULL) 1421 break; 1422 sdp->swd_active++; 1423 1424 UVMHIST_LOG(pdhist, 1425 "sw_reg_start: bp %p vp %p blkno %p cnt %lx", 1426 bp, bp->b_vp, bp->b_blkno, bp->b_bcount); 1427 vp = bp->b_vp; 1428 KASSERT(bp->b_objlock == &vp->v_interlock); 1429 if ((bp->b_flags & B_READ) == 0) { 1430 mutex_enter(&vp->v_interlock); 1431 vp->v_numoutput++; 1432 mutex_exit(&vp->v_interlock); 1433 } 1434 VOP_STRATEGY(vp, bp); 1435 } 1436 sdp->swd_flags &= ~SWF_BUSY; 1437 } 1438 1439 /* 1440 * sw_reg_biodone: one of our i/o's has completed 1441 */ 1442 static void 1443 sw_reg_biodone(struct buf *bp) 1444 { 1445 workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL); 1446 } 1447 1448 /* 1449 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup 1450 * 1451 * => note that we can recover the vndbuf struct by casting the buf ptr 1452 */ 1453 static void 1454 sw_reg_iodone(struct work *wk, void *dummy) 1455 { 1456 struct vndbuf *vbp = (void *)wk; 1457 struct vndxfer *vnx = vbp->vb_xfer; 1458 struct buf *pbp = vnx->vx_bp; /* parent buffer */ 1459 struct swapdev *sdp = vnx->vx_sdp; 1460 int s, resid, error; 1461 KASSERT(&vbp->vb_buf.b_work == wk); 1462 UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist); 1463 1464 UVMHIST_LOG(pdhist, " vbp=%p vp=%p blkno=%x addr=%p", 1465 vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data); 1466 UVMHIST_LOG(pdhist, " cnt=%lx resid=%lx", 1467 vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0); 1468 1469 /* 1470 * protect vbp at splbio and update. 
1471 */ 1472 1473 s = splbio(); 1474 resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid; 1475 pbp->b_resid -= resid; 1476 vnx->vx_pending--; 1477 1478 if (vbp->vb_buf.b_error != 0) { 1479 /* pass error upward */ 1480 error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO; 1481 UVMHIST_LOG(pdhist, " got error=%d !", error, 0, 0, 0); 1482 vnx->vx_error = error; 1483 } 1484 1485 /* 1486 * kill vbp structure 1487 */ 1488 buf_destroy(&vbp->vb_buf); 1489 pool_put(&vndbuf_pool, vbp); 1490 1491 /* 1492 * wrap up this transaction if it has run to completion or, in 1493 * case of an error, when all auxiliary buffers have returned. 1494 */ 1495 if (vnx->vx_error != 0) { 1496 /* pass error upward */ 1497 error = vnx->vx_error; 1498 if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) { 1499 pbp->b_error = error; 1500 biodone(pbp); 1501 pool_put(&vndxfer_pool, vnx); 1502 } 1503 } else if (pbp->b_resid == 0) { 1504 KASSERT(vnx->vx_pending == 0); 1505 if ((vnx->vx_flags & VX_BUSY) == 0) { 1506 UVMHIST_LOG(pdhist, " iodone error=%d !", 1507 pbp, vnx->vx_error, 0, 0); 1508 biodone(pbp); 1509 pool_put(&vndxfer_pool, vnx); 1510 } 1511 } 1512 1513 /* 1514 * done! start next swapdev I/O if one is pending 1515 */ 1516 sdp->swd_active--; 1517 sw_reg_start(sdp); 1518 splx(s); 1519 } 1520 1521 1522 /* 1523 * uvm_swap_alloc: allocate space on swap 1524 * 1525 * => allocation is done "round robin" down the priority list, as we 1526 * allocate in a priority we "rotate" the circle queue. 1527 * => space can be freed with uvm_swap_free 1528 * => we return the page slot number in /dev/drum (0 == invalid slot) 1529 * => we lock uvm_swap_data_lock 1530 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM 1531 */ 1532 int 1533 uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok) 1534 { 1535 struct swapdev *sdp; 1536 struct swappri *spp; 1537 UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist); 1538 1539 /* 1540 * no swap devices configured yet? definite failure. 
1541 */ 1542 if (uvmexp.nswapdev < 1) 1543 return 0; 1544 1545 /* 1546 * lock data lock, convert slots into blocks, and enter loop 1547 */ 1548 mutex_enter(&uvm_swap_data_lock); 1549 1550 ReTry: /* XXXMRG */ 1551 LIST_FOREACH(spp, &swap_priority, spi_swappri) { 1552 CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) { 1553 uint64_t result; 1554 1555 /* if it's not enabled, then we can't swap from it */ 1556 if ((sdp->swd_flags & SWF_ENABLE) == 0) 1557 continue; 1558 if (sdp->swd_npginuse + *nslots > sdp->swd_npages) 1559 continue; 1560 result = blist_alloc(sdp->swd_blist, *nslots); 1561 if (result == BLIST_NONE) { 1562 continue; 1563 } 1564 KASSERT(result < sdp->swd_drumsize); 1565 1566 /* 1567 * successful allocation! now rotate the circleq. 1568 */ 1569 CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next); 1570 CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next); 1571 sdp->swd_npginuse += *nslots; 1572 uvmexp.swpginuse += *nslots; 1573 mutex_exit(&uvm_swap_data_lock); 1574 /* done! return drum slot number */ 1575 UVMHIST_LOG(pdhist, 1576 "success! returning %d slots starting at %d", 1577 *nslots, result + sdp->swd_drumoffset, 0, 0); 1578 return (result + sdp->swd_drumoffset); 1579 } 1580 } 1581 1582 /* XXXMRG: BEGIN HACK */ 1583 if (*nslots > 1 && lessok) { 1584 *nslots = 1; 1585 /* XXXMRG: ugh! blist should support this for us */ 1586 goto ReTry; 1587 } 1588 /* XXXMRG: END HACK */ 1589 1590 mutex_exit(&uvm_swap_data_lock); 1591 return 0; 1592 } 1593 1594 /* 1595 * uvm_swapisfull: return true if most of available swap is allocated 1596 * and in use. we don't count some small portion as it may be inaccessible 1597 * to us at any given moment, for example if there is lock contention or if 1598 * pages are busy. 
1599 */ 1600 bool 1601 uvm_swapisfull(void) 1602 { 1603 int swpgonly; 1604 bool rv; 1605 1606 mutex_enter(&uvm_swap_data_lock); 1607 KASSERT(uvmexp.swpgonly <= uvmexp.swpages); 1608 swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 / 1609 uvm_swapisfull_factor); 1610 rv = (swpgonly >= uvmexp.swpgavail); 1611 mutex_exit(&uvm_swap_data_lock); 1612 1613 return (rv); 1614 } 1615 1616 /* 1617 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors 1618 * 1619 * => we lock uvm_swap_data_lock 1620 */ 1621 void 1622 uvm_swap_markbad(int startslot, int nslots) 1623 { 1624 struct swapdev *sdp; 1625 UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist); 1626 1627 mutex_enter(&uvm_swap_data_lock); 1628 sdp = swapdrum_getsdp(startslot); 1629 KASSERT(sdp != NULL); 1630 1631 /* 1632 * we just keep track of how many pages have been marked bad 1633 * in this device, to make everything add up in swap_off(). 1634 * we assume here that the range of slots will all be within 1635 * one swap device. 1636 */ 1637 1638 KASSERT(uvmexp.swpgonly >= nslots); 1639 uvmexp.swpgonly -= nslots; 1640 sdp->swd_npgbad += nslots; 1641 UVMHIST_LOG(pdhist, "now %d bad", sdp->swd_npgbad, 0,0,0); 1642 mutex_exit(&uvm_swap_data_lock); 1643 } 1644 1645 /* 1646 * uvm_swap_free: free swap slots 1647 * 1648 * => this can be all or part of an allocation made by uvm_swap_alloc 1649 * => we lock uvm_swap_data_lock 1650 */ 1651 void 1652 uvm_swap_free(int startslot, int nslots) 1653 { 1654 struct swapdev *sdp; 1655 UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist); 1656 1657 UVMHIST_LOG(pdhist, "freeing %d slots starting at %d", nslots, 1658 startslot, 0, 0); 1659 1660 /* 1661 * ignore attempts to free the "bad" slot. 1662 */ 1663 1664 if (startslot == SWSLOT_BAD) { 1665 return; 1666 } 1667 1668 /* 1669 * convert drum slot offset back to sdp, free the blocks 1670 * in the extent, and return. must hold pri lock to do 1671 * lookup and access the extent. 
1672 */ 1673 1674 mutex_enter(&uvm_swap_data_lock); 1675 sdp = swapdrum_getsdp(startslot); 1676 KASSERT(uvmexp.nswapdev >= 1); 1677 KASSERT(sdp != NULL); 1678 KASSERT(sdp->swd_npginuse >= nslots); 1679 blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots); 1680 sdp->swd_npginuse -= nslots; 1681 uvmexp.swpginuse -= nslots; 1682 mutex_exit(&uvm_swap_data_lock); 1683 } 1684 1685 /* 1686 * uvm_swap_put: put any number of pages into a contig place on swap 1687 * 1688 * => can be sync or async 1689 */ 1690 1691 int 1692 uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags) 1693 { 1694 int error; 1695 1696 error = uvm_swap_io(ppsp, swslot, npages, B_WRITE | 1697 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC)); 1698 return error; 1699 } 1700 1701 /* 1702 * uvm_swap_get: get a single page from swap 1703 * 1704 * => usually a sync op (from fault) 1705 */ 1706 1707 int 1708 uvm_swap_get(struct vm_page *page, int swslot, int flags) 1709 { 1710 int error; 1711 1712 uvmexp.nswget++; 1713 KASSERT(flags & PGO_SYNCIO); 1714 if (swslot == SWSLOT_BAD) { 1715 return EIO; 1716 } 1717 1718 error = uvm_swap_io(&page, swslot, 1, B_READ | 1719 ((flags & PGO_SYNCIO) ? 0 : B_ASYNC)); 1720 if (error == 0) { 1721 1722 /* 1723 * this page is no longer only in swap. 
1724 */ 1725 1726 mutex_enter(&uvm_swap_data_lock); 1727 KASSERT(uvmexp.swpgonly > 0); 1728 uvmexp.swpgonly--; 1729 mutex_exit(&uvm_swap_data_lock); 1730 } 1731 return error; 1732 } 1733 1734 /* 1735 * uvm_swap_io: do an i/o operation to swap 1736 */ 1737 1738 static int 1739 uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags) 1740 { 1741 daddr_t startblk; 1742 struct buf *bp; 1743 vaddr_t kva; 1744 int error, mapinflags; 1745 bool write, async; 1746 UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist); 1747 1748 UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d", 1749 startslot, npages, flags, 0); 1750 1751 write = (flags & B_READ) == 0; 1752 async = (flags & B_ASYNC) != 0; 1753 1754 /* 1755 * allocate a buf for the i/o. 1756 */ 1757 1758 KASSERT(curlwp != uvm.pagedaemon_lwp || (write && async)); 1759 bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp); 1760 if (bp == NULL) { 1761 uvm_aio_aiodone_pages(pps, npages, true, ENOMEM); 1762 return ENOMEM; 1763 } 1764 1765 /* 1766 * convert starting drum slot to block number 1767 */ 1768 1769 startblk = btodb((uint64_t)startslot << PAGE_SHIFT); 1770 1771 /* 1772 * first, map the pages into the kernel. 1773 */ 1774 1775 mapinflags = !write ? 1776 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ : 1777 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE; 1778 kva = uvm_pagermapin(pps, npages, mapinflags); 1779 1780 /* 1781 * fill in the bp/sbp. we currently route our i/o through 1782 * /dev/drum's vnode [swapdev_vp]. 1783 */ 1784 1785 bp->b_cflags = BC_BUSY | BC_NOCACHE; 1786 bp->b_flags = (flags & (B_READ|B_ASYNC)); 1787 bp->b_proc = &proc0; /* XXX */ 1788 bp->b_vnbufs.le_next = NOLIST; 1789 bp->b_data = (void *)kva; 1790 bp->b_blkno = startblk; 1791 bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT; 1792 1793 /* 1794 * bump v_numoutput (counter of number of active outputs). 
1795 */ 1796 1797 if (write) { 1798 mutex_enter(&swapdev_vp->v_interlock); 1799 swapdev_vp->v_numoutput++; 1800 mutex_exit(&swapdev_vp->v_interlock); 1801 } 1802 1803 /* 1804 * for async ops we must set up the iodone handler. 1805 */ 1806 1807 if (async) { 1808 bp->b_iodone = uvm_aio_biodone; 1809 UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0); 1810 if (curlwp == uvm.pagedaemon_lwp) 1811 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); 1812 else 1813 BIO_SETPRIO(bp, BPRIO_TIMELIMITED); 1814 } else { 1815 bp->b_iodone = NULL; 1816 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); 1817 } 1818 UVMHIST_LOG(pdhist, 1819 "about to start io: data = %p blkno = 0x%x, bcount = %ld", 1820 bp->b_data, bp->b_blkno, bp->b_bcount, 0); 1821 1822 /* 1823 * now we start the I/O, and if async, return. 1824 */ 1825 1826 VOP_STRATEGY(swapdev_vp, bp); 1827 if (async) 1828 return 0; 1829 1830 /* 1831 * must be sync i/o. wait for it to finish 1832 */ 1833 1834 error = biowait(bp); 1835 1836 /* 1837 * kill the pager mapping 1838 */ 1839 1840 uvm_pagermapout(kva, npages); 1841 1842 /* 1843 * now dispose of the buf and we're done. 1844 */ 1845 1846 if (write) { 1847 mutex_enter(&swapdev_vp->v_interlock); 1848 vwakeup(bp); 1849 mutex_exit(&swapdev_vp->v_interlock); 1850 } 1851 putiobuf(bp); 1852 UVMHIST_LOG(pdhist, "<- done (sync) error=%d", error, 0, 0, 0); 1853 1854 return (error); 1855 } 1856