/*
 * Copyright (c) 2003-2014 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * ---
 *
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 *
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * ---
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * Page fault handling module.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <sys/vkernel.h>
#include <sys/lock.h>
#include <sys/sysctl.h>

#include <cpu/lwbuf.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_kern.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <vm/vm_extern.h>

#include <vm/vm_page2.h>

struct faultstate {
	vm_page_t m;
	vm_object_t object;
	vm_pindex_t pindex;
	vm_prot_t prot;
	vm_page_t first_m;
	vm_object_t first_object;
	vm_prot_t first_prot;
	vm_map_t map;
	vm_map_entry_t entry;
	int lookup_still_valid;
	int hardfault;
	int fault_flags;
	int map_generation;
	int shared;
	int msoftonly;
	int first_shared;
	int wflags;
	struct vnode *vp;
};
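
/*
 * A rough summary of the faultstate fields, as they are used below
 * (descriptive only): m/object/pindex track the page currently being
 * resolved as the fault walks the shadow chain, while first_m,
 * first_object and first_prot refer to the top-level object returned by
 * vm_map_lookup().  The shared and first_shared flags indicate whether
 * the respective object locks are held shared rather than exclusive,
 * msoftonly means fs.m is only soft-busied (the vm_fault_quick() path),
 * wflags carries the FW_* bits returned by vm_map_lookup(), and vp holds
 * the vnode locked via vnode_pager_lock(), if any.
 */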

__read_mostly static int debug_fault = 0;
SYSCTL_INT(_vm, OID_AUTO, debug_fault, CTLFLAG_RW, &debug_fault, 0, "");
__read_mostly static int debug_cluster = 0;
SYSCTL_INT(_vm, OID_AUTO, debug_cluster, CTLFLAG_RW, &debug_cluster, 0, "");
__read_mostly static int virtual_copy_enable = 1;
SYSCTL_INT(_vm, OID_AUTO, virtual_copy_enable, CTLFLAG_RW,
	   &virtual_copy_enable, 0, "");
__read_mostly int vm_shared_fault = 1;
TUNABLE_INT("vm.shared_fault", &vm_shared_fault);
SYSCTL_INT(_vm, OID_AUTO, shared_fault, CTLFLAG_RW,
	   &vm_shared_fault, 0, "Allow shared token on vm_object");
__read_mostly static int vm_fault_quick_enable = 0;
TUNABLE_INT("vm.fault_quick", &vm_fault_quick_enable);
SYSCTL_INT(_vm, OID_AUTO, fault_quick, CTLFLAG_RW,
	   &vm_fault_quick_enable, 0, "Allow fast vm_fault shortcut");
#ifdef VM_FAULT_QUICK_DEBUG
static long vm_fault_quick_success_count = 0;
SYSCTL_LONG(_vm, OID_AUTO, fault_quick_success_count, CTLFLAG_RW,
	    &vm_fault_quick_success_count, 0, "");
static long vm_fault_quick_failure_count1 = 0;
SYSCTL_LONG(_vm, OID_AUTO, fault_quick_failure_count1, CTLFLAG_RW,
	    &vm_fault_quick_failure_count1, 0, "");
static long vm_fault_quick_failure_count2 = 0;
SYSCTL_LONG(_vm, OID_AUTO, fault_quick_failure_count2, CTLFLAG_RW,
	    &vm_fault_quick_failure_count2, 0, "");
static long vm_fault_quick_failure_count3 = 0;
SYSCTL_LONG(_vm, OID_AUTO, fault_quick_failure_count3, CTLFLAG_RW,
	    &vm_fault_quick_failure_count3, 0, "");
static long vm_fault_quick_failure_count4 = 0;
SYSCTL_LONG(_vm, OID_AUTO, fault_quick_failure_count4, CTLFLAG_RW,
	    &vm_fault_quick_failure_count4, 0, "");
#endif

static int vm_fault_quick(struct faultstate *fs, vm_pindex_t first_pindex,
			vm_prot_t fault_type);
static int vm_fault_object(struct faultstate *, vm_pindex_t, vm_prot_t, int);
static int vm_fault_vpagetable(struct faultstate *, vm_pindex_t *,
			vpte_t, int, int);
#if 0
static int vm_fault_additional_pages (vm_page_t, int, int, vm_page_t *, int *);
#endif
static void vm_set_nosync(vm_page_t m, vm_map_entry_t entry);
static void vm_prefault(pmap_t pmap, vm_offset_t addra,
			vm_map_entry_t entry, int prot, int fault_flags);
static void vm_prefault_quick(pmap_t pmap, vm_offset_t addra,
			vm_map_entry_t entry, int prot, int fault_flags);

static __inline void
release_page(struct faultstate *fs)
{
	vm_page_deactivate(fs->m);
	vm_page_wakeup(fs->m);
	fs->m = NULL;
}

/*
 * NOTE: Once unlocked any cached fs->entry becomes invalid, any reuse
 *	 requires relocking and then checking the timestamp.
 *
 * NOTE: vm_map_lock_read() does not bump fs->map->timestamp so we do
 *	 not have to update fs->map_generation here.
 *
 * NOTE: This function can fail due to a deadlock against the caller's
 *	 holding of a vm_page BUSY.
 */
static __inline int
relock_map(struct faultstate *fs)
{
	int error;

	if (fs->lookup_still_valid == FALSE && fs->map) {
		error = vm_map_lock_read_to(fs->map);
		if (error == 0)
			fs->lookup_still_valid = TRUE;
	} else {
		error = 0;
	}
	return error;
}

static __inline void
unlock_map(struct faultstate *fs)
{
	if (fs->lookup_still_valid && fs->map) {
		vm_map_lookup_done(fs->map, fs->entry, 0);
		fs->lookup_still_valid = FALSE;
	}
}

/*
 * Clean up after a successful call to vm_fault_object() so another call
 * to vm_fault_object() can be made.
 */
static void
_cleanup_successful_fault(struct faultstate *fs, int relock)
{
	/*
	 * We allocated a junk page for a COW operation that did
	 * not occur, the page must be freed.
	 */
	if (fs->object != fs->first_object) {
		KKASSERT(fs->first_shared == 0);
		vm_page_free(fs->first_m);
		vm_object_pip_wakeup(fs->object);
		fs->first_m = NULL;
	}

	/*
	 * Reset fs->object.
	 */
	fs->object = fs->first_object;
	if (relock && fs->lookup_still_valid == FALSE) {
		if (fs->map)
			vm_map_lock_read(fs->map);
		fs->lookup_still_valid = TRUE;
	}
}

static void
_unlock_things(struct faultstate *fs, int dealloc)
{
	_cleanup_successful_fault(fs, 0);
	if (dealloc) {
		/*vm_object_deallocate(fs->first_object);*/
		/*fs->first_object = NULL; drop used later on */
	}
	unlock_map(fs);
	if (fs->vp != NULL) {
		vput(fs->vp);
		fs->vp = NULL;
	}
}

#define unlock_things(fs) _unlock_things(fs, 0)
#define unlock_and_deallocate(fs) _unlock_things(fs, 1)
#define cleanup_successful_fault(fs) _cleanup_successful_fault(fs, 1)

/*
 * Virtual copy tests.  Used by the fault code to determine if a
 * page can be moved from an orphan vm_object into its shadow
 * instead of copying its contents.
 */
static __inline int
virtual_copy_test(struct faultstate *fs)
{
	/*
	 * Must be holding exclusive locks
	 */
	if (fs->first_shared || fs->shared || virtual_copy_enable == 0)
		return 0;

	/*
	 * Map, if present, has not changed
	 */
	if (fs->map && fs->map_generation != fs->map->timestamp)
		return 0;

	/*
	 * Only one shadow object
	 */
	if (fs->object->shadow_count != 1)
		return 0;

	/*
	 * No COW refs, except us
	 */
	if (fs->object->ref_count != 1)
		return 0;

	/*
	 * No one else can look this object up
	 */
	if (fs->object->handle != NULL)
		return 0;

	/*
	 * No other ways to look the object up
	 */
	if (fs->object->type != OBJT_DEFAULT &&
	    fs->object->type != OBJT_SWAP)
		return 0;

	/*
	 * We don't chase down the shadow chain
	 */
	if (fs->object != fs->first_object->backing_object)
		return 0;

	return 1;
}

static __inline int
virtual_copy_ok(struct faultstate *fs)
{
	if (virtual_copy_test(fs)) {
		/*
		 * Grab the lock and re-test changeable items.
		 */
		if (fs->lookup_still_valid == FALSE && fs->map) {
			if (lockmgr(&fs->map->lock, LK_EXCLUSIVE|LK_NOWAIT))
				return 0;
			fs->lookup_still_valid = TRUE;
			if (virtual_copy_test(fs)) {
				fs->map_generation = ++fs->map->timestamp;
				return 1;
			}
			fs->lookup_still_valid = FALSE;
			lockmgr(&fs->map->lock, LK_RELEASE);
		}
	}
	return 0;
}

/*
 * TRYPAGER
 *
 * Determine if the pager for the current object *might* contain the page.
 *
 * We only need to try the pager if this is not a default object (default
 * objects are zero-fill and have no real pager), and if we are not taking
 * a wiring fault or if the FS entry is wired.
 */
#define TRYPAGER(fs)	\
		(fs->object->type != OBJT_DEFAULT &&			\
		 (((fs->fault_flags & VM_FAULT_WIRE_MASK) == 0) ||	\
		  (fs->wflags & FW_WIRED)))

/*
 * vm_fault:
 *
 * Handle a page fault occurring at the given address, requiring the given
 * permissions, in the map specified.  If successful, the page is inserted
 * into the associated physical map.
 *
 * NOTE: The given address should be truncated to the proper page address.
 *
 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
 * a standard error specifying why the fault is fatal is returned.
 *
 * The map in question must be referenced, and remains so.
 * The caller may hold no locks.
 * No other requirements.
 */
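/*
 * Illustrative sketch only: a typical caller, such as a trap handler,
 * might invoke this roughly as follows, assuming the faulting address
 * and required protection have already been extracted from the trap
 * frame:
 *
 *	rv = vm_fault(&lp->lwp_vmspace->vm_map, trunc_page(va),
 *		      VM_PROT_READ, VM_FAULT_NORMAL);
 *	if (rv != KERN_SUCCESS)
 *		(deliver SIGSEGV/SIGBUS to the process)
 *
 * Note that the address is page-truncated per the NOTE above.
 */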
int
vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags)
{
	int result;
	vm_pindex_t first_pindex;
	struct faultstate fs;
	struct lwp *lp;
	struct proc *p;
	thread_t td;
	struct vm_map_ilock ilock;
	int didilock;
	int didhold;
	int growstack;
	int retry = 0;
	int inherit_prot;

	inherit_prot = fault_type & VM_PROT_NOSYNC;
	fs.hardfault = 0;
	fs.fault_flags = fault_flags;
	fs.vp = NULL;
	fs.shared = vm_shared_fault;
	fs.first_shared = vm_shared_fault;
	growstack = 1;

	/*
	 * vm_map interactions
	 */
	td = curthread;
	if ((lp = td->td_lwp) != NULL)
		lp->lwp_flags |= LWP_PAGING;

RetryFault:
	/*
	 * vm_fault_quick() can shortcut us.
	 */
	fs.msoftonly = 0;
	didhold = 0;

	/*
	 * Find the vm_map_entry representing the backing store and resolve
	 * the top level object and page index.  This may have the side
	 * effect of executing a copy-on-write on the map entry,
	 * creating a shadow object, or splitting an anonymous entry for
	 * performance, but will not COW any actual VM pages.
	 *
	 * On success fs.map is left read-locked and various other fields
	 * are initialized but not otherwise referenced or locked.
	 *
	 * NOTE!  vm_map_lookup will try to upgrade the fault_type to
	 *	  VM_FAULT_WRITE if the map entry is a virtual page table
	 *	  and also writable, so we can set the 'A' (accessed) bit
	 *	  in the virtual page table entry.
	 */
	fs.map = map;
	result = vm_map_lookup(&fs.map, vaddr, fault_type,
			       &fs.entry, &fs.first_object,
			       &first_pindex, &fs.first_prot, &fs.wflags);

	/*
	 * If the lookup failed or the map protections are incompatible,
	 * the fault generally fails.
	 *
	 * The failure could be due to TDF_NOFAULT if vm_map_lookup()
	 * tried to do a COW fault.
	 *
	 * If the caller is trying to do a user wiring we have more work
	 * to do.
	 */
	if (result != KERN_SUCCESS) {
		if (result == KERN_FAILURE_NOFAULT) {
			result = KERN_FAILURE;
			goto done;
		}
		if (result != KERN_PROTECTION_FAILURE ||
		    (fs.fault_flags & VM_FAULT_WIRE_MASK) != VM_FAULT_USER_WIRE)
		{
			if (result == KERN_INVALID_ADDRESS && growstack &&
			    map != &kernel_map && curproc != NULL) {
				result = vm_map_growstack(map, vaddr);
				if (result == KERN_SUCCESS) {
					growstack = 0;
					++retry;
					goto RetryFault;
				}
				result = KERN_FAILURE;
			}
			goto done;
		}

		/*
		 * If we are user-wiring a r/w segment, and it is COW, then
		 * we need to do the COW operation.  Note that we don't
		 * currently COW RO sections now, because it is NOT desirable
		 * to COW .text.  We simply keep .text from ever being COW'ed
		 * and take the heat that one cannot debug wired .text sections.
		 *
		 * XXX Try to allow the above by specifying OVERRIDE_WRITE.
		 */
		result = vm_map_lookup(&fs.map, vaddr,
				       VM_PROT_READ|VM_PROT_WRITE|
				        VM_PROT_OVERRIDE_WRITE,
				       &fs.entry, &fs.first_object,
				       &first_pindex, &fs.first_prot,
				       &fs.wflags);
		if (result != KERN_SUCCESS) {
			/* could also be KERN_FAILURE_NOFAULT */
			result = KERN_FAILURE;
			goto done;
		}

		/*
		 * If we don't COW now, on a user wire, the user will never
		 * be able to write to the mapping.  If we don't make this
		 * restriction, the bookkeeping would be nearly impossible.
		 *
		 * XXX We have a shared lock, this will have a MP race but
		 * I don't see how it can hurt anything.
		 */
		if ((fs.entry->protection & VM_PROT_WRITE) == 0) {
			atomic_clear_char(&fs.entry->max_protection,
					  VM_PROT_WRITE);
		}
	}

	/*
	 * fs.map is read-locked
	 *
	 * Misc checks.  Save the map generation number to detect races.
	 */
	fs.map_generation = fs.map->timestamp;
	fs.lookup_still_valid = TRUE;
	fs.first_m = NULL;
	fs.object = fs.first_object;	/* so unlock_and_deallocate works */
	fs.prot = fs.first_prot;	/* default (used by uksmap) */

	if (fs.entry->eflags & (MAP_ENTRY_NOFAULT | MAP_ENTRY_KSTACK)) {
		if (fs.entry->eflags & MAP_ENTRY_NOFAULT) {
			panic("vm_fault: fault on nofault entry, addr: %p",
			      (void *)vaddr);
		}
		if ((fs.entry->eflags & MAP_ENTRY_KSTACK) &&
		    vaddr >= fs.entry->start &&
		    vaddr < fs.entry->start + PAGE_SIZE) {
			panic("vm_fault: fault on stack guard, addr: %p",
			      (void *)vaddr);
		}
	}

	/*
	 * A user-kernel shared map has no VM object and bypasses
	 * everything.  We execute the uksmap function with a temporary
	 * fictitious vm_page.  The address is directly mapped with no
	 * management.
	 */
	if (fs.entry->maptype == VM_MAPTYPE_UKSMAP) {
		struct vm_page fakem;

		bzero(&fakem, sizeof(fakem));
		fakem.pindex = first_pindex;
		fakem.flags = PG_FICTITIOUS | PG_UNMANAGED;
		fakem.busy_count = PBUSY_LOCKED;
		fakem.valid = VM_PAGE_BITS_ALL;
		fakem.pat_mode = VM_MEMATTR_DEFAULT;
		if (fs.entry->object.uksmap(fs.entry->aux.dev, &fakem)) {
			result = KERN_FAILURE;
			unlock_things(&fs);
			goto done2;
		}
		pmap_enter(fs.map->pmap, vaddr, &fakem, fs.prot | inherit_prot,
			   (fs.wflags & FW_WIRED), fs.entry);
		goto done_success;
	}

	/*
	 * A system map entry may return a NULL object.  No object means
	 * no pager means an unrecoverable kernel fault.
	 */
	if (fs.first_object == NULL) {
		panic("vm_fault: unrecoverable fault at %p in entry %p",
		      (void *)vaddr, fs.entry);
	}

	/*
	 * Fail here if not a trivial anonymous page fault and TDF_NOFAULT
	 * is set.
	 *
	 * Unfortunately a deadlock can occur if we are forced to page-in
	 * from swap, but diving all the way into the vm_pager_get_page()
	 * function to find out is too much.  Just check the object type.
	 *
	 * The deadlock is a CAM deadlock on a busy VM page when trying
	 * to finish an I/O if another process gets stuck in
	 * vop_helper_read_shortcut() due to a swap fault.
	 */
	if ((td->td_flags & TDF_NOFAULT) &&
	    (retry ||
	     fs.first_object->type == OBJT_VNODE ||
	     fs.first_object->type == OBJT_SWAP ||
	     fs.first_object->backing_object)) {
		result = KERN_FAILURE;
		unlock_things(&fs);
		goto done2;
	}

	/*
	 * If the entry is wired we cannot change the page protection.
	 */
	if (fs.wflags & FW_WIRED)
		fault_type = fs.first_prot;

	/*
	 * We generally want to avoid unnecessary exclusive modes on backing
	 * and terminal objects because this can seriously interfere with
	 * heavily fork()'d processes (particularly /bin/sh scripts).
	 *
	 * However, we also want to avoid unnecessary retries due to needed
	 * shared->exclusive promotion for common faults.  Exclusive mode is
	 * always needed if any page insertion, rename, or free occurs in an
	 * object (and also indirectly if any I/O is done).
	 *
	 * The main issue here is going to be fs.first_shared.  If the
	 * first_object has a backing object which isn't shadowed and the
	 * process is single-threaded we might as well use an exclusive
	 * lock/chain right off the bat.
	 */
	if (fs.first_shared && fs.first_object->backing_object &&
	    LIST_EMPTY(&fs.first_object->shadow_head) &&
	    td->td_proc && td->td_proc->p_nthreads == 1) {
		fs.first_shared = 0;
	}

	/*
	 * VM_FAULT_UNSWAP - swap_pager_unswapped() needs an exclusive object
	 * VM_FAULT_DIRTY  - may require swap_pager_unswapped() later, but
	 *		     we can try shared first.
	 */
	if (fault_flags & VM_FAULT_UNSWAP) {
		fs.first_shared = 0;
	}

	/*
	 * Try to shortcut the entire mess and run the fault lockless.
	 */
	if (vm_fault_quick_enable &&
	    vm_fault_quick(&fs, first_pindex, fault_type) == KERN_SUCCESS) {
		didilock = 0;
		fault_flags &= ~VM_FAULT_BURST;
		goto success;
	}

	/*
	 * Obtain a top-level object lock, shared or exclusive depending
	 * on fs.first_shared.  If a shared lock winds up being insufficient
	 * we will retry with an exclusive lock.
	 *
	 * The vnode pager lock is always shared.
	 */
	if (fs.first_shared)
		vm_object_hold_shared(fs.first_object);
	else
		vm_object_hold(fs.first_object);
	if (fs.vp == NULL)
		fs.vp = vnode_pager_lock(fs.first_object);
	didhold = 1;

	/*
	 * The page we want is at (first_object, first_pindex), but if the
	 * vm_map_entry is VM_MAPTYPE_VPAGETABLE we have to traverse the
	 * page table to figure out the actual pindex.
	 *
	 * NOTE!  DEVELOPMENT IN PROGRESS, THIS IS AN INITIAL IMPLEMENTATION
	 * ONLY
	 */
	didilock = 0;
	if (fs.entry->maptype == VM_MAPTYPE_VPAGETABLE) {
		vm_map_interlock(fs.map, &ilock, vaddr, vaddr + PAGE_SIZE);
		didilock = 1;
		result = vm_fault_vpagetable(&fs, &first_pindex,
					     fs.entry->aux.master_pde,
					     fault_type, 1);
		if (result == KERN_TRY_AGAIN) {
			vm_map_deinterlock(fs.map, &ilock);
			vm_object_drop(fs.first_object);
			++retry;
			goto RetryFault;
		}
		if (result != KERN_SUCCESS) {
			vm_map_deinterlock(fs.map, &ilock);
			goto done;
		}
	}

	/*
	 * Now we have the actual (object, pindex), fault in the page.  If
	 * vm_fault_object() fails it will unlock and deallocate the FS
	 * data.  If it succeeds everything remains locked and fs->object
	 * will have an additional PIP count if it is not equal to
	 * fs->first_object
	 *
	 * vm_fault_object will set fs->prot for the pmap operation.  It is
	 * allowed to set VM_PROT_WRITE even if fault_type == VM_PROT_READ,
	 * provided the page can be safely written.  However, it will force
	 * a read-only mapping for a read fault if the memory is managed by
	 * a virtual page table.
	 *
	 * If the fault code uses the shared object lock shortcut
	 * we must not try to burst (we can't allocate VM pages).
	 */
	result = vm_fault_object(&fs, first_pindex, fault_type, 1);

	if (debug_fault > 0) {
		--debug_fault;
		kprintf("VM_FAULT result %d addr=%jx type=%02x flags=%02x "
			"fs.m=%p fs.prot=%02x fs.wflags=%02x fs.entry=%p\n",
			result, (intmax_t)vaddr, fault_type, fault_flags,
			fs.m, fs.prot, fs.wflags, fs.entry);
	}

	if (result == KERN_TRY_AGAIN) {
		if (didilock)
			vm_map_deinterlock(fs.map, &ilock);
		vm_object_drop(fs.first_object);
		++retry;
		goto RetryFault;
	}
	if (result != KERN_SUCCESS) {
		if (didilock)
			vm_map_deinterlock(fs.map, &ilock);
		goto done;
	}

success:
	/*
	 * On success vm_fault_object() does not unlock or deallocate, and fs.m
	 * will contain a busied page.
	 *
	 * Enter the page into the pmap and do pmap-related adjustments.
	 *
	 * WARNING! Soft-busied fs.m's can only be manipulated in limited
	 *	    ways.
	 */
	KKASSERT(fs.lookup_still_valid == TRUE);
	vm_page_flag_set(fs.m, PG_REFERENCED);
	pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot | inherit_prot,
		   fs.wflags & FW_WIRED, fs.entry);

	if (didilock)
		vm_map_deinterlock(fs.map, &ilock);

	/*
	 * If the page is not wired down, then put it where the pageout daemon
	 * can find it.
	 *
	 * NOTE: We cannot safely wire, unwire, or adjust queues for a
	 *	 soft-busied page.
	 */
	if (fs.msoftonly) {
		KKASSERT(fs.m->busy_count & PBUSY_MASK);
		KKASSERT((fs.fault_flags & VM_FAULT_WIRE_MASK) == 0);
		vm_page_sbusy_drop(fs.m);
	} else {
		if (fs.fault_flags & VM_FAULT_WIRE_MASK) {
			if (fs.wflags & FW_WIRED)
				vm_page_wire(fs.m);
			else
				vm_page_unwire(fs.m, 1);
		} else {
			vm_page_activate(fs.m);
		}
		KKASSERT(fs.m->busy_count & PBUSY_LOCKED);
		vm_page_wakeup(fs.m);
	}

	/*
	 * Burst in a few more pages if possible.  The fs.map should still
	 * be locked.  To avoid interlocking against a vnode->getblk
	 * operation we had to be sure to unbusy our primary vm_page above
	 * first.
	 *
	 * A normal burst can continue down backing store but only executes
	 * if we are holding an exclusive lock, otherwise the exclusive
	 * locks the burst code gets might cause excessive SMP collisions.
	 *
	 * A quick burst can be utilized when there is no backing object
	 * (i.e. a shared file mmap).
	 */
	if ((fault_flags & VM_FAULT_BURST) &&
	    (fs.fault_flags & VM_FAULT_WIRE_MASK) == 0 &&
	    (fs.wflags & FW_WIRED) == 0) {
		if (fs.first_shared == 0 && fs.shared == 0) {
			vm_prefault(fs.map->pmap, vaddr,
				    fs.entry, fs.prot, fault_flags);
		} else {
			vm_prefault_quick(fs.map->pmap, vaddr,
					  fs.entry, fs.prot, fault_flags);
		}
	}

done_success:
	mycpu->gd_cnt.v_vm_faults++;
	if (td->td_lwp)
		++td->td_lwp->lwp_ru.ru_minflt;

	/*
	 * Unlock everything, and return
	 */
	unlock_things(&fs);

	if (td->td_lwp) {
		if (fs.hardfault) {
			td->td_lwp->lwp_ru.ru_majflt++;
		} else {
			td->td_lwp->lwp_ru.ru_minflt++;
		}
	}

	/*vm_object_deallocate(fs.first_object);*/
	/*fs.m = NULL; */
	/*fs.first_object = NULL; must still drop later */

	result = KERN_SUCCESS;
done:
	if (fs.first_object && didhold)
		vm_object_drop(fs.first_object);
done2:
	if (lp)
		lp->lwp_flags &= ~LWP_PAGING;

#if !defined(NO_SWAPPING)
	/*
	 * Check the process RSS limit and force deactivation and
	 * (asynchronous) paging if necessary.  This is a complex operation,
	 * only do it for direct user-mode faults, for now.
	 *
	 * To reduce overhead implement approximately a ~16MB hysteresis.
	 */
	p = td->td_proc;
	if ((fault_flags & VM_FAULT_USERMODE) && lp &&
	    p->p_limit && map->pmap && vm_pageout_memuse_mode >= 1 &&
	    map != &kernel_map) {
		vm_pindex_t limit;
		vm_pindex_t size;

		limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
					p->p_rlimit[RLIMIT_RSS].rlim_max));
		size = pmap_resident_tlnw_count(map->pmap);
		if (limit >= 0 && size > 4096 && size - 4096 >= limit) {
			vm_pageout_map_deactivate_pages(map, limit);
		}
	}
#endif

	return (result);
}

/*
 * Attempt a lockless vm_fault() shortcut.  The stars have to align for this
 * to work.  But if it does we can get our page only soft-busied and not
 * have to touch the vm_object or vnode locks at all.
 */
static
int
vm_fault_quick(struct faultstate *fs, vm_pindex_t first_pindex,
	       vm_prot_t fault_type)
{
	vm_page_t m;
	vm_object_t obj;	/* NOT LOCKED */

	/*
	 * Don't waste time if the object is only being used by one vm_map.
	 */
	obj = fs->first_object;
	if (obj->flags & OBJ_ONEMAPPING)
		return KERN_FAILURE;

	/*
	 * This will try to wire/unwire a page, which can't be done with
	 * a soft-busied page.
	 */
	if (fs->fault_flags & VM_FAULT_WIRE_MASK)
		return KERN_FAILURE;

	/*
	 * Ick, can't handle this
	 */
	if (fs->entry->maptype == VM_MAPTYPE_VPAGETABLE) {
#ifdef VM_FAULT_QUICK_DEBUG
		++vm_fault_quick_failure_count1;
#endif
		return KERN_FAILURE;
	}

	/*
	 * Ok, try to get the vm_page quickly via the hash table.  The
	 * page will be soft-busied on success (NOT hard-busied).
	 */
	m = vm_page_hash_get(obj, first_pindex);
	if (m == NULL) {
#ifdef VM_FAULT_QUICK_DEBUG
		++vm_fault_quick_failure_count2;
#endif
		return KERN_FAILURE;
	}
	if ((obj->flags & OBJ_DEAD) ||
	    m->valid != VM_PAGE_BITS_ALL ||
	    m->queue - m->pc == PQ_CACHE ||
	    (m->flags & PG_SWAPPED)) {
		vm_page_sbusy_drop(m);
#ifdef VM_FAULT_QUICK_DEBUG
		++vm_fault_quick_failure_count3;
#endif
		return KERN_FAILURE;
	}

	/*
	 * The page is already fully valid, ACTIVE, and is not PG_SWAPPED.
	 *
	 * Don't map the page writable when emulating the dirty bit, a
	 * fault must be taken for proper emulation (vkernel).
	 */
	if (curthread->td_lwp && curthread->td_lwp->lwp_vmspace &&
	    pmap_emulate_ad_bits(&curthread->td_lwp->lwp_vmspace->vm_pmap)) {
		if ((fault_type & VM_PROT_WRITE) == 0)
			fs->prot &= ~VM_PROT_WRITE;
	}

	/*
	 * If this is a write fault the object and the page must already
	 * be writable.  Since we don't hold an object lock and only a
	 * soft-busy on the page, we cannot manipulate the object or
	 * the page state (other than the page queue).
	 */
	if (fs->prot & VM_PROT_WRITE) {
		if ((obj->flags & (OBJ_WRITEABLE | OBJ_MIGHTBEDIRTY)) !=
		    (OBJ_WRITEABLE | OBJ_MIGHTBEDIRTY) ||
		    m->dirty != VM_PAGE_BITS_ALL) {
			vm_page_sbusy_drop(m);
#ifdef VM_FAULT_QUICK_DEBUG
			++vm_fault_quick_failure_count4;
#endif
			return KERN_FAILURE;
		}
		vm_set_nosync(m, fs->entry);
	}

	/*
	 * Even though we are only soft-busied we can still move pages
	 * around in the normal queue(s).  The soft-busy prevents the
	 * page from being removed from the object, etc (normal operation).
	 */
	vm_page_activate(m);
	fs->m = m;
	fs->msoftonly = 1;
#ifdef VM_FAULT_QUICK_DEBUG
	++vm_fault_quick_success_count;
#endif

	return KERN_SUCCESS;
}

/*
 * Fault in the specified virtual address in the current process map,
 * returning a held VM page or NULL.  See vm_fault_page() for more
 * information.
 *
 * No requirements.
 */
vm_page_t
vm_fault_page_quick(vm_offset_t va, vm_prot_t fault_type,
		    int *errorp, int *busyp)
{
	struct lwp *lp = curthread->td_lwp;
	vm_page_t m;

	m = vm_fault_page(&lp->lwp_vmspace->vm_map, va,
			  fault_type, VM_FAULT_NORMAL,
			  errorp, busyp);
	return(m);
}

/*
 * Fault in the specified virtual address in the specified map, doing all
 * necessary manipulation of the object store and all necessary I/O.  Return
 * a held VM page or NULL, and set *errorp.  The related pmap is not
 * updated.
 *
 * If busyp is not NULL then *busyp will be set to TRUE if this routine
 * decides to return a busied page (aka VM_PROT_WRITE), or FALSE if it
 * does not (VM_PROT_WRITE not specified or busyp is NULL).  If busyp is
 * NULL the returned page is only held.
 *
 * If the caller has no intention of writing to the page's contents, busyp
 * can be passed as NULL along with VM_PROT_WRITE to force a COW operation
 * without busying the page.
 *
 * The returned page will also be marked PG_REFERENCED.
 *
 * If the page cannot be faulted writable and VM_PROT_WRITE was specified, an
 * error will be returned.
 *
 * No requirements.
 */
vm_page_t
vm_fault_page(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
	      int fault_flags, int *errorp, int *busyp)
{
	vm_pindex_t first_pindex;
	struct faultstate fs;
	int result;
	int retry;
	int growstack;
	int didcow;
	vm_prot_t orig_fault_type = fault_type;

	retry = 0;
	didcow = 0;
	fs.hardfault = 0;
	fs.fault_flags = fault_flags;
	KKASSERT((fault_flags & VM_FAULT_WIRE_MASK) == 0);

	/*
	 * Dive the pmap (concurrency possible).  If we find the
	 * appropriate page we can terminate early and quickly.
	 *
	 * This works great for normal programs but will always return
	 * NULL for host lookups of vkernel maps in VMM mode.
	 *
	 * NOTE: pmap_fault_page_quick() might not busy the page.  If
	 *	 VM_PROT_WRITE is set in fault_type and pmap_fault_page_quick()
	 *	 returns non-NULL, it will safely dirty the returned vm_page_t
	 *	 for us.  We cannot safely dirty it here (it might not be
	 *	 busy).
	 */
	fs.m = pmap_fault_page_quick(map->pmap, vaddr, fault_type, busyp);
	if (fs.m) {
		*errorp = 0;
		return(fs.m);
	}

	/*
	 * Otherwise take a concurrency hit and do a formal page
	 * fault.
	 */
	fs.vp = NULL;
	fs.shared = vm_shared_fault;
	fs.first_shared = vm_shared_fault;
	fs.msoftonly = 0;
	growstack = 1;

	/*
	 * VM_FAULT_UNSWAP - swap_pager_unswapped() needs an exclusive object
	 * VM_FAULT_DIRTY  - may require swap_pager_unswapped() later, but
	 *		     we can try shared first.
	 */
	if (fault_flags & VM_FAULT_UNSWAP) {
		fs.first_shared = 0;
	}

RetryFault:
	/*
	 * Find the vm_map_entry representing the backing store and resolve
	 * the top level object and page index.  This may have the side
	 * effect of executing a copy-on-write on the map entry and/or
	 * creating a shadow object, but will not COW any actual VM pages.
	 *
	 * On success fs.map is left read-locked and various other fields
	 * are initialized but not otherwise referenced or locked.
	 *
	 * NOTE!  vm_map_lookup will upgrade the fault_type to VM_FAULT_WRITE
	 *	  if the map entry is a virtual page table and also writable,
	 *	  so we can set the 'A' (accessed) bit in the virtual page
	 *	  table entry.
	 */
	fs.map = map;
	result = vm_map_lookup(&fs.map, vaddr, fault_type,
			       &fs.entry, &fs.first_object,
			       &first_pindex, &fs.first_prot, &fs.wflags);

	if (result != KERN_SUCCESS) {
		if (result == KERN_FAILURE_NOFAULT) {
			*errorp = KERN_FAILURE;
			fs.m = NULL;
			goto done;
		}
		if (result != KERN_PROTECTION_FAILURE ||
		    (fs.fault_flags & VM_FAULT_WIRE_MASK) != VM_FAULT_USER_WIRE)
		{
			if (result == KERN_INVALID_ADDRESS && growstack &&
			    map != &kernel_map && curproc != NULL) {
				result = vm_map_growstack(map, vaddr);
				if (result == KERN_SUCCESS) {
					growstack = 0;
					++retry;
					goto RetryFault;
				}
				result = KERN_FAILURE;
			}
			fs.m = NULL;
			*errorp = result;
			goto done;
		}

		/*
		 * If we are user-wiring a r/w segment, and it is COW, then
		 * we need to do the COW operation.  Note that we don't
		 * currently COW RO sections now, because it is NOT desirable
		 * to COW .text.  We simply keep .text from ever being COW'ed
		 * and take the heat that one cannot debug wired .text sections.
		 */
		result = vm_map_lookup(&fs.map, vaddr,
				       VM_PROT_READ|VM_PROT_WRITE|
				        VM_PROT_OVERRIDE_WRITE,
				       &fs.entry, &fs.first_object,
				       &first_pindex, &fs.first_prot,
				       &fs.wflags);
		if (result != KERN_SUCCESS) {
			/* could also be KERN_FAILURE_NOFAULT */
			*errorp = KERN_FAILURE;
			fs.m = NULL;
			goto done;
		}

		/*
		 * If we don't COW now, on a user wire, the user will never
		 * be able to write to the mapping.  If we don't make this
		 * restriction, the bookkeeping would be nearly impossible.
		 *
		 * XXX We have a shared lock, this will have a MP race but
		 * I don't see how it can hurt anything.
		 */
		if ((fs.entry->protection & VM_PROT_WRITE) == 0) {
			atomic_clear_char(&fs.entry->max_protection,
					  VM_PROT_WRITE);
		}
	}

	/*
	 * fs.map is read-locked
	 *
	 * Misc checks.  Save the map generation number to detect races.
	 */
	fs.map_generation = fs.map->timestamp;
	fs.lookup_still_valid = TRUE;
	fs.first_m = NULL;
	fs.object = fs.first_object;	/* so unlock_and_deallocate works */

	if (fs.entry->eflags & MAP_ENTRY_NOFAULT) {
		panic("vm_fault: fault on nofault entry, addr: %lx",
		      (u_long)vaddr);
	}

	/*
	 * A user-kernel shared map has no VM object and bypasses
	 * everything.  We execute the uksmap function with a temporary
	 * fictitious vm_page.  The address is directly mapped with no
	 * management.
	 */
	if (fs.entry->maptype == VM_MAPTYPE_UKSMAP) {
		struct vm_page fakem;

		bzero(&fakem, sizeof(fakem));
		fakem.pindex = first_pindex;
		fakem.flags = PG_FICTITIOUS | PG_UNMANAGED;
		fakem.busy_count = PBUSY_LOCKED;
		fakem.valid = VM_PAGE_BITS_ALL;
		fakem.pat_mode = VM_MEMATTR_DEFAULT;
		if (fs.entry->object.uksmap(fs.entry->aux.dev, &fakem)) {
			*errorp = KERN_FAILURE;
			fs.m = NULL;
			unlock_things(&fs);
			goto done2;
		}
		fs.m = PHYS_TO_VM_PAGE(fakem.phys_addr);
		vm_page_hold(fs.m);
		if (busyp)
			*busyp = 0;	/* don't need to busy R or W */
		unlock_things(&fs);
		*errorp = 0;
		goto done;
	}


	/*
	 * A system map entry may return a NULL object.  No object means
	 * no pager means an unrecoverable kernel fault.
	 */
	if (fs.first_object == NULL) {
		panic("vm_fault: unrecoverable fault at %p in entry %p",
		      (void *)vaddr, fs.entry);
	}

	/*
	 * Fail here if not a trivial anonymous page fault and TDF_NOFAULT
	 * is set.
	 *
	 * Unfortunately a deadlock can occur if we are forced to page-in
	 * from swap, but diving all the way into the vm_pager_get_page()
	 * function to find out is too much.  Just check the object type.
	 */
	if ((curthread->td_flags & TDF_NOFAULT) &&
	    (retry ||
	     fs.first_object->type == OBJT_VNODE ||
	     fs.first_object->type == OBJT_SWAP ||
	     fs.first_object->backing_object)) {
		*errorp = KERN_FAILURE;
		unlock_things(&fs);
		fs.m = NULL;
		goto done2;
	}

	/*
	 * If the entry is wired we cannot change the page protection.
	 */
	if (fs.wflags & FW_WIRED)
		fault_type = fs.first_prot;

	/*
	 * Make a reference to this object to prevent its disposal while we
	 * are messing with it.  Once we have the reference, the map is free
	 * to be diddled.  Since objects reference their shadows (and copies),
	 * they will stay around as well.
	 *
	 * The reference should also prevent an unexpected collapse of the
	 * parent that might move pages from the current object into the
	 * parent unexpectedly, resulting in corruption.
	 *
	 * Bump the paging-in-progress count to prevent size changes (e.g.
	 * truncation operations) during I/O.  This must be done after
	 * obtaining the vnode lock in order to avoid possible deadlocks.
	 */
	if (fs.first_shared)
		vm_object_hold_shared(fs.first_object);
	else
		vm_object_hold(fs.first_object);
	if (fs.vp == NULL)
		fs.vp = vnode_pager_lock(fs.first_object);	/* shared */

	/*
	 * The page we want is at (first_object, first_pindex), but if the
	 * vm_map_entry is VM_MAPTYPE_VPAGETABLE we have to traverse the
	 * page table to figure out the actual pindex.
	 *
	 * NOTE!  DEVELOPMENT IN PROGRESS, THIS IS AN INITIAL IMPLEMENTATION
	 * ONLY
	 */
	if (fs.entry->maptype == VM_MAPTYPE_VPAGETABLE) {
		result = vm_fault_vpagetable(&fs, &first_pindex,
					     fs.entry->aux.master_pde,
					     fault_type, 1);
		if (result == KERN_TRY_AGAIN) {
			vm_object_drop(fs.first_object);
			++retry;
			goto RetryFault;
		}
		if (result != KERN_SUCCESS) {
			*errorp = result;
			fs.m = NULL;
			goto done;
		}
	}

	/*
	 * Now we have the actual (object, pindex), fault in the page.  If
	 * vm_fault_object() fails it will unlock and deallocate the FS
	 * data.  If it succeeds everything remains locked and fs->object
	 * will have an additional PIP count if it is not equal to
	 * fs->first_object
	 */
	fs.m = NULL;
	result = vm_fault_object(&fs, first_pindex, fault_type, 1);

	if (result == KERN_TRY_AGAIN) {
		vm_object_drop(fs.first_object);
		++retry;
		didcow |= fs.wflags & FW_DIDCOW;
		goto RetryFault;
	}
	if (result != KERN_SUCCESS) {
		*errorp = result;
		fs.m = NULL;
		goto done;
	}

	if ((orig_fault_type & VM_PROT_WRITE) &&
	    (fs.prot & VM_PROT_WRITE) == 0) {
		*errorp = KERN_PROTECTION_FAILURE;
		unlock_and_deallocate(&fs);
		fs.m = NULL;
		goto done;
	}

	/*
	 * Generally speaking we don't want to update the pmap because
	 * this routine can be called many times for situations that do
	 * not require updating the pmap, not to mention the page might
	 * already be in the pmap.
	 *
	 * However, if our vm_map_lookup() results in a COW, we need to
	 * at least remove the pte from the pmap to guarantee proper
	 * visibility of modifications made to the process.  For example,
	 * modifications made by vkernel uiocopy/related routines and
	 * modifications made by ptrace().
	 */
	vm_page_flag_set(fs.m, PG_REFERENCED);
#if 0
	pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot,
		   fs.wflags & FW_WIRED, NULL);
	mycpu->gd_cnt.v_vm_faults++;
	if (curthread->td_lwp)
		++curthread->td_lwp->lwp_ru.ru_minflt;
#endif
	if ((fs.wflags | didcow) & FW_DIDCOW) {
		pmap_remove(fs.map->pmap,
			    vaddr & ~PAGE_MASK,
			    (vaddr & ~PAGE_MASK) + PAGE_SIZE);
	}

	/*
	 * On success vm_fault_object() does not unlock or deallocate, and fs.m
	 * will contain a busied page.  So we must unlock here after having
	 * messed with the pmap.
	 */
	unlock_things(&fs);

	/*
	 * Return a held page.  We are not doing any pmap manipulation so do
	 * not set PG_MAPPED.  However, adjust the page flags according to
	 * the fault type because the caller may not use a managed pmapping
	 * (so we don't want to lose the fact that the page will be dirtied
	 * if a write fault was specified).
	 */
	if (fault_type & VM_PROT_WRITE)
		vm_page_dirty(fs.m);
	vm_page_activate(fs.m);

	if (curthread->td_lwp) {
		if (fs.hardfault) {
			curthread->td_lwp->lwp_ru.ru_majflt++;
		} else {
			curthread->td_lwp->lwp_ru.ru_minflt++;
		}
	}

	/*
	 * Unlock everything, and return the held or busied page.
	 */
	if (busyp) {
		if (fault_type & VM_PROT_WRITE) {
			vm_page_dirty(fs.m);
			*busyp = 1;
		} else {
			*busyp = 0;
			vm_page_hold(fs.m);
			vm_page_wakeup(fs.m);
		}
	} else {
		vm_page_hold(fs.m);
		vm_page_wakeup(fs.m);
	}
	/*vm_object_deallocate(fs.first_object);*/
	/*fs.first_object = NULL; */
	*errorp = 0;

done:
	if (fs.first_object)
		vm_object_drop(fs.first_object);
done2:
	return(fs.m);
}

/*
 * Fault in the specified (object,offset), dirty the returned page as
 * needed.  If the requested fault_type cannot be satisfied, NULL is
 * returned and *errorp is set.
 *
 * A held (but not busied) page is returned.
 *
 * The passed in object must be held as specified by the shared
 * argument.
 */
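/*
 * Roughly, the sharedp in/out parameter below works as follows: the
 * caller indicates whether its hold on the object is shared (non-zero)
 * or exclusive (zero).  If the fault discovers that an exclusive hold
 * is required (e.g. VM_FAULT_UNSWAP, or a KERN_TRY_AGAIN that cleared
 * fs.first_shared), the hold is upgraded in place via vm_object_upgrade()
 * and *sharedp is updated so the caller knows the state of its hold on
 * return.
 */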
vm_page_t
vm_fault_object_page(vm_object_t object, vm_ooffset_t offset,
		     vm_prot_t fault_type, int fault_flags,
		     int *sharedp, int *errorp)
{
	int result;
	vm_pindex_t first_pindex;
	struct faultstate fs;
	struct vm_map_entry entry;

	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
	bzero(&entry, sizeof(entry));
	entry.object.vm_object = object;
	entry.maptype = VM_MAPTYPE_NORMAL;
	entry.protection = entry.max_protection = fault_type;

	fs.hardfault = 0;
	fs.fault_flags = fault_flags;
	fs.map = NULL;
	fs.shared = vm_shared_fault;
	fs.first_shared = *sharedp;
	fs.msoftonly = 0;
	fs.vp = NULL;
	KKASSERT((fault_flags & VM_FAULT_WIRE_MASK) == 0);

	/*
	 * VM_FAULT_UNSWAP - swap_pager_unswapped() needs an exclusive object
	 * VM_FAULT_DIRTY  - may require swap_pager_unswapped() later, but
	 *		     we can try shared first.
	 */
	if (fs.first_shared && (fault_flags & VM_FAULT_UNSWAP)) {
		fs.first_shared = 0;
		vm_object_upgrade(object);
	}

	/*
	 * Retry loop as needed (typically for shared->exclusive transitions)
	 */
RetryFault:
	*sharedp = fs.first_shared;
	first_pindex = OFF_TO_IDX(offset);
	fs.first_object = object;
	fs.entry = &entry;
	fs.first_prot = fault_type;
	fs.wflags = 0;
	/*fs.map_generation = 0; unused */

	/*
	 * Make a reference to this object to prevent its disposal while we
	 * are messing with it.  Once we have the reference, the map is free
	 * to be diddled.  Since objects reference their shadows (and copies),
	 * they will stay around as well.
	 *
	 * The reference should also prevent an unexpected collapse of the
	 * parent that might move pages from the current object into the
	 * parent unexpectedly, resulting in corruption.
	 *
	 * Bump the paging-in-progress count to prevent size changes (e.g.
	 * truncation operations) during I/O.  This must be done after
	 * obtaining the vnode lock in order to avoid possible deadlocks.
	 */
	if (fs.vp == NULL)
		fs.vp = vnode_pager_lock(fs.first_object);

	fs.lookup_still_valid = TRUE;
	fs.first_m = NULL;
	fs.object = fs.first_object;	/* so unlock_and_deallocate works */

#if 0
	/* XXX future - ability to operate on VM object using vpagetable */
	if (fs.entry->maptype == VM_MAPTYPE_VPAGETABLE) {
		result = vm_fault_vpagetable(&fs, &first_pindex,
					     fs.entry->aux.master_pde,
					     fault_type, 0);
		if (result == KERN_TRY_AGAIN) {
			if (fs.first_shared == 0 && *sharedp)
				vm_object_upgrade(object);
			goto RetryFault;
		}
		if (result != KERN_SUCCESS) {
			*errorp = result;
			return (NULL);
		}
	}
#endif

	/*
	 * Now we have the actual (object, pindex), fault in the page.  If
	 * vm_fault_object() fails it will unlock and deallocate the FS
	 * data.  If it succeeds everything remains locked and fs->object
	 * will have an additional PIP count if it is not equal to
	 * fs->first_object
	 *
	 * On KERN_TRY_AGAIN vm_fault_object() leaves fs.first_object intact.
	 * We may have to upgrade its lock to handle the requested fault.
	 */
	result = vm_fault_object(&fs, first_pindex, fault_type, 0);

	if (result == KERN_TRY_AGAIN) {
		if (fs.first_shared == 0 && *sharedp)
			vm_object_upgrade(object);
		goto RetryFault;
	}
	if (result != KERN_SUCCESS) {
		*errorp = result;
		return(NULL);
	}

	if ((fault_type & VM_PROT_WRITE) && (fs.prot & VM_PROT_WRITE) == 0) {
		*errorp = KERN_PROTECTION_FAILURE;
		unlock_and_deallocate(&fs);
		return(NULL);
	}

	/*
	 * On success vm_fault_object() does not unlock or deallocate, so we
	 * do it here.  Note that the returned fs.m will be busied.
	 */
	unlock_things(&fs);

	/*
	 * Return a held page.  We are not doing any pmap manipulation so do
	 * not set PG_MAPPED.  However, adjust the page flags according to
	 * the fault type because the caller may not use a managed pmapping
	 * (so we don't want to lose the fact that the page will be dirtied
	 * if a write fault was specified).
	 */
	vm_page_hold(fs.m);
	vm_page_activate(fs.m);
	if ((fault_type & VM_PROT_WRITE) || (fault_flags & VM_FAULT_DIRTY))
		vm_page_dirty(fs.m);
	if (fault_flags & VM_FAULT_UNSWAP)
		swap_pager_unswapped(fs.m);

	/*
	 * Indicate that the page was accessed.
	 */
	vm_page_flag_set(fs.m, PG_REFERENCED);

	if (curthread->td_lwp) {
		if (fs.hardfault) {
			curthread->td_lwp->lwp_ru.ru_majflt++;
		} else {
			curthread->td_lwp->lwp_ru.ru_minflt++;
		}
	}

	/*
	 * Unlock everything, and return the held page.
	 */
	vm_page_wakeup(fs.m);
	/*vm_object_deallocate(fs.first_object);*/
	/*fs.first_object = NULL; */

	*errorp = 0;
	return(fs.m);
}

/*
 * Translate the virtual page number (first_pindex) that is relative
 * to the address space into a logical page number that is relative to the
 * backing object.  Use the virtual page table pointed to by (vpte).
 *
 * Possibly downgrade the protection based on the vpte bits.
 *
 * This implements an N-level page table.  Any level can terminate the
 * scan by setting VPTE_PS.  A linear mapping is accomplished by setting
 * VPTE_PS in the master page directory entry set via mcontrol(MADV_SETMAP).
 */
static
int
vm_fault_vpagetable(struct faultstate *fs, vm_pindex_t *pindex,
		    vpte_t vpte, int fault_type, int allow_nofault)
{
	struct lwbuf *lwb;
	struct lwbuf lwb_cache;
	int vshift = VPTE_FRAME_END - PAGE_SHIFT; /* index bits remaining */
	int result;
	vpte_t *ptep;

	ASSERT_LWKT_TOKEN_HELD(vm_object_token(fs->first_object));
	for (;;) {
		/*
		 * We cannot proceed if the vpte is not valid, not readable
		 * for a read fault, not writable for a write fault, or
		 * not executable for an instruction execution fault.
		 */
		if ((vpte & VPTE_V) == 0) {
			unlock_and_deallocate(fs);
			return (KERN_FAILURE);
		}
		if ((fault_type & VM_PROT_WRITE) && (vpte & VPTE_RW) == 0) {
			unlock_and_deallocate(fs);
			return (KERN_FAILURE);
		}
		if ((fault_type & VM_PROT_EXECUTE) && (vpte & VPTE_NX)) {
			unlock_and_deallocate(fs);
			return (KERN_FAILURE);
		}
		if ((vpte & VPTE_PS) || vshift == 0)
			break;

		/*
		 * Get the page table page.  Nominally we only read the page
		 * table, but since we are actively setting VPTE_M and VPTE_A,
		 * tell vm_fault_object() that we are writing it.
		 *
		 * There is currently no real need to optimize this.
		 */
		result = vm_fault_object(fs, (vpte & VPTE_FRAME) >> PAGE_SHIFT,
					 VM_PROT_READ|VM_PROT_WRITE,
					 allow_nofault);
		if (result != KERN_SUCCESS)
			return (result);

		/*
		 * Process the returned fs.m and look up the page table
		 * entry in the page table page.
		 */
		vshift -= VPTE_PAGE_BITS;
		lwb = lwbuf_alloc(fs->m, &lwb_cache);
		ptep = ((vpte_t *)lwbuf_kva(lwb) +
			((*pindex >> vshift) & VPTE_PAGE_MASK));
		vm_page_activate(fs->m);

		/*
		 * Page table write-back - entire operation including
		 * validation of the pte must be atomic to avoid races
		 * against the vkernel changing the pte.
		 *
		 * If the vpte is valid for the requested operation, do
		 * a write-back to the page table.
		 *
		 * XXX VPTE_M is not set properly for page directory pages.
		 * It doesn't get set in the page directory if the page table
		 * is modified during a read access.
		 */
		for (;;) {
			vpte_t nvpte;

			/*
			 * Reload for the cmpset, but make sure the pte is
			 * still valid.
			 */
			vpte = *ptep;
			cpu_ccfence();
			nvpte = vpte;

			if ((vpte & VPTE_V) == 0)
				break;

			if ((fault_type & VM_PROT_WRITE) && (vpte & VPTE_RW))
				nvpte |= VPTE_M | VPTE_A;
			if (fault_type & (VM_PROT_READ | VM_PROT_EXECUTE))
				nvpte |= VPTE_A;
			if (vpte == nvpte)
				break;
			if (atomic_cmpset_long(ptep, vpte, nvpte)) {
				vm_page_dirty(fs->m);
				break;
			}
		}
		lwbuf_free(lwb);
		vm_page_flag_set(fs->m, PG_REFERENCED);
		vm_page_wakeup(fs->m);
		fs->m = NULL;
		cleanup_successful_fault(fs);
	}

	/*
	 * When the vkernel sets VPTE_RW it expects the real kernel to
	 * reflect VPTE_M back when the page is modified via the mapping.
	 * In order to accomplish this the real kernel must map the page
	 * read-only for read faults and use write faults to reflect VPTE_M
	 * back.
	 *
	 * Once VPTE_M has been set, the real kernel's pte allows writing.
	 * If the vkernel clears VPTE_M the vkernel must be sure to
	 * MADV_INVAL the real kernel's mappings to force the real kernel
	 * to re-fault on the next write so it can set VPTE_M again.
	 */
	if ((fault_type & VM_PROT_WRITE) == 0 &&
	    (vpte & (VPTE_RW | VPTE_M)) != (VPTE_RW | VPTE_M)) {
		fs->first_prot &= ~VM_PROT_WRITE;
	}

	/*
	 * Disable EXECUTE perms if NX bit is set.
	 */
	if (vpte & VPTE_NX)
		fs->first_prot &= ~VM_PROT_EXECUTE;

	/*
	 * Combine remaining address bits with the vpte.
	 */
	*pindex = ((vpte & VPTE_FRAME) >> PAGE_SHIFT) +
		  (*pindex & ((1L << vshift) - 1));
	return (KERN_SUCCESS);
}


/*
 * This is the core of the vm_fault code.
 *
 * Do all operations required to fault-in (fs.first_object, pindex).  Run
 * through the shadow chain as necessary and do required COW or virtual
 * copy operations.  The caller has already fully resolved the vm_map_entry
 * and, if appropriate, has created a copy-on-write layer.  All we need to
 * do is iterate the object chain.
 *
 * On failure (fs) is unlocked and deallocated and the caller may return or
 * retry depending on the failure code.  On success (fs) is NOT unlocked or
 * deallocated, fs.m will contain a resolved, busied page, and fs.object
 * will have an additional PIP count if it is not equal to fs.first_object.
 *
 * If locks based on fs->first_shared or fs->shared are insufficient,
 * clear the appropriate field(s) and return RETRY.  COWs require that
 * first_shared be 0, while page allocations (or frees) require that
 * shared be 0.  Renames require that both be 0.
 *
 * NOTE! fs->[first_]shared might be set with VM_FAULT_DIRTY also set.
 *	 we will have to retry with it exclusive if the vm_page is
 *	 PG_SWAPPED.
 *
 * fs->first_object must be held on call.
 */
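/*
 * Callers typically react to the retry contract above with a pattern
 * like the following (rough sketch, mirroring the call sites earlier in
 * this file):
 *
 *	result = vm_fault_object(&fs, first_pindex, fault_type, 1);
 *	if (result == KERN_TRY_AGAIN) {
 *		vm_object_drop(fs.first_object);
 *		++retry;
 *		goto RetryFault;
 *	}
 *
 * where the retry re-runs the lookup with whatever exclusivity this
 * function requested by clearing fs.first_shared and/or fs.shared.
 */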
static
int
vm_fault_object(struct faultstate *fs, vm_pindex_t first_pindex,
		vm_prot_t fault_type, int allow_nofault)
{
	vm_object_t next_object;
	vm_pindex_t pindex;
	int error;

	ASSERT_LWKT_TOKEN_HELD(vm_object_token(fs->first_object));
	fs->prot = fs->first_prot;
	fs->object = fs->first_object;
	pindex = first_pindex;

	vm_object_chain_acquire(fs->first_object, fs->shared);
	vm_object_pip_add(fs->first_object, 1);

	/*
	 * If a read fault occurs we try to upgrade the page protection
	 * and make it also writable if possible.  There are four cases
	 * where we cannot make the page mapping writable:
	 *
	 * (1) The mapping is read-only or the VM object is read-only,
	 *     fs->prot above will simply not have VM_PROT_WRITE set.
	 *
	 * (2) If the mapping is a virtual page table fs->first_prot will
	 *     have already been properly adjusted by vm_fault_vpagetable()
	 *     to detect writes so we can set VPTE_M in the virtual page
	 *     table.  Used by vkernels.
	 *
	 * (3) If the VM page is read-only or copy-on-write, upgrading would
	 *     just result in an unnecessary COW fault.
	 *
	 * (4) If the pmap specifically requests A/M bit emulation, downgrade
	 *     here.
	 */
#if 0
	/* see vpagetable code */
	if (fs->entry->maptype == VM_MAPTYPE_VPAGETABLE) {
		if ((fault_type & VM_PROT_WRITE) == 0)
			fs->prot &= ~VM_PROT_WRITE;
	}
#endif

	if (curthread->td_lwp && curthread->td_lwp->lwp_vmspace &&
	    pmap_emulate_ad_bits(&curthread->td_lwp->lwp_vmspace->vm_pmap)) {
		if ((fault_type & VM_PROT_WRITE) == 0)
			fs->prot &= ~VM_PROT_WRITE;
	}

	/* vm_object_hold(fs->object); implied b/c object == first_object */

	for (;;) {
		/*
		 * The entire backing chain from first_object to object
		 * inclusive is chainlocked.
		 *
		 * If the object is dead, we stop here
		 */
		if (fs->object->flags & OBJ_DEAD) {
			vm_object_pip_wakeup(fs->first_object);
			vm_object_chain_release_all(fs->first_object,
						    fs->object);
			if (fs->object != fs->first_object)
				vm_object_drop(fs->object);
			unlock_and_deallocate(fs);
			return (KERN_PROTECTION_FAILURE);
		}

		/*
		 * See if the page is resident.  Wait/Retry if the page is
		 * busy (lots of stuff may have changed so we can't continue
		 * in that case).
		 *
		 * We can theoretically allow the soft-busy case on a read
		 * fault if the page is marked valid, but since such
		 * pages are typically already pmap'd, putting that
		 * special case in might be more effort than it is
		 * worth.  We cannot under any circumstances mess
		 * around with a vm_page_t->busy page except, perhaps,
		 * to pmap it.
1781 */ 1782 fs->m = vm_page_lookup_busy_try(fs->object, pindex, 1783 TRUE, &error); 1784 if (error) { 1785 vm_object_pip_wakeup(fs->first_object); 1786 vm_object_chain_release_all(fs->first_object, 1787 fs->object); 1788 if (fs->object != fs->first_object) 1789 vm_object_drop(fs->object); 1790 unlock_things(fs); 1791 vm_page_sleep_busy(fs->m, TRUE, "vmpfw"); 1792 mycpu->gd_cnt.v_intrans++; 1793 /*vm_object_deallocate(fs->first_object);*/ 1794 /*fs->first_object = NULL;*/ 1795 fs->m = NULL; 1796 return (KERN_TRY_AGAIN); 1797 } 1798 if (fs->m) { 1799 /* 1800 * The page is busied for us. 1801 * 1802 * If reactivating a page from PQ_CACHE we may have 1803 * to rate-limit. 1804 */ 1805 int queue = fs->m->queue; 1806 vm_page_unqueue_nowakeup(fs->m); 1807 1808 if ((queue - fs->m->pc) == PQ_CACHE && 1809 vm_page_count_severe()) { 1810 vm_page_activate(fs->m); 1811 vm_page_wakeup(fs->m); 1812 fs->m = NULL; 1813 vm_object_pip_wakeup(fs->first_object); 1814 vm_object_chain_release_all(fs->first_object, 1815 fs->object); 1816 if (fs->object != fs->first_object) 1817 vm_object_drop(fs->object); 1818 unlock_and_deallocate(fs); 1819 if (allow_nofault == 0 || 1820 (curthread->td_flags & TDF_NOFAULT) == 0) { 1821 thread_t td; 1822 1823 vm_wait_pfault(); 1824 td = curthread; 1825 if (td->td_proc && (td->td_proc->p_flags & P_LOWMEMKILL)) 1826 return (KERN_PROTECTION_FAILURE); 1827 } 1828 return (KERN_TRY_AGAIN); 1829 } 1830 1831 /* 1832 * If it still isn't completely valid (readable), 1833 * or if a read-ahead-mark is set on the VM page, 1834 * jump to readrest, else we found the page and 1835 * can return. 1836 * 1837 * We can release the spl once we have marked the 1838 * page busy. 1839 */ 1840 if (fs->m->object != &kernel_object) { 1841 if ((fs->m->valid & VM_PAGE_BITS_ALL) != 1842 VM_PAGE_BITS_ALL) { 1843 goto readrest; 1844 } 1845 if (fs->m->flags & PG_RAM) { 1846 if (debug_cluster) 1847 kprintf("R"); 1848 vm_page_flag_clear(fs->m, PG_RAM); 1849 goto readrest; 1850 } 1851 } 1852 break; /* break to PAGE HAS BEEN FOUND */ 1853 } 1854 1855 /* 1856 * Page is not resident, If this is the search termination 1857 * or the pager might contain the page, allocate a new page. 1858 */ 1859 if (TRYPAGER(fs) || fs->object == fs->first_object) { 1860 /* 1861 * Allocating, must be exclusive. 1862 */ 1863 if (fs->object == fs->first_object && 1864 fs->first_shared) { 1865 fs->first_shared = 0; 1866 vm_object_pip_wakeup(fs->first_object); 1867 vm_object_chain_release_all(fs->first_object, 1868 fs->object); 1869 if (fs->object != fs->first_object) 1870 vm_object_drop(fs->object); 1871 unlock_and_deallocate(fs); 1872 return (KERN_TRY_AGAIN); 1873 } 1874 if (fs->object != fs->first_object && 1875 fs->shared) { 1876 fs->first_shared = 0; 1877 fs->shared = 0; 1878 vm_object_pip_wakeup(fs->first_object); 1879 vm_object_chain_release_all(fs->first_object, 1880 fs->object); 1881 if (fs->object != fs->first_object) 1882 vm_object_drop(fs->object); 1883 unlock_and_deallocate(fs); 1884 return (KERN_TRY_AGAIN); 1885 } 1886 1887 /* 1888 * If the page is beyond the object size we fail 1889 */ 1890 if (pindex >= fs->object->size) { 1891 vm_object_pip_wakeup(fs->first_object); 1892 vm_object_chain_release_all(fs->first_object, 1893 fs->object); 1894 if (fs->object != fs->first_object) 1895 vm_object_drop(fs->object); 1896 unlock_and_deallocate(fs); 1897 return (KERN_PROTECTION_FAILURE); 1898 } 1899 1900 /* 1901 * Allocate a new page for this object/offset pair. 
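 *
 * Note (informal): the vm_page_alloc() below only adds VM_ALLOC_ZERO
 * and VM_ALLOC_USE_GD when the object has neither a vnode nor a
 * backing object, i.e. when a zero-fill outcome is likely; otherwise
 * the contents are expected to come from the pager or from a deeper
 * object in the chain.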
1902 * 1903 * It is possible for the allocation to race, so 1904 * handle the case. 1905 */ 1906 fs->m = NULL; 1907 if (!vm_page_count_severe()) { 1908 fs->m = vm_page_alloc(fs->object, pindex, 1909 ((fs->vp || fs->object->backing_object) ? 1910 VM_ALLOC_NULL_OK | VM_ALLOC_NORMAL : 1911 VM_ALLOC_NULL_OK | VM_ALLOC_NORMAL | 1912 VM_ALLOC_USE_GD | VM_ALLOC_ZERO)); 1913 } 1914 if (fs->m == NULL) { 1915 vm_object_pip_wakeup(fs->first_object); 1916 vm_object_chain_release_all(fs->first_object, 1917 fs->object); 1918 if (fs->object != fs->first_object) 1919 vm_object_drop(fs->object); 1920 unlock_and_deallocate(fs); 1921 if (allow_nofault == 0 || 1922 (curthread->td_flags & TDF_NOFAULT) == 0) { 1923 thread_t td; 1924 1925 vm_wait_pfault(); 1926 td = curthread; 1927 if (td->td_proc && (td->td_proc->p_flags & P_LOWMEMKILL)) 1928 return (KERN_PROTECTION_FAILURE); 1929 } 1930 return (KERN_TRY_AGAIN); 1931 } 1932 1933 /* 1934 * Fall through to readrest. We have a new page which 1935 * will have to be paged (since m->valid will be 0). 1936 */ 1937 } 1938 1939 readrest: 1940 /* 1941 * We have found an invalid or partially valid page, a 1942 * page with a read-ahead mark which might be partially or 1943 * fully valid (and maybe dirty too), or we have allocated 1944 * a new page. 1945 * 1946 * Attempt to fault-in the page if there is a chance that the 1947 * pager has it, and potentially fault in additional pages 1948 * at the same time. 1949 * 1950 * If TRYPAGER is true then fs.m will be non-NULL and busied 1951 * for us. 1952 */ 1953 if (TRYPAGER(fs)) { 1954 int rv; 1955 int seqaccess; 1956 u_char behavior = vm_map_entry_behavior(fs->entry); 1957 1958 if (behavior == MAP_ENTRY_BEHAV_RANDOM) 1959 seqaccess = 0; 1960 else 1961 seqaccess = -1; 1962 1963 /* 1964 * Doing I/O may synchronously insert additional 1965 * pages so we can't be shared at this point either. 1966 * 1967 * NOTE: We can't free fs->m here in the allocated 1968 * case (fs->object != fs->first_object) as 1969 * this would require an exclusively locked 1970 * VM object. 1971 */ 1972 if (fs->object == fs->first_object && 1973 fs->first_shared) { 1974 vm_page_deactivate(fs->m); 1975 vm_page_wakeup(fs->m); 1976 fs->m = NULL; 1977 fs->first_shared = 0; 1978 vm_object_pip_wakeup(fs->first_object); 1979 vm_object_chain_release_all(fs->first_object, 1980 fs->object); 1981 if (fs->object != fs->first_object) 1982 vm_object_drop(fs->object); 1983 unlock_and_deallocate(fs); 1984 return (KERN_TRY_AGAIN); 1985 } 1986 if (fs->object != fs->first_object && 1987 fs->shared) { 1988 vm_page_deactivate(fs->m); 1989 vm_page_wakeup(fs->m); 1990 fs->m = NULL; 1991 fs->first_shared = 0; 1992 fs->shared = 0; 1993 vm_object_pip_wakeup(fs->first_object); 1994 vm_object_chain_release_all(fs->first_object, 1995 fs->object); 1996 if (fs->object != fs->first_object) 1997 vm_object_drop(fs->object); 1998 unlock_and_deallocate(fs); 1999 return (KERN_TRY_AGAIN); 2000 } 2001 2002 /* 2003 * Avoid deadlocking against the map when doing I/O. 2004 * fs.object and the page is BUSY'd. 2005 * 2006 * NOTE: Once unlocked, fs->entry can become stale 2007 * so this will NULL it out. 2008 * 2009 * NOTE: fs->entry is invalid until we relock the 2010 * map and verify that the timestamp has not 2011 * changed. 2012 */ 2013 unlock_map(fs); 2014 2015 /* 2016 * Acquire the page data. We still hold a ref on 2017 * fs.object and the page has been BUSY's. 2018 * 2019 * The pager may replace the page (for example, in 2020 * order to enter a fictitious page into the 2021 * object). 
If it does so it is responsible for 2022 * cleaning up the passed page and properly setting 2023 * the new page BUSY. 2024 * 2025 * If we got here through a PG_RAM read-ahead 2026 * mark the page may be partially dirty and thus 2027 * not freeable. Don't bother checking to see 2028 * if the pager has the page because we can't free 2029 * it anyway. We have to depend on the get_page 2030 * operation filling in any gaps whether there is 2031 * backing store or not. 2032 */ 2033 rv = vm_pager_get_page(fs->object, &fs->m, seqaccess); 2034 2035 if (rv == VM_PAGER_OK) { 2036 /* 2037 * Relookup in case pager changed page. Pager 2038 * is responsible for disposition of old page 2039 * if moved. 2040 * 2041 * XXX other code segments do relookups too. 2042 * It's a bad abstraction that needs to be 2043 * fixed/removed. 2044 */ 2045 fs->m = vm_page_lookup(fs->object, pindex); 2046 if (fs->m == NULL) { 2047 vm_object_pip_wakeup(fs->first_object); 2048 vm_object_chain_release_all( 2049 fs->first_object, fs->object); 2050 if (fs->object != fs->first_object) 2051 vm_object_drop(fs->object); 2052 unlock_and_deallocate(fs); 2053 return (KERN_TRY_AGAIN); 2054 } 2055 ++fs->hardfault; 2056 break; /* break to PAGE HAS BEEN FOUND */ 2057 } 2058 2059 /* 2060 * Remove the bogus page (which does not exist at this 2061 * object/offset); before doing so, we must get back 2062 * our object lock to preserve our invariant. 2063 * 2064 * Also wake up any other process that may want to bring 2065 * in this page. 2066 * 2067 * If this is the top-level object, we must leave the 2068 * busy page to prevent another process from rushing 2069 * past us, and inserting the page in that object at 2070 * the same time that we are. 2071 */ 2072 if (rv == VM_PAGER_ERROR) { 2073 if (curproc) { 2074 kprintf("vm_fault: pager read error, " 2075 "pid %d (%s)\n", 2076 curproc->p_pid, 2077 curproc->p_comm); 2078 } else { 2079 kprintf("vm_fault: pager read error, " 2080 "thread %p (%s)\n", 2081 curthread, 2082 curthread->td_comm); 2083 } 2084 } 2085 2086 /* 2087 * Data outside the range of the pager or an I/O error 2088 * 2089 * The page may have been wired during the pagein, 2090 * e.g. by the buffer cache, and cannot simply be 2091 * freed. Call vnode_pager_freepage() to deal with it. 2092 * 2093 * Also note that we cannot free the page if we are 2094 * holding the related object shared. XXX not sure 2095 * what to do in that case. 2096 */ 2097 if (fs->object != fs->first_object) { 2098 /* 2099 * Scrap the page. Check to see if the 2100 * vm_pager_get_page() call has already 2101 * dealt with it. 2102 */ 2103 if (fs->m) { 2104 vnode_pager_freepage(fs->m); 2105 fs->m = NULL; 2106 } 2107 2108 /* 2109 * XXX - we cannot just fall out at this 2110 * point, m has been freed and is invalid! 2111 */ 2112 } 2113 /* 2114 * XXX - the check for kernel_map is a kludge to work 2115 * around having the machine panic on a kernel space 2116 * fault w/ I/O error. 
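 *
 * Informal summary of how the pager result is dispatched here,
 * derived from the surrounding code:
 *
 *	VM_PAGER_OK	relookup fs->m, count a hard fault, page found.
 *	VM_PAGER_ERROR	log the error; outside kernel_map the page is
 *			freed or deactivated and KERN_FAILURE returned.
 *	VM_PAGER_BAD	page freed or deactivated, return
 *			KERN_PROTECTION_FAILURE.
 *	otherwise	fall through and try the backing object, or
 *			zero-fill if there is none.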
2117 */ 2118 if (((fs->map != &kernel_map) && 2119 (rv == VM_PAGER_ERROR)) || (rv == VM_PAGER_BAD)) { 2120 if (fs->m) { 2121 if (fs->first_shared) { 2122 vm_page_deactivate(fs->m); 2123 vm_page_wakeup(fs->m); 2124 } else { 2125 vnode_pager_freepage(fs->m); 2126 } 2127 fs->m = NULL; 2128 } 2129 vm_object_pip_wakeup(fs->first_object); 2130 vm_object_chain_release_all(fs->first_object, 2131 fs->object); 2132 if (fs->object != fs->first_object) 2133 vm_object_drop(fs->object); 2134 unlock_and_deallocate(fs); 2135 if (rv == VM_PAGER_ERROR) 2136 return (KERN_FAILURE); 2137 else 2138 return (KERN_PROTECTION_FAILURE); 2139 /* NOT REACHED */ 2140 } 2141 } 2142 2143 /* 2144 * We get here if the object has a default pager (or unwiring) 2145 * or the pager doesn't have the page. 2146 * 2147 * fs->first_m will be used for the COW unless we find a 2148 * deeper page to be mapped read-only, in which case the 2149 * unlock*(fs) will free first_m. 2150 */ 2151 if (fs->object == fs->first_object) 2152 fs->first_m = fs->m; 2153 2154 /* 2155 * Move on to the next object. The chain lock should prevent 2156 * the backing_object from getting ripped out from under us. 2157 * 2158 * The object lock for the next object is governed by 2159 * fs->shared. 2160 */ 2161 if ((next_object = fs->object->backing_object) != NULL) { 2162 if (fs->shared) 2163 vm_object_hold_shared(next_object); 2164 else 2165 vm_object_hold(next_object); 2166 vm_object_chain_acquire(next_object, fs->shared); 2167 KKASSERT(next_object == fs->object->backing_object); 2168 pindex += OFF_TO_IDX(fs->object->backing_object_offset); 2169 } 2170 2171 if (next_object == NULL) { 2172 /* 2173 * If there's no object left, fill the page in the top 2174 * object with zeros. 2175 */ 2176 if (fs->object != fs->first_object) { 2177 #if 0 2178 if (fs->first_object->backing_object != 2179 fs->object) { 2180 vm_object_hold(fs->first_object->backing_object); 2181 } 2182 #endif 2183 vm_object_chain_release_all( 2184 fs->first_object->backing_object, 2185 fs->object); 2186 #if 0 2187 if (fs->first_object->backing_object != 2188 fs->object) { 2189 vm_object_drop(fs->first_object->backing_object); 2190 } 2191 #endif 2192 vm_object_pip_wakeup(fs->object); 2193 vm_object_drop(fs->object); 2194 fs->object = fs->first_object; 2195 pindex = first_pindex; 2196 fs->m = fs->first_m; 2197 } 2198 fs->first_m = NULL; 2199 2200 /* 2201 * Zero the page and mark it valid. 2202 */ 2203 vm_page_zero_fill(fs->m); 2204 mycpu->gd_cnt.v_zfod++; 2205 fs->m->valid = VM_PAGE_BITS_ALL; 2206 break; /* break to PAGE HAS BEEN FOUND */ 2207 } 2208 if (fs->object != fs->first_object) { 2209 vm_object_pip_wakeup(fs->object); 2210 vm_object_lock_swap(); 2211 vm_object_drop(fs->object); 2212 } 2213 KASSERT(fs->object != next_object, 2214 ("object loop %p", next_object)); 2215 fs->object = next_object; 2216 vm_object_pip_add(fs->object, 1); 2217 } 2218 2219 /* 2220 * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock 2221 * is held.] 2222 * 2223 * object still held. 2224 * vm_map may not be locked (determined by fs->lookup_still_valid) 2225 * 2226 * local shared variable may be different from fs->shared. 2227 * 2228 * If the page is being written, but isn't already owned by the 2229 * top-level object, we have to copy it into a new page owned by the 2230 * top-level object. 
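 *
 * Informal sketch of the two write-fault COW paths implemented below
 * (a summary of the code, nothing new):
 *
 *	if (virtual_copy_ok(fs))
 *		move fs->m into first_object at first_pindex and free
 *		the old first_m (additionally counted in v_cow_optim).
 *	else
 *		vm_page_copy(fs->m, fs->first_m) and remove the stale
 *		read-only mapping from the current pmap.
 *
 * Either way the fault then continues with first_object/first_m and
 * the COW is counted in v_cow_faults.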
2231 */ 2232 KASSERT((fs->m->busy_count & PBUSY_LOCKED) != 0, 2233 ("vm_fault: not busy after main loop")); 2234 2235 if (fs->object != fs->first_object) { 2236 /* 2237 * We only really need to copy if we want to write it. 2238 */ 2239 if (fault_type & VM_PROT_WRITE) { 2240 /* 2241 * This allows pages to be virtually copied from a 2242 * backing_object into the first_object, where the 2243 * backing object has no other refs to it, and cannot 2244 * gain any more refs. Instead of a bcopy, we just 2245 * move the page from the backing object to the 2246 * first object. Note that we must mark the page 2247 * dirty in the first object so that it will go out 2248 * to swap when needed. 2249 */ 2250 if (virtual_copy_ok(fs)) { 2251 /* 2252 * (first_m) and (m) are both busied. We have 2253 * move (m) into (first_m)'s object/pindex 2254 * in an atomic fashion, then free (first_m). 2255 * 2256 * first_object is held so second remove 2257 * followed by the rename should wind 2258 * up being atomic. vm_page_free() might 2259 * block so we don't do it until after the 2260 * rename. 2261 */ 2262 vm_page_protect(fs->first_m, VM_PROT_NONE); 2263 vm_page_remove(fs->first_m); 2264 vm_page_rename(fs->m, fs->first_object, 2265 first_pindex); 2266 vm_page_free(fs->first_m); 2267 fs->first_m = fs->m; 2268 fs->m = NULL; 2269 mycpu->gd_cnt.v_cow_optim++; 2270 } else { 2271 /* 2272 * Oh, well, lets copy it. 2273 * 2274 * Why are we unmapping the original page 2275 * here? Well, in short, not all accessors 2276 * of user memory go through the pmap. The 2277 * procfs code doesn't have access user memory 2278 * via a local pmap, so vm_fault_page*() 2279 * can't call pmap_enter(). And the umtx*() 2280 * code may modify the COW'd page via a DMAP 2281 * or kernel mapping and not via the pmap, 2282 * leaving the original page still mapped 2283 * read-only into the pmap. 2284 * 2285 * So we have to remove the page from at 2286 * least the current pmap if it is in it. 2287 * 2288 * We used to just remove it from all pmaps 2289 * but that creates inefficiencies on SMP, 2290 * particularly for COW program & library 2291 * mappings that are concurrently exec'd. 2292 * Only remove the page from the current 2293 * pmap. 2294 */ 2295 KKASSERT(fs->first_shared == 0); 2296 vm_page_copy(fs->m, fs->first_m); 2297 /*vm_page_protect(fs->m, VM_PROT_NONE);*/ 2298 pmap_remove_specific( 2299 &curthread->td_lwp->lwp_vmspace->vm_pmap, 2300 fs->m); 2301 } 2302 2303 /* 2304 * We no longer need the old page or object. 2305 */ 2306 if (fs->m) 2307 release_page(fs); 2308 2309 /* 2310 * We intend to revert to first_object, undo the 2311 * chain lock through to that. 2312 */ 2313 #if 0 2314 if (fs->first_object->backing_object != fs->object) 2315 vm_object_hold(fs->first_object->backing_object); 2316 #endif 2317 vm_object_chain_release_all( 2318 fs->first_object->backing_object, 2319 fs->object); 2320 #if 0 2321 if (fs->first_object->backing_object != fs->object) 2322 vm_object_drop(fs->first_object->backing_object); 2323 #endif 2324 2325 /* 2326 * fs->object != fs->first_object due to above 2327 * conditional 2328 */ 2329 vm_object_pip_wakeup(fs->object); 2330 vm_object_drop(fs->object); 2331 2332 /* 2333 * Only use the new page below... 2334 */ 2335 mycpu->gd_cnt.v_cow_faults++; 2336 fs->m = fs->first_m; 2337 fs->object = fs->first_object; 2338 pindex = first_pindex; 2339 } else { 2340 /* 2341 * If it wasn't a write fault avoid having to copy 2342 * the page by mapping it read-only. 
2343 */ 2344 fs->prot &= ~VM_PROT_WRITE; 2345 } 2346 } 2347 2348 /* 2349 * Relock the map if necessary, then check the generation count. 2350 * relock_map() will update fs->timestamp to account for the 2351 * relocking if necessary. 2352 * 2353 * If the count has changed after relocking then all sorts of 2354 * crap may have happened and we have to retry. 2355 * 2356 * NOTE: The relock_map() can fail due to a deadlock against 2357 * the vm_page we are holding BUSY. 2358 */ 2359 if (fs->lookup_still_valid == FALSE && fs->map) { 2360 if (relock_map(fs) || 2361 fs->map->timestamp != fs->map_generation) { 2362 release_page(fs); 2363 vm_object_pip_wakeup(fs->first_object); 2364 vm_object_chain_release_all(fs->first_object, 2365 fs->object); 2366 if (fs->object != fs->first_object) 2367 vm_object_drop(fs->object); 2368 unlock_and_deallocate(fs); 2369 return (KERN_TRY_AGAIN); 2370 } 2371 } 2372 2373 /* 2374 * If the fault is a write, we know that this page is being 2375 * written NOW so dirty it explicitly to save on pmap_is_modified() 2376 * calls later. 2377 * 2378 * If this is a NOSYNC mmap we do not want to set PG_NOSYNC 2379 * if the page is already dirty to prevent data written with 2380 * the expectation of being synced from not being synced. 2381 * Likewise if this entry does not request NOSYNC then make 2382 * sure the page isn't marked NOSYNC. Applications sharing 2383 * data should use the same flags to avoid ping ponging. 2384 * 2385 * Also tell the backing pager, if any, that it should remove 2386 * any swap backing since the page is now dirty. 2387 */ 2388 vm_page_activate(fs->m); 2389 if (fs->prot & VM_PROT_WRITE) { 2390 vm_object_set_writeable_dirty(fs->m->object); 2391 vm_set_nosync(fs->m, fs->entry); 2392 if (fs->fault_flags & VM_FAULT_DIRTY) { 2393 vm_page_dirty(fs->m); 2394 if (fs->m->flags & PG_SWAPPED) { 2395 /* 2396 * If the page is swapped out we have to call 2397 * swap_pager_unswapped() which requires an 2398 * exclusive object lock. If we are shared, 2399 * we must clear the shared flag and retry. 2400 */ 2401 if ((fs->object == fs->first_object && 2402 fs->first_shared) || 2403 (fs->object != fs->first_object && 2404 fs->shared)) { 2405 vm_page_wakeup(fs->m); 2406 fs->m = NULL; 2407 if (fs->object == fs->first_object) 2408 fs->first_shared = 0; 2409 else 2410 fs->shared = 0; 2411 vm_object_pip_wakeup(fs->first_object); 2412 vm_object_chain_release_all( 2413 fs->first_object, fs->object); 2414 if (fs->object != fs->first_object) 2415 vm_object_drop(fs->object); 2416 unlock_and_deallocate(fs); 2417 return (KERN_TRY_AGAIN); 2418 } 2419 swap_pager_unswapped(fs->m); 2420 } 2421 } 2422 } 2423 2424 vm_object_pip_wakeup(fs->first_object); 2425 vm_object_chain_release_all(fs->first_object, fs->object); 2426 if (fs->object != fs->first_object) 2427 vm_object_drop(fs->object); 2428 2429 /* 2430 * Page had better still be busy. We are still locked up and 2431 * fs->object will have another PIP reference if it is not equal 2432 * to fs->first_object. 2433 */ 2434 KASSERT(fs->m->busy_count & PBUSY_LOCKED, 2435 ("vm_fault: page %p not busy!", fs->m)); 2436 2437 /* 2438 * Sanity check: page must be completely valid or it is not fit to 2439 * map into user space. vm_pager_get_pages() ensures this. 2440 */ 2441 if (fs->m->valid != VM_PAGE_BITS_ALL) { 2442 vm_page_zero_invalid(fs->m, TRUE); 2443 kprintf("Warning: page %p partially invalid on fault\n", fs->m); 2444 } 2445 2446 return (KERN_SUCCESS); 2447 } 2448 2449 /* 2450 * Wire down a range of virtual addresses in a map. 
The entry in question 2451 * should be marked in-transition and the map must be locked. We must 2452 * release the map temporarily while faulting-in the page to avoid a 2453 * deadlock. Note that the entry may be clipped while we are blocked but 2454 * will never be freed. 2455 * 2456 * No requirements. 2457 */ 2458 int 2459 vm_fault_wire(vm_map_t map, vm_map_entry_t entry, 2460 boolean_t user_wire, int kmflags) 2461 { 2462 boolean_t fictitious; 2463 vm_offset_t start; 2464 vm_offset_t end; 2465 vm_offset_t va; 2466 pmap_t pmap; 2467 int rv; 2468 int wire_prot; 2469 int fault_flags; 2470 vm_page_t m; 2471 2472 if (user_wire) { 2473 wire_prot = VM_PROT_READ; 2474 fault_flags = VM_FAULT_USER_WIRE; 2475 } else { 2476 wire_prot = VM_PROT_READ | VM_PROT_WRITE; 2477 fault_flags = VM_FAULT_CHANGE_WIRING; 2478 } 2479 if (kmflags & KM_NOTLBSYNC) 2480 wire_prot |= VM_PROT_NOSYNC; 2481 2482 pmap = vm_map_pmap(map); 2483 start = entry->start; 2484 end = entry->end; 2485 2486 switch(entry->maptype) { 2487 case VM_MAPTYPE_NORMAL: 2488 case VM_MAPTYPE_VPAGETABLE: 2489 fictitious = entry->object.vm_object && 2490 ((entry->object.vm_object->type == OBJT_DEVICE) || 2491 (entry->object.vm_object->type == OBJT_MGTDEVICE)); 2492 break; 2493 case VM_MAPTYPE_UKSMAP: 2494 fictitious = TRUE; 2495 break; 2496 default: 2497 fictitious = FALSE; 2498 break; 2499 } 2500 2501 if (entry->eflags & MAP_ENTRY_KSTACK) 2502 start += PAGE_SIZE; 2503 map->timestamp++; 2504 vm_map_unlock(map); 2505 2506 /* 2507 * We simulate a fault to get the page and enter it in the physical 2508 * map. 2509 */ 2510 for (va = start; va < end; va += PAGE_SIZE) { 2511 rv = vm_fault(map, va, wire_prot, fault_flags); 2512 if (rv) { 2513 while (va > start) { 2514 va -= PAGE_SIZE; 2515 m = pmap_unwire(pmap, va); 2516 if (m && !fictitious) { 2517 vm_page_busy_wait(m, FALSE, "vmwrpg"); 2518 vm_page_unwire(m, 1); 2519 vm_page_wakeup(m); 2520 } 2521 } 2522 goto done; 2523 } 2524 } 2525 rv = KERN_SUCCESS; 2526 done: 2527 vm_map_lock(map); 2528 2529 return (rv); 2530 } 2531 2532 /* 2533 * Unwire a range of virtual addresses in a map. The map should be 2534 * locked. 2535 */ 2536 void 2537 vm_fault_unwire(vm_map_t map, vm_map_entry_t entry) 2538 { 2539 boolean_t fictitious; 2540 vm_offset_t start; 2541 vm_offset_t end; 2542 vm_offset_t va; 2543 pmap_t pmap; 2544 vm_page_t m; 2545 2546 pmap = vm_map_pmap(map); 2547 start = entry->start; 2548 end = entry->end; 2549 fictitious = entry->object.vm_object && 2550 ((entry->object.vm_object->type == OBJT_DEVICE) || 2551 (entry->object.vm_object->type == OBJT_MGTDEVICE)); 2552 if (entry->eflags & MAP_ENTRY_KSTACK) 2553 start += PAGE_SIZE; 2554 2555 /* 2556 * Since the pages are wired down, we must be able to get their 2557 * mappings from the physical map system. 2558 */ 2559 for (va = start; va < end; va += PAGE_SIZE) { 2560 m = pmap_unwire(pmap, va); 2561 if (m && !fictitious) { 2562 vm_page_busy_wait(m, FALSE, "vmwrpg"); 2563 vm_page_unwire(m, 1); 2564 vm_page_wakeup(m); 2565 } 2566 } 2567 } 2568 2569 /* 2570 * Copy all of the pages from a wired-down map entry to another. 2571 * 2572 * The source and destination maps must be locked for write. 2573 * The source and destination maps token must be held 2574 * The source map entry must be wired down (or be a sharing map 2575 * entry corresponding to a main map entry that is wired down). 2576 * 2577 * No other requirements. 
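 *
 * Worked example (illustrative, assuming 4KiB pages): with
 * src_entry->offset == 0x3000, the page backing the destination
 * address (dst_entry->start + 0x2000) is allocated at dst pindex 2
 * and copied from src pindex OFF_TO_IDX(0x2000 + 0x3000) == 5.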
2578 * 2579 * XXX do segment optimization 2580 */ 2581 void 2582 vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, 2583 vm_map_entry_t dst_entry, vm_map_entry_t src_entry) 2584 { 2585 vm_object_t dst_object; 2586 vm_object_t src_object; 2587 vm_ooffset_t dst_offset; 2588 vm_ooffset_t src_offset; 2589 vm_prot_t prot; 2590 vm_offset_t vaddr; 2591 vm_page_t dst_m; 2592 vm_page_t src_m; 2593 2594 src_object = src_entry->object.vm_object; 2595 src_offset = src_entry->offset; 2596 2597 /* 2598 * Create the top-level object for the destination entry. (Doesn't 2599 * actually shadow anything - we copy the pages directly.) 2600 */ 2601 vm_map_entry_allocate_object(dst_entry); 2602 dst_object = dst_entry->object.vm_object; 2603 2604 prot = dst_entry->max_protection; 2605 2606 /* 2607 * Loop through all of the pages in the entry's range, copying each 2608 * one from the source object (it should be there) to the destination 2609 * object. 2610 */ 2611 vm_object_hold(src_object); 2612 vm_object_hold(dst_object); 2613 2614 for (vaddr = dst_entry->start, dst_offset = 0; 2615 vaddr < dst_entry->end; 2616 vaddr += PAGE_SIZE, dst_offset += PAGE_SIZE) { 2617 2618 /* 2619 * Allocate a page in the destination object 2620 */ 2621 do { 2622 dst_m = vm_page_alloc(dst_object, 2623 OFF_TO_IDX(dst_offset), 2624 VM_ALLOC_NORMAL); 2625 if (dst_m == NULL) { 2626 vm_wait(0); 2627 } 2628 } while (dst_m == NULL); 2629 2630 /* 2631 * Find the page in the source object, and copy it in. 2632 * (Because the source is wired down, the page will be in 2633 * memory.) 2634 */ 2635 src_m = vm_page_lookup(src_object, 2636 OFF_TO_IDX(dst_offset + src_offset)); 2637 if (src_m == NULL) 2638 panic("vm_fault_copy_wired: page missing"); 2639 2640 vm_page_copy(src_m, dst_m); 2641 2642 /* 2643 * Enter it in the pmap... 2644 */ 2645 pmap_enter(dst_map->pmap, vaddr, dst_m, prot, FALSE, dst_entry); 2646 2647 /* 2648 * Mark it no longer busy, and put it on the active list. 2649 */ 2650 vm_page_activate(dst_m); 2651 vm_page_wakeup(dst_m); 2652 } 2653 vm_object_drop(dst_object); 2654 vm_object_drop(src_object); 2655 } 2656 2657 #if 0 2658 2659 /* 2660 * This routine checks around the requested page for other pages that 2661 * might be able to be faulted in. This routine brackets the viable 2662 * pages for the pages to be paged in. 
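 *
 * Worked example (illustrative): with rbehind == 2, rahead == 3, the
 * neighboring pages absent, the indices inside the object, and every
 * allocation succeeding, marray ends up as
 *
 *	[pindex-2, pindex-1, pindex, pindex+1, pindex+2, pindex+3]
 *
 * with *reqpage == 2 and a return value of 6.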
2663 * 2664 * Inputs: 2665 * m, rbehind, rahead 2666 * 2667 * Outputs: 2668 * marray (array of vm_page_t), reqpage (index of requested page) 2669 * 2670 * Return value: 2671 * number of pages in marray 2672 */ 2673 static int 2674 vm_fault_additional_pages(vm_page_t m, int rbehind, int rahead, 2675 vm_page_t *marray, int *reqpage) 2676 { 2677 int i,j; 2678 vm_object_t object; 2679 vm_pindex_t pindex, startpindex, endpindex, tpindex; 2680 vm_page_t rtm; 2681 int cbehind, cahead; 2682 2683 object = m->object; 2684 pindex = m->pindex; 2685 2686 /* 2687 * we don't fault-ahead for device pager 2688 */ 2689 if ((object->type == OBJT_DEVICE) || 2690 (object->type == OBJT_MGTDEVICE)) { 2691 *reqpage = 0; 2692 marray[0] = m; 2693 return 1; 2694 } 2695 2696 /* 2697 * if the requested page is not available, then give up now 2698 */ 2699 if (!vm_pager_has_page(object, pindex, &cbehind, &cahead)) { 2700 *reqpage = 0; /* not used by caller, fix compiler warn */ 2701 return 0; 2702 } 2703 2704 if ((cbehind == 0) && (cahead == 0)) { 2705 *reqpage = 0; 2706 marray[0] = m; 2707 return 1; 2708 } 2709 2710 if (rahead > cahead) { 2711 rahead = cahead; 2712 } 2713 2714 if (rbehind > cbehind) { 2715 rbehind = cbehind; 2716 } 2717 2718 /* 2719 * Do not do any readahead if we have insufficient free memory. 2720 * 2721 * XXX code was broken disabled before and has instability 2722 * with this conditonal fixed, so shortcut for now. 2723 */ 2724 if (burst_fault == 0 || vm_page_count_severe()) { 2725 marray[0] = m; 2726 *reqpage = 0; 2727 return 1; 2728 } 2729 2730 /* 2731 * scan backward for the read behind pages -- in memory 2732 * 2733 * Assume that if the page is not found an interrupt will not 2734 * create it. Theoretically interrupts can only remove (busy) 2735 * pages, not create new associations. 2736 */ 2737 if (pindex > 0) { 2738 if (rbehind > pindex) { 2739 rbehind = pindex; 2740 startpindex = 0; 2741 } else { 2742 startpindex = pindex - rbehind; 2743 } 2744 2745 vm_object_hold(object); 2746 for (tpindex = pindex; tpindex > startpindex; --tpindex) { 2747 if (vm_page_lookup(object, tpindex - 1)) 2748 break; 2749 } 2750 2751 i = 0; 2752 while (tpindex < pindex) { 2753 rtm = vm_page_alloc(object, tpindex, VM_ALLOC_SYSTEM | 2754 VM_ALLOC_NULL_OK); 2755 if (rtm == NULL) { 2756 for (j = 0; j < i; j++) { 2757 vm_page_free(marray[j]); 2758 } 2759 vm_object_drop(object); 2760 marray[0] = m; 2761 *reqpage = 0; 2762 return 1; 2763 } 2764 marray[i] = rtm; 2765 ++i; 2766 ++tpindex; 2767 } 2768 vm_object_drop(object); 2769 } else { 2770 i = 0; 2771 } 2772 2773 /* 2774 * Assign requested page 2775 */ 2776 marray[i] = m; 2777 *reqpage = i; 2778 ++i; 2779 2780 /* 2781 * Scan forwards for read-ahead pages 2782 */ 2783 tpindex = pindex + 1; 2784 endpindex = tpindex + rahead; 2785 if (endpindex > object->size) 2786 endpindex = object->size; 2787 2788 vm_object_hold(object); 2789 while (tpindex < endpindex) { 2790 if (vm_page_lookup(object, tpindex)) 2791 break; 2792 rtm = vm_page_alloc(object, tpindex, VM_ALLOC_SYSTEM | 2793 VM_ALLOC_NULL_OK); 2794 if (rtm == NULL) 2795 break; 2796 marray[i] = rtm; 2797 ++i; 2798 ++tpindex; 2799 } 2800 vm_object_drop(object); 2801 2802 return (i); 2803 } 2804 2805 #endif 2806 2807 /* 2808 * vm_prefault() provides a quick way of clustering pagefaults into a 2809 * processes address space. It is a "cousin" of pmap_object_init_pt, 2810 * except it runs at page fault time instead of mmap time. 
2811 * 2812 * vm.fast_fault Enables pre-faulting zero-fill pages 2813 * 2814 * vm.prefault_pages Number of pages (1/2 negative, 1/2 positive) to 2815 * prefault. Scan stops in either direction when 2816 * a page is found to already exist. 2817 * 2818 * This code used to be per-platform pmap_prefault(). It is now 2819 * machine-independent and enhanced to also pre-fault zero-fill pages 2820 * (see vm.fast_fault) as well as make them writable, which greatly 2821 * reduces the number of page faults programs incur. 2822 * 2823 * Application performance when pre-faulting zero-fill pages is heavily 2824 * dependent on the application. Very tiny applications like /bin/echo 2825 * lose a little performance while applications of any appreciable size 2826 * gain performance. Prefaulting multiple pages also reduces SMP 2827 * congestion and can improve SMP performance significantly. 2828 * 2829 * NOTE! prot may allow writing but this only applies to the top level 2830 * object. If we wind up mapping a page extracted from a backing 2831 * object we have to make sure it is read-only. 2832 * 2833 * NOTE! The caller has already handled any COW operations on the 2834 * vm_map_entry via the normal fault code. Do NOT call this 2835 * shortcut unless the normal fault code has run on this entry. 2836 * 2837 * The related map must be locked. 2838 * No other requirements. 2839 */ 2840 __read_mostly static int vm_prefault_pages = 8; 2841 SYSCTL_INT(_vm, OID_AUTO, prefault_pages, CTLFLAG_RW, &vm_prefault_pages, 0, 2842 "Maximum number of pages to pre-fault"); 2843 __read_mostly static int vm_fast_fault = 1; 2844 SYSCTL_INT(_vm, OID_AUTO, fast_fault, CTLFLAG_RW, &vm_fast_fault, 0, 2845 "Burst fault zero-fill regions"); 2846 2847 /* 2848 * Set PG_NOSYNC if the map entry indicates so, but only if the page 2849 * is not already dirty by other means. This will prevent passive 2850 * filesystem syncing as well as 'sync' from writing out the page. 2851 */ 2852 static void 2853 vm_set_nosync(vm_page_t m, vm_map_entry_t entry) 2854 { 2855 if (entry->eflags & MAP_ENTRY_NOSYNC) { 2856 if (m->dirty == 0) 2857 vm_page_flag_set(m, PG_NOSYNC); 2858 } else { 2859 vm_page_flag_clear(m, PG_NOSYNC); 2860 } 2861 } 2862 2863 static void 2864 vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot, 2865 int fault_flags) 2866 { 2867 struct lwp *lp; 2868 vm_page_t m; 2869 vm_offset_t addr; 2870 vm_pindex_t index; 2871 vm_pindex_t pindex; 2872 vm_object_t object; 2873 int pprot; 2874 int i; 2875 int noneg; 2876 int nopos; 2877 int maxpages; 2878 2879 /* 2880 * Get stable max count value, disabled if set to 0 2881 */ 2882 maxpages = vm_prefault_pages; 2883 cpu_ccfence(); 2884 if (maxpages <= 0) 2885 return; 2886 2887 /* 2888 * We do not currently prefault mappings that use virtual page 2889 * tables. We do not prefault foreign pmaps. 2890 */ 2891 if (entry->maptype != VM_MAPTYPE_NORMAL) 2892 return; 2893 lp = curthread->td_lwp; 2894 if (lp == NULL || (pmap != vmspace_pmap(lp->lwp_vmspace))) 2895 return; 2896 2897 /* 2898 * Limit pre-fault count to 1024 pages. 2899 */ 2900 if (maxpages > 1024) 2901 maxpages = 1024; 2902 2903 object = entry->object.vm_object; 2904 KKASSERT(object != NULL); 2905 KKASSERT(object == entry->object.vm_object); 2906 2907 /* 2908 * NOTE: VM_FAULT_DIRTY allowed later so must hold object exclusively 2909 * now (or do something more complex XXX). 
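 *
 * Scan-order note (informal): the loop below alternates around addra.
 * With the default of 8 pages the probe order works out to +1, -1,
 * +2, -2, +3, -3, +4, -4 pages relative to the faulting address, and
 * the scan stops early in a direction once it leaves the map entry or
 * hits an already-mapped page.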
2910 */ 2911 vm_object_hold(object); 2912 vm_object_chain_acquire(object, 0); 2913 2914 noneg = 0; 2915 nopos = 0; 2916 for (i = 0; i < maxpages; ++i) { 2917 vm_object_t lobject; 2918 vm_object_t nobject; 2919 int allocated = 0; 2920 int error; 2921 2922 /* 2923 * This can eat a lot of time on a heavily contended 2924 * machine so yield on the tick if needed. 2925 */ 2926 if ((i & 7) == 7) 2927 lwkt_yield(); 2928 2929 /* 2930 * Calculate the page to pre-fault, stopping the scan in 2931 * each direction separately if the limit is reached. 2932 */ 2933 if (i & 1) { 2934 if (noneg) 2935 continue; 2936 addr = addra - ((i + 1) >> 1) * PAGE_SIZE; 2937 } else { 2938 if (nopos) 2939 continue; 2940 addr = addra + ((i + 2) >> 1) * PAGE_SIZE; 2941 } 2942 if (addr < entry->start) { 2943 noneg = 1; 2944 if (noneg && nopos) 2945 break; 2946 continue; 2947 } 2948 if (addr >= entry->end) { 2949 nopos = 1; 2950 if (noneg && nopos) 2951 break; 2952 continue; 2953 } 2954 2955 /* 2956 * Skip pages already mapped, and stop scanning in that 2957 * direction. When the scan terminates in both directions 2958 * we are done. 2959 */ 2960 if (pmap_prefault_ok(pmap, addr) == 0) { 2961 if (i & 1) 2962 noneg = 1; 2963 else 2964 nopos = 1; 2965 if (noneg && nopos) 2966 break; 2967 continue; 2968 } 2969 2970 /* 2971 * Follow the VM object chain to obtain the page to be mapped 2972 * into the pmap. 2973 * 2974 * If we reach the terminal object without finding a page 2975 * and we determine it would be advantageous, then allocate 2976 * a zero-fill page for the base object. The base object 2977 * is guaranteed to be OBJT_DEFAULT for this case. 2978 * 2979 * In order to not have to check the pager via *haspage*() 2980 * we stop if any non-default object is encountered. e.g. 2981 * a vnode or swap object would stop the loop. 2982 */ 2983 index = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; 2984 lobject = object; 2985 pindex = index; 2986 pprot = prot; 2987 2988 KKASSERT(lobject == entry->object.vm_object); 2989 /*vm_object_hold(lobject); implied */ 2990 2991 while ((m = vm_page_lookup_busy_try(lobject, pindex, 2992 TRUE, &error)) == NULL) { 2993 if (lobject->type != OBJT_DEFAULT) 2994 break; 2995 if (lobject->backing_object == NULL) { 2996 if (vm_fast_fault == 0) 2997 break; 2998 if ((prot & VM_PROT_WRITE) == 0 || 2999 vm_page_count_min(0)) { 3000 break; 3001 } 3002 3003 /* 3004 * NOTE: Allocated from base object 3005 */ 3006 m = vm_page_alloc(object, index, 3007 VM_ALLOC_NORMAL | 3008 VM_ALLOC_ZERO | 3009 VM_ALLOC_USE_GD | 3010 VM_ALLOC_NULL_OK); 3011 if (m == NULL) 3012 break; 3013 allocated = 1; 3014 pprot = prot; 3015 /* lobject = object .. not needed */ 3016 break; 3017 } 3018 if (lobject->backing_object_offset & PAGE_MASK) 3019 break; 3020 nobject = lobject->backing_object; 3021 vm_object_hold(nobject); 3022 KKASSERT(nobject == lobject->backing_object); 3023 pindex += lobject->backing_object_offset >> PAGE_SHIFT; 3024 if (lobject != object) { 3025 vm_object_lock_swap(); 3026 vm_object_drop(lobject); 3027 } 3028 lobject = nobject; 3029 pprot &= ~VM_PROT_WRITE; 3030 vm_object_chain_acquire(lobject, 0); 3031 } 3032 3033 /* 3034 * NOTE: A non-NULL (m) will be associated with lobject if 3035 * it was found there, otherwise it is probably a 3036 * zero-fill page associated with the base object. 3037 * 3038 * Give-up if no page is available. 
3039 */ 3040 if (m == NULL) { 3041 if (lobject != object) { 3042 #if 0 3043 if (object->backing_object != lobject) 3044 vm_object_hold(object->backing_object); 3045 #endif 3046 vm_object_chain_release_all( 3047 object->backing_object, lobject); 3048 #if 0 3049 if (object->backing_object != lobject) 3050 vm_object_drop(object->backing_object); 3051 #endif 3052 vm_object_drop(lobject); 3053 } 3054 break; 3055 } 3056 3057 /* 3058 * The object must be marked dirty if we are mapping a 3059 * writable page. m->object is either lobject or object, 3060 * both of which are still held. Do this before we 3061 * potentially drop the object. 3062 */ 3063 if (pprot & VM_PROT_WRITE) 3064 vm_object_set_writeable_dirty(m->object); 3065 3066 /* 3067 * Do not conditionalize on PG_RAM. If pages are present in 3068 * the VM system we assume optimal caching. If caching is 3069 * not optimal the I/O gravy train will be restarted when we 3070 * hit an unavailable page. We do not want to try to restart 3071 * the gravy train now because we really don't know how much 3072 * of the object has been cached. The cost for restarting 3073 * the gravy train should be low (since accesses will likely 3074 * be I/O bound anyway). 3075 */ 3076 if (lobject != object) { 3077 #if 0 3078 if (object->backing_object != lobject) 3079 vm_object_hold(object->backing_object); 3080 #endif 3081 vm_object_chain_release_all(object->backing_object, 3082 lobject); 3083 #if 0 3084 if (object->backing_object != lobject) 3085 vm_object_drop(object->backing_object); 3086 #endif 3087 vm_object_drop(lobject); 3088 } 3089 3090 /* 3091 * Enter the page into the pmap if appropriate. If we had 3092 * allocated the page we have to place it on a queue. If not 3093 * we just have to make sure it isn't on the cache queue 3094 * (pages on the cache queue are not allowed to be mapped). 3095 */ 3096 if (allocated) { 3097 /* 3098 * Page must be zerod. 3099 */ 3100 vm_page_zero_fill(m); 3101 mycpu->gd_cnt.v_zfod++; 3102 m->valid = VM_PAGE_BITS_ALL; 3103 3104 /* 3105 * Handle dirty page case 3106 */ 3107 if (pprot & VM_PROT_WRITE) 3108 vm_set_nosync(m, entry); 3109 pmap_enter(pmap, addr, m, pprot, 0, entry); 3110 mycpu->gd_cnt.v_vm_faults++; 3111 if (curthread->td_lwp) 3112 ++curthread->td_lwp->lwp_ru.ru_minflt; 3113 vm_page_deactivate(m); 3114 if (pprot & VM_PROT_WRITE) { 3115 /*vm_object_set_writeable_dirty(m->object);*/ 3116 vm_set_nosync(m, entry); 3117 if (fault_flags & VM_FAULT_DIRTY) { 3118 vm_page_dirty(m); 3119 /*XXX*/ 3120 swap_pager_unswapped(m); 3121 } 3122 } 3123 vm_page_wakeup(m); 3124 } else if (error) { 3125 /* couldn't busy page, no wakeup */ 3126 } else if ( 3127 ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && 3128 (m->flags & PG_FICTITIOUS) == 0) { 3129 /* 3130 * A fully valid page not undergoing soft I/O can 3131 * be immediately entered into the pmap. 
3132 */ 3133 if ((m->queue - m->pc) == PQ_CACHE) 3134 vm_page_deactivate(m); 3135 if (pprot & VM_PROT_WRITE) { 3136 /*vm_object_set_writeable_dirty(m->object);*/ 3137 vm_set_nosync(m, entry); 3138 if (fault_flags & VM_FAULT_DIRTY) { 3139 vm_page_dirty(m); 3140 /*XXX*/ 3141 swap_pager_unswapped(m); 3142 } 3143 } 3144 if (pprot & VM_PROT_WRITE) 3145 vm_set_nosync(m, entry); 3146 pmap_enter(pmap, addr, m, pprot, 0, entry); 3147 mycpu->gd_cnt.v_vm_faults++; 3148 if (curthread->td_lwp) 3149 ++curthread->td_lwp->lwp_ru.ru_minflt; 3150 vm_page_wakeup(m); 3151 } else { 3152 vm_page_wakeup(m); 3153 } 3154 } 3155 vm_object_chain_release(object); 3156 vm_object_drop(object); 3157 } 3158 3159 /* 3160 * Object can be held shared 3161 */ 3162 static void 3163 vm_prefault_quick(pmap_t pmap, vm_offset_t addra, 3164 vm_map_entry_t entry, int prot, int fault_flags) 3165 { 3166 struct lwp *lp; 3167 vm_page_t m; 3168 vm_offset_t addr; 3169 vm_pindex_t pindex; 3170 vm_object_t object; 3171 int i; 3172 int noneg; 3173 int nopos; 3174 int maxpages; 3175 3176 /* 3177 * Get stable max count value, disabled if set to 0 3178 */ 3179 maxpages = vm_prefault_pages; 3180 cpu_ccfence(); 3181 if (maxpages <= 0) 3182 return; 3183 3184 /* 3185 * We do not currently prefault mappings that use virtual page 3186 * tables. We do not prefault foreign pmaps. 3187 */ 3188 if (entry->maptype != VM_MAPTYPE_NORMAL) 3189 return; 3190 lp = curthread->td_lwp; 3191 if (lp == NULL || (pmap != vmspace_pmap(lp->lwp_vmspace))) 3192 return; 3193 object = entry->object.vm_object; 3194 if (object->backing_object != NULL) 3195 return; 3196 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); 3197 3198 /* 3199 * Limit pre-fault count to 1024 pages. 3200 */ 3201 if (maxpages > 1024) 3202 maxpages = 1024; 3203 3204 noneg = 0; 3205 nopos = 0; 3206 for (i = 0; i < maxpages; ++i) { 3207 int error; 3208 3209 /* 3210 * Calculate the page to pre-fault, stopping the scan in 3211 * each direction separately if the limit is reached. 3212 */ 3213 if (i & 1) { 3214 if (noneg) 3215 continue; 3216 addr = addra - ((i + 1) >> 1) * PAGE_SIZE; 3217 } else { 3218 if (nopos) 3219 continue; 3220 addr = addra + ((i + 2) >> 1) * PAGE_SIZE; 3221 } 3222 if (addr < entry->start) { 3223 noneg = 1; 3224 if (noneg && nopos) 3225 break; 3226 continue; 3227 } 3228 if (addr >= entry->end) { 3229 nopos = 1; 3230 if (noneg && nopos) 3231 break; 3232 continue; 3233 } 3234 3235 /* 3236 * Follow the VM object chain to obtain the page to be mapped 3237 * into the pmap. This version of the prefault code only 3238 * works with terminal objects. 3239 * 3240 * The page must already exist. If we encounter a problem 3241 * we stop here. 3242 * 3243 * WARNING! We cannot call swap_pager_unswapped() or insert 3244 * a new vm_page with a shared token. 3245 */ 3246 pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; 3247 3248 /* 3249 * Skip pages already mapped, and stop scanning in that 3250 * direction. When the scan terminates in both directions 3251 * we are done. 3252 */ 3253 if (pmap_prefault_ok(pmap, addr) == 0) { 3254 if (i & 1) 3255 noneg = 1; 3256 else 3257 nopos = 1; 3258 if (noneg && nopos) 3259 break; 3260 continue; 3261 } 3262 3263 /* 3264 * Shortcut the read-only mapping case using the far more 3265 * efficient vm_page_lookup_sbusy_try() function. This 3266 * allows us to acquire the page soft-busied only which 3267 * is especially nice for concurrent execs of the same 3268 * program. 
3269 * 3270 * The lookup function also validates page suitability 3271 * (all valid bits set, and not fictitious). 3272 * 3273 * If the page is in PQ_CACHE we have to fall-through 3274 * and hard-busy it so we can move it out of PQ_CACHE. 3275 */ 3276 if ((prot & VM_PROT_WRITE) == 0) { 3277 m = vm_page_lookup_sbusy_try(object, pindex, 3278 0, PAGE_SIZE); 3279 if (m == NULL) 3280 break; 3281 if ((m->queue - m->pc) != PQ_CACHE) { 3282 pmap_enter(pmap, addr, m, prot, 0, entry); 3283 mycpu->gd_cnt.v_vm_faults++; 3284 if (curthread->td_lwp) 3285 ++curthread->td_lwp->lwp_ru.ru_minflt; 3286 vm_page_sbusy_drop(m); 3287 continue; 3288 } 3289 vm_page_sbusy_drop(m); 3290 } 3291 3292 /* 3293 * Fallback to normal vm_page lookup code. This code 3294 * hard-busies the page. Not only that, but the page 3295 * can remain in that state for a significant period of 3296 * time due to pmap_enter()'s overhead. 3297 */ 3298 m = vm_page_lookup_busy_try(object, pindex, TRUE, &error); 3299 if (m == NULL || error) 3300 break; 3301 3302 /* 3303 * Stop if the page cannot be trivially entered into the 3304 * pmap. 3305 */ 3306 if (((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) || 3307 (m->flags & PG_FICTITIOUS) || 3308 ((m->flags & PG_SWAPPED) && 3309 (prot & VM_PROT_WRITE) && 3310 (fault_flags & VM_FAULT_DIRTY))) { 3311 vm_page_wakeup(m); 3312 break; 3313 } 3314 3315 /* 3316 * Enter the page into the pmap. The object might be held 3317 * shared so we can't do any (serious) modifying operation 3318 * on it. 3319 */ 3320 if ((m->queue - m->pc) == PQ_CACHE) 3321 vm_page_deactivate(m); 3322 if (prot & VM_PROT_WRITE) { 3323 vm_object_set_writeable_dirty(m->object); 3324 vm_set_nosync(m, entry); 3325 if (fault_flags & VM_FAULT_DIRTY) { 3326 vm_page_dirty(m); 3327 /* can't happen due to conditional above */ 3328 /* swap_pager_unswapped(m); */ 3329 } 3330 } 3331 pmap_enter(pmap, addr, m, prot, 0, entry); 3332 mycpu->gd_cnt.v_vm_faults++; 3333 if (curthread->td_lwp) 3334 ++curthread->td_lwp->lwp_ru.ru_minflt; 3335 vm_page_wakeup(m); 3336 } 3337 } 3338