1 /* 2 * (MPSAFE) 3 * 4 * Copyright (c) 1991, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * The Mach Operating System project at Carnegie-Mellon University. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * from: @(#)vm_map.c 8.3 (Berkeley) 1/12/94 35 * 36 * 37 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 38 * All rights reserved. 39 * 40 * Authors: Avadis Tevanian, Jr., Michael Wayne Young 41 * 42 * Permission to use, copy, modify and distribute this software and 43 * its documentation is hereby granted, provided that both the copyright 44 * notice and this permission notice appear in all copies of the 45 * software, derivative works or modified versions, and any portions 46 * thereof, and that both notices appear in supporting documentation. 47 * 48 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 49 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 50 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 51 * 52 * Carnegie Mellon requests users of this software to return to 53 * 54 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 55 * School of Computer Science 56 * Carnegie Mellon University 57 * Pittsburgh PA 15213-3890 58 * 59 * any improvements or extensions that they make and grant Carnegie the 60 * rights to redistribute these changes. 61 * 62 * $FreeBSD: src/sys/vm/vm_map.c,v 1.187.2.19 2003/05/27 00:47:02 alc Exp $ 63 */ 64 65 /* 66 * Virtual memory mapping module. 
67 */ 68 69 #include <sys/param.h> 70 #include <sys/systm.h> 71 #include <sys/kernel.h> 72 #include <sys/proc.h> 73 #include <sys/serialize.h> 74 #include <sys/lock.h> 75 #include <sys/vmmeter.h> 76 #include <sys/mman.h> 77 #include <sys/vnode.h> 78 #include <sys/resourcevar.h> 79 #include <sys/shm.h> 80 #include <sys/tree.h> 81 #include <sys/malloc.h> 82 83 #include <vm/vm.h> 84 #include <vm/vm_param.h> 85 #include <vm/pmap.h> 86 #include <vm/vm_map.h> 87 #include <vm/vm_page.h> 88 #include <vm/vm_object.h> 89 #include <vm/vm_pager.h> 90 #include <vm/vm_kern.h> 91 #include <vm/vm_extern.h> 92 #include <vm/swap_pager.h> 93 #include <vm/vm_zone.h> 94 95 #include <sys/thread2.h> 96 #include <sys/sysref2.h> 97 #include <sys/random.h> 98 #include <sys/sysctl.h> 99 100 /* 101 * Virtual memory maps provide for the mapping, protection, and sharing 102 * of virtual memory objects. In addition, this module provides for an 103 * efficient virtual copy of memory from one map to another. 104 * 105 * Synchronization is required prior to most operations. 106 * 107 * Maps consist of an ordered doubly-linked list of simple entries. 108 * A hint and a RB tree is used to speed-up lookups. 109 * 110 * Callers looking to modify maps specify start/end addresses which cause 111 * the related map entry to be clipped if necessary, and then later 112 * recombined if the pieces remained compatible. 113 * 114 * Virtual copy operations are performed by copying VM object references 115 * from one map to another, and then marking both regions as copy-on-write. 116 */ 117 static void vmspace_terminate(struct vmspace *vm); 118 static void vmspace_lock(struct vmspace *vm); 119 static void vmspace_unlock(struct vmspace *vm); 120 static void vmspace_dtor(void *obj, void *private); 121 122 MALLOC_DEFINE(M_VMSPACE, "vmspace", "vmspace objcache backingstore"); 123 124 struct sysref_class vmspace_sysref_class = { 125 .name = "vmspace", 126 .mtype = M_VMSPACE, 127 .proto = SYSREF_PROTO_VMSPACE, 128 .offset = offsetof(struct vmspace, vm_sysref), 129 .objsize = sizeof(struct vmspace), 130 .nom_cache = 32, 131 .flags = SRC_MANAGEDINIT, 132 .dtor = vmspace_dtor, 133 .ops = { 134 .terminate = (sysref_terminate_func_t)vmspace_terminate, 135 .lock = (sysref_lock_func_t)vmspace_lock, 136 .unlock = (sysref_lock_func_t)vmspace_unlock 137 } 138 }; 139 140 /* 141 * per-cpu page table cross mappings are initialized in early boot 142 * and might require a considerable number of vm_map_entry structures. 
143 */ 144 #define VMEPERCPU (MAXCPU+1) 145 146 static struct vm_zone mapentzone_store, mapzone_store; 147 static vm_zone_t mapentzone, mapzone; 148 static struct vm_object mapentobj, mapobj; 149 150 static struct vm_map_entry map_entry_init[MAX_MAPENT]; 151 static struct vm_map_entry cpu_map_entry_init[MAXCPU][VMEPERCPU]; 152 static struct vm_map map_init[MAX_KMAP]; 153 154 static int randomize_mmap; 155 SYSCTL_INT(_vm, OID_AUTO, randomize_mmap, CTLFLAG_RW, &randomize_mmap, 0, 156 "Randomize mmap offsets"); 157 static int vm_map_relock_enable = 1; 158 SYSCTL_INT(_vm, OID_AUTO, map_relock_enable, CTLFLAG_RW, 159 &vm_map_relock_enable, 0, "Randomize mmap offsets"); 160 161 static void vm_map_entry_shadow(vm_map_entry_t entry, int addref); 162 static vm_map_entry_t vm_map_entry_create(vm_map_t map, int *); 163 static void vm_map_entry_dispose (vm_map_t map, vm_map_entry_t entry, int *); 164 static void _vm_map_clip_end (vm_map_t, vm_map_entry_t, vm_offset_t, int *); 165 static void _vm_map_clip_start (vm_map_t, vm_map_entry_t, vm_offset_t, int *); 166 static void vm_map_entry_delete (vm_map_t, vm_map_entry_t, int *); 167 static void vm_map_entry_unwire (vm_map_t, vm_map_entry_t); 168 static void vm_map_copy_entry (vm_map_t, vm_map_t, vm_map_entry_t, 169 vm_map_entry_t); 170 static void vm_map_unclip_range (vm_map_t map, vm_map_entry_t start_entry, vm_offset_t start, vm_offset_t end, int *count, int flags); 171 172 /* 173 * Initialize the vm_map module. Must be called before any other vm_map 174 * routines. 175 * 176 * Map and entry structures are allocated from the general purpose 177 * memory pool with some exceptions: 178 * 179 * - The kernel map is allocated statically. 180 * - Initial kernel map entries are allocated out of a static pool. 181 * 182 * These restrictions are necessary since malloc() uses the 183 * maps and requires map entries. 184 * 185 * Called from the low level boot code only. 186 */ 187 void 188 vm_map_startup(void) 189 { 190 mapzone = &mapzone_store; 191 zbootinit(mapzone, "MAP", sizeof (struct vm_map), 192 map_init, MAX_KMAP); 193 mapentzone = &mapentzone_store; 194 zbootinit(mapentzone, "MAP ENTRY", sizeof (struct vm_map_entry), 195 map_entry_init, MAX_MAPENT); 196 } 197 198 /* 199 * Called prior to any vmspace allocations. 200 * 201 * Called from the low level boot code only. 202 */ 203 void 204 vm_init2(void) 205 { 206 zinitna(mapentzone, &mapentobj, NULL, 0, 0, 207 ZONE_USE_RESERVE | ZONE_SPECIAL, 1); 208 zinitna(mapzone, &mapobj, NULL, 0, 0, 0, 1); 209 pmap_init2(); 210 vm_object_init2(); 211 } 212 213 214 /* 215 * Red black tree functions 216 * 217 * The caller must hold the related map lock. 218 */ 219 static int rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b); 220 RB_GENERATE(vm_map_rb_tree, vm_map_entry, rb_entry, rb_vm_map_compare); 221 222 /* a->start is address, and the only field has to be initialized */ 223 static int 224 rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b) 225 { 226 if (a->start < b->start) 227 return(-1); 228 else if (a->start > b->start) 229 return(1); 230 return(0); 231 } 232 233 /* 234 * Allocate a vmspace structure, including a vm_map and pmap. 235 * Initialize numerous fields. While the initial allocation is zerod, 236 * subsequence reuse from the objcache leaves elements of the structure 237 * intact (particularly the pmap), so portions must be zerod. 238 * 239 * The structure is not considered activated until we call sysref_activate(). 240 * 241 * No requirements. 
242 */ 243 struct vmspace * 244 vmspace_alloc(vm_offset_t min, vm_offset_t max) 245 { 246 struct vmspace *vm; 247 248 vm = sysref_alloc(&vmspace_sysref_class); 249 bzero(&vm->vm_startcopy, 250 (char *)&vm->vm_endcopy - (char *)&vm->vm_startcopy); 251 vm_map_init(&vm->vm_map, min, max, NULL); /* initializes token */ 252 253 /* 254 * Use a hold to prevent any additional racing hold from terminating 255 * the vmspace before we manage to activate it. This also acquires 256 * the token for safety. 257 */ 258 KKASSERT(vm->vm_holdcount == 0); 259 KKASSERT(vm->vm_exitingcnt == 0); 260 vmspace_hold(vm); 261 pmap_pinit(vmspace_pmap(vm)); /* (some fields reused) */ 262 vm->vm_map.pmap = vmspace_pmap(vm); /* XXX */ 263 vm->vm_shm = NULL; 264 vm->vm_flags = 0; 265 cpu_vmspace_alloc(vm); 266 sysref_activate(&vm->vm_sysref); 267 vmspace_drop(vm); 268 269 return (vm); 270 } 271 272 /* 273 * Free a primary reference to a vmspace. This can trigger a 274 * stage-1 termination. 275 */ 276 void 277 vmspace_free(struct vmspace *vm) 278 { 279 /* 280 * We want all finalization to occur via vmspace_drop() so we 281 * need to hold the vm around the put. 282 */ 283 vmspace_hold(vm); 284 sysref_put(&vm->vm_sysref); 285 vmspace_drop(vm); 286 } 287 288 void 289 vmspace_ref(struct vmspace *vm) 290 { 291 sysref_get(&vm->vm_sysref); 292 } 293 294 void 295 vmspace_hold(struct vmspace *vm) 296 { 297 refcount_acquire(&vm->vm_holdcount); 298 lwkt_gettoken(&vm->vm_map.token); 299 } 300 301 void 302 vmspace_drop(struct vmspace *vm) 303 { 304 lwkt_reltoken(&vm->vm_map.token); 305 if (refcount_release(&vm->vm_holdcount)) { 306 if (vm->vm_exitingcnt == 0 && 307 sysref_isinactive(&vm->vm_sysref)) { 308 vmspace_terminate(vm); 309 } 310 } 311 } 312 313 /* 314 * dtor function - Some elements of the pmap are retained in the 315 * free-cached vmspaces to improve performance. We have to clean them up 316 * here before returning the vmspace to the memory pool. 317 * 318 * No requirements. 319 */ 320 static void 321 vmspace_dtor(void *obj, void *private) 322 { 323 struct vmspace *vm = obj; 324 325 pmap_puninit(vmspace_pmap(vm)); 326 } 327 328 /* 329 * Called in three cases: 330 * 331 * (1) When the last sysref is dropped and the vmspace becomes inactive. 332 * (holdcount will not be 0 because the vmspace is held through the op) 333 * 334 * (2) When exitingcount becomes 0 on the last reap 335 * (holdcount will not be 0 because the vmspace is held through the op) 336 * 337 * (3) When the holdcount becomes 0 in addition to the above two 338 * 339 * sysref will not scrap the object until we call sysref_put() once more 340 * after the last ref has been dropped. 341 * 342 * VMSPACE_EXIT1 flags the primary deactivation 343 * VMSPACE_EXIT2 flags the last reap 344 */ 345 static void 346 vmspace_terminate(struct vmspace *vm) 347 { 348 int count; 349 350 /* 351 * 352 */ 353 lwkt_gettoken(&vm->vm_map.token); 354 if ((vm->vm_flags & VMSPACE_EXIT1) == 0) { 355 vm->vm_flags |= VMSPACE_EXIT1; 356 shmexit(vm); 357 pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS, 358 VM_MAX_USER_ADDRESS); 359 vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS, 360 VM_MAX_USER_ADDRESS); 361 } 362 if ((vm->vm_flags & VMSPACE_EXIT2) == 0 && vm->vm_exitingcnt == 0) { 363 vm->vm_flags |= VMSPACE_EXIT2; 364 cpu_vmspace_free(vm); 365 shmexit(vm); 366 367 /* 368 * Lock the map, to wait out all other references to it. 369 * Delete all of the mappings and pages they hold, then call 370 * the pmap module to reclaim anything left. 
371 */ 372 count = vm_map_entry_reserve(MAP_RESERVE_COUNT); 373 vm_map_lock(&vm->vm_map); 374 vm_map_delete(&vm->vm_map, vm->vm_map.min_offset, 375 vm->vm_map.max_offset, &count); 376 vm_map_unlock(&vm->vm_map); 377 vm_map_entry_release(count); 378 379 lwkt_gettoken(&vmspace_pmap(vm)->pm_token); 380 pmap_release(vmspace_pmap(vm)); 381 lwkt_reltoken(&vmspace_pmap(vm)->pm_token); 382 } 383 384 lwkt_reltoken(&vm->vm_map.token); 385 if (vm->vm_exitingcnt == 0 && vm->vm_holdcount == 0) { 386 KKASSERT(vm->vm_flags & VMSPACE_EXIT1); 387 KKASSERT(vm->vm_flags & VMSPACE_EXIT2); 388 sysref_put(&vm->vm_sysref); 389 } 390 } 391 392 /* 393 * vmspaces are not currently locked. 394 */ 395 static void 396 vmspace_lock(struct vmspace *vm __unused) 397 { 398 } 399 400 static void 401 vmspace_unlock(struct vmspace *vm __unused) 402 { 403 } 404 405 /* 406 * This is called during exit indicating that the vmspace is no 407 * longer in used by an exiting process, but the process has not yet 408 * been reaped. 409 * 410 * No requirements. 411 */ 412 void 413 vmspace_exitbump(struct vmspace *vm) 414 { 415 vmspace_hold(vm); 416 ++vm->vm_exitingcnt; 417 vmspace_drop(vm); /* handles termination sequencing */ 418 } 419 420 /* 421 * Decrement the exitingcnt and issue the stage-2 termination if it becomes 422 * zero and the stage1 termination has already occured. 423 * 424 * No requirements. 425 */ 426 void 427 vmspace_exitfree(struct proc *p) 428 { 429 struct vmspace *vm; 430 431 vm = p->p_vmspace; 432 p->p_vmspace = NULL; 433 vmspace_hold(vm); 434 KKASSERT(vm->vm_exitingcnt > 0); 435 if (--vm->vm_exitingcnt == 0 && sysref_isinactive(&vm->vm_sysref)) 436 vmspace_terminate(vm); 437 vmspace_drop(vm); /* handles termination sequencing */ 438 } 439 440 /* 441 * Swap useage is determined by taking the proportional swap used by 442 * VM objects backing the VM map. To make up for fractional losses, 443 * if the VM object has any swap use at all the associated map entries 444 * count for at least 1 swap page. 445 * 446 * No requirements. 447 */ 448 int 449 vmspace_swap_count(struct vmspace *vm) 450 { 451 vm_map_t map = &vm->vm_map; 452 vm_map_entry_t cur; 453 vm_object_t object; 454 int count = 0; 455 int n; 456 457 vmspace_hold(vm); 458 for (cur = map->header.next; cur != &map->header; cur = cur->next) { 459 switch(cur->maptype) { 460 case VM_MAPTYPE_NORMAL: 461 case VM_MAPTYPE_VPAGETABLE: 462 if ((object = cur->object.vm_object) == NULL) 463 break; 464 if (object->swblock_count) { 465 n = (cur->end - cur->start) / PAGE_SIZE; 466 count += object->swblock_count * 467 SWAP_META_PAGES * n / object->size + 1; 468 } 469 break; 470 default: 471 break; 472 } 473 } 474 vmspace_drop(vm); 475 476 return(count); 477 } 478 479 /* 480 * Calculate the approximate number of anonymous pages in use by 481 * this vmspace. To make up for fractional losses, we count each 482 * VM object as having at least 1 anonymous page. 483 * 484 * No requirements. 
485 */ 486 int 487 vmspace_anonymous_count(struct vmspace *vm) 488 { 489 vm_map_t map = &vm->vm_map; 490 vm_map_entry_t cur; 491 vm_object_t object; 492 int count = 0; 493 494 vmspace_hold(vm); 495 for (cur = map->header.next; cur != &map->header; cur = cur->next) { 496 switch(cur->maptype) { 497 case VM_MAPTYPE_NORMAL: 498 case VM_MAPTYPE_VPAGETABLE: 499 if ((object = cur->object.vm_object) == NULL) 500 break; 501 if (object->type != OBJT_DEFAULT && 502 object->type != OBJT_SWAP) { 503 break; 504 } 505 count += object->resident_page_count; 506 break; 507 default: 508 break; 509 } 510 } 511 vmspace_drop(vm); 512 513 return(count); 514 } 515 516 /* 517 * Creates and returns a new empty VM map with the given physical map 518 * structure, and having the given lower and upper address bounds. 519 * 520 * No requirements. 521 */ 522 vm_map_t 523 vm_map_create(vm_map_t result, pmap_t pmap, vm_offset_t min, vm_offset_t max) 524 { 525 if (result == NULL) 526 result = zalloc(mapzone); 527 vm_map_init(result, min, max, pmap); 528 return (result); 529 } 530 531 /* 532 * Initialize an existing vm_map structure such as that in the vmspace 533 * structure. The pmap is initialized elsewhere. 534 * 535 * No requirements. 536 */ 537 void 538 vm_map_init(struct vm_map *map, vm_offset_t min, vm_offset_t max, pmap_t pmap) 539 { 540 map->header.next = map->header.prev = &map->header; 541 RB_INIT(&map->rb_root); 542 map->nentries = 0; 543 map->size = 0; 544 map->system_map = 0; 545 map->min_offset = min; 546 map->max_offset = max; 547 map->pmap = pmap; 548 map->first_free = &map->header; 549 map->hint = &map->header; 550 map->timestamp = 0; 551 map->flags = 0; 552 lwkt_token_init(&map->token, "vm_map"); 553 lockinit(&map->lock, "thrd_sleep", (hz + 9) / 10, 0); 554 TUNABLE_INT("vm.cache_vmspaces", &vmspace_sysref_class.nom_cache); 555 } 556 557 /* 558 * Shadow the vm_map_entry's object. This typically needs to be done when 559 * a write fault is taken on an entry which had previously been cloned by 560 * fork(). The shared object (which might be NULL) must become private so 561 * we add a shadow layer above it. 562 * 563 * Object allocation for anonymous mappings is defered as long as possible. 564 * When creating a shadow, however, the underlying object must be instantiated 565 * so it can be shared. 566 * 567 * If the map segment is governed by a virtual page table then it is 568 * possible to address offsets beyond the mapped area. Just allocate 569 * a maximally sized object for this case. 570 * 571 * The vm_map must be exclusively locked. 572 * No other requirements. 573 */ 574 static 575 void 576 vm_map_entry_shadow(vm_map_entry_t entry, int addref) 577 { 578 if (entry->maptype == VM_MAPTYPE_VPAGETABLE) { 579 vm_object_shadow(&entry->object.vm_object, &entry->offset, 580 0x7FFFFFFF, addref); /* XXX */ 581 } else { 582 vm_object_shadow(&entry->object.vm_object, &entry->offset, 583 atop(entry->end - entry->start), addref); 584 } 585 entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; 586 } 587 588 /* 589 * Allocate an object for a vm_map_entry. 590 * 591 * Object allocation for anonymous mappings is defered as long as possible. 592 * This function is called when we can defer no longer, generally when a map 593 * entry might be split or forked or takes a page fault. 594 * 595 * If the map segment is governed by a virtual page table then it is 596 * possible to address offsets beyond the mapped area. Just allocate 597 * a maximally sized object for this case. 598 * 599 * The vm_map must be exclusively locked. 
600 * No other requirements. 601 */ 602 void 603 vm_map_entry_allocate_object(vm_map_entry_t entry) 604 { 605 vm_object_t obj; 606 607 if (entry->maptype == VM_MAPTYPE_VPAGETABLE) { 608 obj = vm_object_allocate(OBJT_DEFAULT, 0x7FFFFFFF); /* XXX */ 609 } else { 610 obj = vm_object_allocate(OBJT_DEFAULT, 611 atop(entry->end - entry->start)); 612 } 613 entry->object.vm_object = obj; 614 entry->offset = 0; 615 } 616 617 /* 618 * Set an initial negative count so the first attempt to reserve 619 * space preloads a bunch of vm_map_entry's for this cpu. Also 620 * pre-allocate 2 vm_map_entries which will be needed by zalloc() to 621 * map a new page for vm_map_entry structures. SMP systems are 622 * particularly sensitive. 623 * 624 * This routine is called in early boot so we cannot just call 625 * vm_map_entry_reserve(). 626 * 627 * Called from the low level boot code only (for each cpu) 628 */ 629 void 630 vm_map_entry_reserve_cpu_init(globaldata_t gd) 631 { 632 vm_map_entry_t entry; 633 int i; 634 635 gd->gd_vme_avail -= MAP_RESERVE_COUNT * 2; 636 entry = &cpu_map_entry_init[gd->gd_cpuid][0]; 637 for (i = 0; i < VMEPERCPU; ++i, ++entry) { 638 entry->next = gd->gd_vme_base; 639 gd->gd_vme_base = entry; 640 } 641 } 642 643 /* 644 * Reserves vm_map_entry structures so code later on can manipulate 645 * map_entry structures within a locked map without blocking trying 646 * to allocate a new vm_map_entry. 647 * 648 * No requirements. 649 */ 650 int 651 vm_map_entry_reserve(int count) 652 { 653 struct globaldata *gd = mycpu; 654 vm_map_entry_t entry; 655 656 /* 657 * Make sure we have enough structures in gd_vme_base to handle 658 * the reservation request. 659 * 660 * The critical section protects access to the per-cpu gd. 661 */ 662 crit_enter(); 663 while (gd->gd_vme_avail < count) { 664 entry = zalloc(mapentzone); 665 entry->next = gd->gd_vme_base; 666 gd->gd_vme_base = entry; 667 ++gd->gd_vme_avail; 668 } 669 gd->gd_vme_avail -= count; 670 crit_exit(); 671 672 return(count); 673 } 674 675 /* 676 * Releases previously reserved vm_map_entry structures that were not 677 * used. If we have too much junk in our per-cpu cache clean some of 678 * it out. 679 * 680 * No requirements. 681 */ 682 void 683 vm_map_entry_release(int count) 684 { 685 struct globaldata *gd = mycpu; 686 vm_map_entry_t entry; 687 688 crit_enter(); 689 gd->gd_vme_avail += count; 690 while (gd->gd_vme_avail > MAP_RESERVE_SLOP) { 691 entry = gd->gd_vme_base; 692 KKASSERT(entry != NULL); 693 gd->gd_vme_base = entry->next; 694 --gd->gd_vme_avail; 695 crit_exit(); 696 zfree(mapentzone, entry); 697 crit_enter(); 698 } 699 crit_exit(); 700 } 701 702 /* 703 * Reserve map entry structures for use in kernel_map itself. These 704 * entries have *ALREADY* been reserved on a per-cpu basis when the map 705 * was inited. This function is used by zalloc() to avoid a recursion 706 * when zalloc() itself needs to allocate additional kernel memory. 707 * 708 * This function works like the normal reserve but does not load the 709 * vm_map_entry cache (because that would result in an infinite 710 * recursion). Note that gd_vme_avail may go negative. This is expected. 711 * 712 * Any caller of this function must be sure to renormalize after 713 * potentially eating entries to ensure that the reserve supply 714 * remains intact. 715 * 716 * No requirements. 
717 */ 718 int 719 vm_map_entry_kreserve(int count) 720 { 721 struct globaldata *gd = mycpu; 722 723 crit_enter(); 724 gd->gd_vme_avail -= count; 725 crit_exit(); 726 KASSERT(gd->gd_vme_base != NULL, 727 ("no reserved entries left, gd_vme_avail = %d", 728 gd->gd_vme_avail)); 729 return(count); 730 } 731 732 /* 733 * Release previously reserved map entries for kernel_map. We do not 734 * attempt to clean up like the normal release function as this would 735 * cause an unnecessary (but probably not fatal) deep procedure call. 736 * 737 * No requirements. 738 */ 739 void 740 vm_map_entry_krelease(int count) 741 { 742 struct globaldata *gd = mycpu; 743 744 crit_enter(); 745 gd->gd_vme_avail += count; 746 crit_exit(); 747 } 748 749 /* 750 * Allocates a VM map entry for insertion. No entry fields are filled in. 751 * 752 * The entries should have previously been reserved. The reservation count 753 * is tracked in (*countp). 754 * 755 * No requirements. 756 */ 757 static vm_map_entry_t 758 vm_map_entry_create(vm_map_t map, int *countp) 759 { 760 struct globaldata *gd = mycpu; 761 vm_map_entry_t entry; 762 763 KKASSERT(*countp > 0); 764 --*countp; 765 crit_enter(); 766 entry = gd->gd_vme_base; 767 KASSERT(entry != NULL, ("gd_vme_base NULL! count %d", *countp)); 768 gd->gd_vme_base = entry->next; 769 crit_exit(); 770 771 return(entry); 772 } 773 774 /* 775 * Dispose of a vm_map_entry that is no longer being referenced. 776 * 777 * No requirements. 778 */ 779 static void 780 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry, int *countp) 781 { 782 struct globaldata *gd = mycpu; 783 784 KKASSERT(map->hint != entry); 785 KKASSERT(map->first_free != entry); 786 787 ++*countp; 788 crit_enter(); 789 entry->next = gd->gd_vme_base; 790 gd->gd_vme_base = entry; 791 crit_exit(); 792 } 793 794 795 /* 796 * Insert/remove entries from maps. 797 * 798 * The related map must be exclusively locked. 799 * The caller must hold map->token 800 * No other requirements. 801 */ 802 static __inline void 803 vm_map_entry_link(vm_map_t map, 804 vm_map_entry_t after_where, 805 vm_map_entry_t entry) 806 { 807 ASSERT_VM_MAP_LOCKED(map); 808 809 map->nentries++; 810 entry->prev = after_where; 811 entry->next = after_where->next; 812 entry->next->prev = entry; 813 after_where->next = entry; 814 if (vm_map_rb_tree_RB_INSERT(&map->rb_root, entry)) 815 panic("vm_map_entry_link: dup addr map %p ent %p", map, entry); 816 } 817 818 static __inline void 819 vm_map_entry_unlink(vm_map_t map, 820 vm_map_entry_t entry) 821 { 822 vm_map_entry_t prev; 823 vm_map_entry_t next; 824 825 ASSERT_VM_MAP_LOCKED(map); 826 827 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { 828 panic("vm_map_entry_unlink: attempt to mess with " 829 "locked entry! %p", entry); 830 } 831 prev = entry->prev; 832 next = entry->next; 833 next->prev = prev; 834 prev->next = next; 835 vm_map_rb_tree_RB_REMOVE(&map->rb_root, entry); 836 map->nentries--; 837 } 838 839 /* 840 * Finds the map entry containing (or immediately preceding) the specified 841 * address in the given map. The entry is returned in (*entry). 842 * 843 * The boolean result indicates whether the address is actually contained 844 * in the map. 845 * 846 * The related map must be locked. 847 * No other requirements. 848 */ 849 boolean_t 850 vm_map_lookup_entry(vm_map_t map, vm_offset_t address, vm_map_entry_t *entry) 851 { 852 vm_map_entry_t tmp; 853 vm_map_entry_t last; 854 855 ASSERT_VM_MAP_LOCKED(map); 856 #if 0 857 /* 858 * XXX TEMPORARILY DISABLED. 
For some reason our attempt to revive 859 * the hint code with the red-black lookup meets with system crashes 860 * and lockups. We do not yet know why. 861 * 862 * It is possible that the problem is related to the setting 863 * of the hint during map_entry deletion, in the code specified 864 * at the GGG comment later on in this file. 865 * 866 * YYY More likely it's because this function can be called with 867 * a shared lock on the map, resulting in map->hint updates possibly 868 * racing. Fixed now but untested. 869 */ 870 /* 871 * Quickly check the cached hint, there's a good chance of a match. 872 */ 873 tmp = map->hint; 874 cpu_ccfence(); 875 if (tmp != &map->header) { 876 if (address >= tmp->start && address < tmp->end) { 877 *entry = tmp; 878 return(TRUE); 879 } 880 } 881 #endif 882 883 /* 884 * Locate the record from the top of the tree. 'last' tracks the 885 * closest prior record and is returned if no match is found, which 886 * in binary tree terms means tracking the most recent right-branch 887 * taken. If there is no prior record, &map->header is returned. 888 */ 889 last = &map->header; 890 tmp = RB_ROOT(&map->rb_root); 891 892 while (tmp) { 893 if (address >= tmp->start) { 894 if (address < tmp->end) { 895 *entry = tmp; 896 map->hint = tmp; 897 return(TRUE); 898 } 899 last = tmp; 900 tmp = RB_RIGHT(tmp, rb_entry); 901 } else { 902 tmp = RB_LEFT(tmp, rb_entry); 903 } 904 } 905 *entry = last; 906 return (FALSE); 907 } 908 909 /* 910 * Inserts the given whole VM object into the target map at the specified 911 * address range. The object's size should match that of the address range. 912 * 913 * The map must be exclusively locked. 914 * The object must be held. 915 * The caller must have reserved sufficient vm_map_entry structures. 916 * 917 * If object is non-NULL, ref count must be bumped by caller prior to 918 * making call to account for the new entry. 919 */ 920 int 921 vm_map_insert(vm_map_t map, int *countp, 922 vm_object_t object, vm_ooffset_t offset, 923 vm_offset_t start, vm_offset_t end, 924 vm_maptype_t maptype, 925 vm_prot_t prot, vm_prot_t max, 926 int cow) 927 { 928 vm_map_entry_t new_entry; 929 vm_map_entry_t prev_entry; 930 vm_map_entry_t temp_entry; 931 vm_eflags_t protoeflags; 932 int must_drop = 0; 933 934 ASSERT_VM_MAP_LOCKED(map); 935 if (object) 936 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); 937 938 /* 939 * Check that the start and end points are not bogus. 940 */ 941 if ((start < map->min_offset) || (end > map->max_offset) || 942 (start >= end)) 943 return (KERN_INVALID_ADDRESS); 944 945 /* 946 * Find the entry prior to the proposed starting address; if it's part 947 * of an existing entry, this range is bogus. 948 */ 949 if (vm_map_lookup_entry(map, start, &temp_entry)) 950 return (KERN_NO_SPACE); 951 952 prev_entry = temp_entry; 953 954 /* 955 * Assert that the next entry doesn't overlap the end point. 
956 */ 957 958 if ((prev_entry->next != &map->header) && 959 (prev_entry->next->start < end)) 960 return (KERN_NO_SPACE); 961 962 protoeflags = 0; 963 964 if (cow & MAP_COPY_ON_WRITE) 965 protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY; 966 967 if (cow & MAP_NOFAULT) { 968 protoeflags |= MAP_ENTRY_NOFAULT; 969 970 KASSERT(object == NULL, 971 ("vm_map_insert: paradoxical MAP_NOFAULT request")); 972 } 973 if (cow & MAP_DISABLE_SYNCER) 974 protoeflags |= MAP_ENTRY_NOSYNC; 975 if (cow & MAP_DISABLE_COREDUMP) 976 protoeflags |= MAP_ENTRY_NOCOREDUMP; 977 if (cow & MAP_IS_STACK) 978 protoeflags |= MAP_ENTRY_STACK; 979 if (cow & MAP_IS_KSTACK) 980 protoeflags |= MAP_ENTRY_KSTACK; 981 982 lwkt_gettoken(&map->token); 983 984 if (object) { 985 /* 986 * When object is non-NULL, it could be shared with another 987 * process. We have to set or clear OBJ_ONEMAPPING 988 * appropriately. 989 * 990 * NOTE: This flag is only applicable to DEFAULT and SWAP 991 * objects and will already be clear in other types 992 * of objects, so a shared object lock is ok for 993 * VNODE objects. 994 */ 995 if ((object->ref_count > 1) || (object->shadow_count != 0)) { 996 vm_object_clear_flag(object, OBJ_ONEMAPPING); 997 } 998 } 999 else if ((prev_entry != &map->header) && 1000 (prev_entry->eflags == protoeflags) && 1001 (prev_entry->end == start) && 1002 (prev_entry->wired_count == 0) && 1003 prev_entry->maptype == maptype && 1004 ((prev_entry->object.vm_object == NULL) || 1005 vm_object_coalesce(prev_entry->object.vm_object, 1006 OFF_TO_IDX(prev_entry->offset), 1007 (vm_size_t)(prev_entry->end - prev_entry->start), 1008 (vm_size_t)(end - prev_entry->end)))) { 1009 /* 1010 * We were able to extend the object. Determine if we 1011 * can extend the previous map entry to include the 1012 * new range as well. 1013 */ 1014 if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) && 1015 (prev_entry->protection == prot) && 1016 (prev_entry->max_protection == max)) { 1017 map->size += (end - prev_entry->end); 1018 prev_entry->end = end; 1019 vm_map_simplify_entry(map, prev_entry, countp); 1020 lwkt_reltoken(&map->token); 1021 return (KERN_SUCCESS); 1022 } 1023 1024 /* 1025 * If we can extend the object but cannot extend the 1026 * map entry, we have to create a new map entry. We 1027 * must bump the ref count on the extended object to 1028 * account for it. object may be NULL. 1029 */ 1030 object = prev_entry->object.vm_object; 1031 offset = prev_entry->offset + 1032 (prev_entry->end - prev_entry->start); 1033 if (object) { 1034 vm_object_hold(object); 1035 vm_object_chain_wait(object); 1036 vm_object_reference_locked(object); 1037 must_drop = 1; 1038 } 1039 } 1040 1041 /* 1042 * NOTE: if conditionals fail, object can be NULL here. This occurs 1043 * in things like the buffer map where we manage kva but do not manage 1044 * backing objects. 
1045 */ 1046 1047 /* 1048 * Create a new entry 1049 */ 1050 1051 new_entry = vm_map_entry_create(map, countp); 1052 new_entry->start = start; 1053 new_entry->end = end; 1054 1055 new_entry->maptype = maptype; 1056 new_entry->eflags = protoeflags; 1057 new_entry->object.vm_object = object; 1058 new_entry->offset = offset; 1059 new_entry->aux.master_pde = 0; 1060 1061 new_entry->inheritance = VM_INHERIT_DEFAULT; 1062 new_entry->protection = prot; 1063 new_entry->max_protection = max; 1064 new_entry->wired_count = 0; 1065 1066 /* 1067 * Insert the new entry into the list 1068 */ 1069 1070 vm_map_entry_link(map, prev_entry, new_entry); 1071 map->size += new_entry->end - new_entry->start; 1072 1073 /* 1074 * Update the free space hint. Entries cannot overlap. 1075 * An exact comparison is needed to avoid matching 1076 * against the map->header. 1077 */ 1078 if ((map->first_free == prev_entry) && 1079 (prev_entry->end == new_entry->start)) { 1080 map->first_free = new_entry; 1081 } 1082 1083 #if 0 1084 /* 1085 * Temporarily removed to avoid MAP_STACK panic, due to 1086 * MAP_STACK being a huge hack. Will be added back in 1087 * when MAP_STACK (and the user stack mapping) is fixed. 1088 */ 1089 /* 1090 * It may be possible to simplify the entry 1091 */ 1092 vm_map_simplify_entry(map, new_entry, countp); 1093 #endif 1094 1095 /* 1096 * Try to pre-populate the page table. Mappings governed by virtual 1097 * page tables cannot be prepopulated without a lot of work, so 1098 * don't try. 1099 */ 1100 if ((cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) && 1101 maptype != VM_MAPTYPE_VPAGETABLE) { 1102 int dorelock = 0; 1103 if (vm_map_relock_enable && (cow & MAP_PREFAULT_RELOCK)) { 1104 dorelock = 1; 1105 vm_object_lock_swap(); 1106 vm_object_drop(object); 1107 } 1108 pmap_object_init_pt(map->pmap, start, prot, 1109 object, OFF_TO_IDX(offset), end - start, 1110 cow & MAP_PREFAULT_PARTIAL); 1111 if (dorelock) { 1112 vm_object_hold(object); 1113 vm_object_lock_swap(); 1114 } 1115 } 1116 if (must_drop) 1117 vm_object_drop(object); 1118 1119 lwkt_reltoken(&map->token); 1120 return (KERN_SUCCESS); 1121 } 1122 1123 /* 1124 * Find sufficient space for `length' bytes in the given map, starting at 1125 * `start'. Returns 0 on success, 1 on no space. 1126 * 1127 * This function will returned an arbitrarily aligned pointer. If no 1128 * particular alignment is required you should pass align as 1. Note that 1129 * the map may return PAGE_SIZE aligned pointers if all the lengths used in 1130 * the map are a multiple of PAGE_SIZE, even if you pass a smaller align 1131 * argument. 1132 * 1133 * 'align' should be a power of 2 but is not required to be. 1134 * 1135 * The map must be exclusively locked. 1136 * No other requirements. 1137 */ 1138 int 1139 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length, 1140 vm_size_t align, int flags, vm_offset_t *addr) 1141 { 1142 vm_map_entry_t entry, next; 1143 vm_offset_t end; 1144 vm_offset_t align_mask; 1145 1146 if (start < map->min_offset) 1147 start = map->min_offset; 1148 if (start > map->max_offset) 1149 return (1); 1150 1151 /* 1152 * If the alignment is not a power of 2 we will have to use 1153 * a mod/division, set align_mask to a special value. 1154 */ 1155 if ((align | (align - 1)) + 1 != (align << 1)) 1156 align_mask = (vm_offset_t)-1; 1157 else 1158 align_mask = align - 1; 1159 1160 /* 1161 * Look for the first possible address; if there's already something 1162 * at this address, we have to start after it. 
1163 */ 1164 if (start == map->min_offset) { 1165 if ((entry = map->first_free) != &map->header) 1166 start = entry->end; 1167 } else { 1168 vm_map_entry_t tmp; 1169 1170 if (vm_map_lookup_entry(map, start, &tmp)) 1171 start = tmp->end; 1172 entry = tmp; 1173 } 1174 1175 /* 1176 * Look through the rest of the map, trying to fit a new region in the 1177 * gap between existing regions, or after the very last region. 1178 */ 1179 for (;; start = (entry = next)->end) { 1180 /* 1181 * Adjust the proposed start by the requested alignment, 1182 * be sure that we didn't wrap the address. 1183 */ 1184 if (align_mask == (vm_offset_t)-1) 1185 end = ((start + align - 1) / align) * align; 1186 else 1187 end = (start + align_mask) & ~align_mask; 1188 if (end < start) 1189 return (1); 1190 start = end; 1191 /* 1192 * Find the end of the proposed new region. Be sure we didn't 1193 * go beyond the end of the map, or wrap around the address. 1194 * Then check to see if this is the last entry or if the 1195 * proposed end fits in the gap between this and the next 1196 * entry. 1197 */ 1198 end = start + length; 1199 if (end > map->max_offset || end < start) 1200 return (1); 1201 next = entry->next; 1202 1203 /* 1204 * If the next entry's start address is beyond the desired 1205 * end address we may have found a good entry. 1206 * 1207 * If the next entry is a stack mapping we do not map into 1208 * the stack's reserved space. 1209 * 1210 * XXX continue to allow mapping into the stack's reserved 1211 * space if doing a MAP_STACK mapping inside a MAP_STACK 1212 * mapping, for backwards compatibility. But the caller 1213 * really should use MAP_STACK | MAP_TRYFIXED if they 1214 * want to do that. 1215 */ 1216 if (next == &map->header) 1217 break; 1218 if (next->start >= end) { 1219 if ((next->eflags & MAP_ENTRY_STACK) == 0) 1220 break; 1221 if (flags & MAP_STACK) 1222 break; 1223 if (next->start - next->aux.avail_ssize >= end) 1224 break; 1225 } 1226 } 1227 map->hint = entry; 1228 1229 /* 1230 * Grow the kernel_map if necessary. pmap_growkernel() will panic 1231 * if it fails. The kernel_map is locked and nothing can steal 1232 * our address space if pmap_growkernel() blocks. 1233 * 1234 * NOTE: This may be unconditionally called for kldload areas on 1235 * x86_64 because these do not bump kernel_vm_end (which would 1236 * fill 128G worth of page tables!). Therefore we must not 1237 * retry. 1238 */ 1239 if (map == &kernel_map) { 1240 vm_offset_t kstop; 1241 1242 kstop = round_page(start + length); 1243 if (kstop > kernel_vm_end) 1244 pmap_growkernel(start, kstop); 1245 } 1246 *addr = start; 1247 return (0); 1248 } 1249 1250 /* 1251 * vm_map_find finds an unallocated region in the target address map with 1252 * the given length and allocates it. The search is defined to be first-fit 1253 * from the specified address; the region found is returned in the same 1254 * parameter. 1255 * 1256 * If object is non-NULL, ref count must be bumped by caller 1257 * prior to making call to account for the new entry. 1258 * 1259 * No requirements. This function will lock the map temporarily. 
1260 */ 1261 int 1262 vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset, 1263 vm_offset_t *addr, vm_size_t length, vm_size_t align, 1264 boolean_t fitit, 1265 vm_maptype_t maptype, 1266 vm_prot_t prot, vm_prot_t max, 1267 int cow) 1268 { 1269 vm_offset_t start; 1270 int result; 1271 int count; 1272 1273 start = *addr; 1274 1275 count = vm_map_entry_reserve(MAP_RESERVE_COUNT); 1276 vm_map_lock(map); 1277 if (object) 1278 vm_object_hold(object); 1279 if (fitit) { 1280 if (vm_map_findspace(map, start, length, align, 0, addr)) { 1281 if (object) 1282 vm_object_drop(object); 1283 vm_map_unlock(map); 1284 vm_map_entry_release(count); 1285 return (KERN_NO_SPACE); 1286 } 1287 start = *addr; 1288 } 1289 result = vm_map_insert(map, &count, object, offset, 1290 start, start + length, 1291 maptype, 1292 prot, max, 1293 cow); 1294 if (object) 1295 vm_object_drop(object); 1296 vm_map_unlock(map); 1297 vm_map_entry_release(count); 1298 1299 return (result); 1300 } 1301 1302 /* 1303 * Simplify the given map entry by merging with either neighbor. This 1304 * routine also has the ability to merge with both neighbors. 1305 * 1306 * This routine guarentees that the passed entry remains valid (though 1307 * possibly extended). When merging, this routine may delete one or 1308 * both neighbors. No action is taken on entries which have their 1309 * in-transition flag set. 1310 * 1311 * The map must be exclusively locked. 1312 */ 1313 void 1314 vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry, int *countp) 1315 { 1316 vm_map_entry_t next, prev; 1317 vm_size_t prevsize, esize; 1318 1319 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { 1320 ++mycpu->gd_cnt.v_intrans_coll; 1321 return; 1322 } 1323 1324 if (entry->maptype == VM_MAPTYPE_SUBMAP) 1325 return; 1326 1327 prev = entry->prev; 1328 if (prev != &map->header) { 1329 prevsize = prev->end - prev->start; 1330 if ( (prev->end == entry->start) && 1331 (prev->maptype == entry->maptype) && 1332 (prev->object.vm_object == entry->object.vm_object) && 1333 (!prev->object.vm_object || 1334 (prev->offset + prevsize == entry->offset)) && 1335 (prev->eflags == entry->eflags) && 1336 (prev->protection == entry->protection) && 1337 (prev->max_protection == entry->max_protection) && 1338 (prev->inheritance == entry->inheritance) && 1339 (prev->wired_count == entry->wired_count)) { 1340 if (map->first_free == prev) 1341 map->first_free = entry; 1342 if (map->hint == prev) 1343 map->hint = entry; 1344 vm_map_entry_unlink(map, prev); 1345 entry->start = prev->start; 1346 entry->offset = prev->offset; 1347 if (prev->object.vm_object) 1348 vm_object_deallocate(prev->object.vm_object); 1349 vm_map_entry_dispose(map, prev, countp); 1350 } 1351 } 1352 1353 next = entry->next; 1354 if (next != &map->header) { 1355 esize = entry->end - entry->start; 1356 if ((entry->end == next->start) && 1357 (next->maptype == entry->maptype) && 1358 (next->object.vm_object == entry->object.vm_object) && 1359 (!entry->object.vm_object || 1360 (entry->offset + esize == next->offset)) && 1361 (next->eflags == entry->eflags) && 1362 (next->protection == entry->protection) && 1363 (next->max_protection == entry->max_protection) && 1364 (next->inheritance == entry->inheritance) && 1365 (next->wired_count == entry->wired_count)) { 1366 if (map->first_free == next) 1367 map->first_free = entry; 1368 if (map->hint == next) 1369 map->hint = entry; 1370 vm_map_entry_unlink(map, next); 1371 entry->end = next->end; 1372 if (next->object.vm_object) 1373 
vm_object_deallocate(next->object.vm_object); 1374 vm_map_entry_dispose(map, next, countp); 1375 } 1376 } 1377 } 1378 1379 /* 1380 * Asserts that the given entry begins at or after the specified address. 1381 * If necessary, it splits the entry into two. 1382 */ 1383 #define vm_map_clip_start(map, entry, startaddr, countp) \ 1384 { \ 1385 if (startaddr > entry->start) \ 1386 _vm_map_clip_start(map, entry, startaddr, countp); \ 1387 } 1388 1389 /* 1390 * This routine is called only when it is known that the entry must be split. 1391 * 1392 * The map must be exclusively locked. 1393 */ 1394 static void 1395 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start, 1396 int *countp) 1397 { 1398 vm_map_entry_t new_entry; 1399 1400 /* 1401 * Split off the front portion -- note that we must insert the new 1402 * entry BEFORE this one, so that this entry has the specified 1403 * starting address. 1404 */ 1405 1406 vm_map_simplify_entry(map, entry, countp); 1407 1408 /* 1409 * If there is no object backing this entry, we might as well create 1410 * one now. If we defer it, an object can get created after the map 1411 * is clipped, and individual objects will be created for the split-up 1412 * map. This is a bit of a hack, but is also about the best place to 1413 * put this improvement. 1414 */ 1415 if (entry->object.vm_object == NULL && !map->system_map) { 1416 vm_map_entry_allocate_object(entry); 1417 } 1418 1419 new_entry = vm_map_entry_create(map, countp); 1420 *new_entry = *entry; 1421 1422 new_entry->end = start; 1423 entry->offset += (start - entry->start); 1424 entry->start = start; 1425 1426 vm_map_entry_link(map, entry->prev, new_entry); 1427 1428 switch(entry->maptype) { 1429 case VM_MAPTYPE_NORMAL: 1430 case VM_MAPTYPE_VPAGETABLE: 1431 if (new_entry->object.vm_object) { 1432 vm_object_hold(new_entry->object.vm_object); 1433 vm_object_chain_wait(new_entry->object.vm_object); 1434 vm_object_reference_locked(new_entry->object.vm_object); 1435 vm_object_drop(new_entry->object.vm_object); 1436 } 1437 break; 1438 default: 1439 break; 1440 } 1441 } 1442 1443 /* 1444 * Asserts that the given entry ends at or before the specified address. 1445 * If necessary, it splits the entry into two. 1446 * 1447 * The map must be exclusively locked. 1448 */ 1449 #define vm_map_clip_end(map, entry, endaddr, countp) \ 1450 { \ 1451 if (endaddr < entry->end) \ 1452 _vm_map_clip_end(map, entry, endaddr, countp); \ 1453 } 1454 1455 /* 1456 * This routine is called only when it is known that the entry must be split. 1457 * 1458 * The map must be exclusively locked. 1459 */ 1460 static void 1461 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end, 1462 int *countp) 1463 { 1464 vm_map_entry_t new_entry; 1465 1466 /* 1467 * If there is no object backing this entry, we might as well create 1468 * one now. If we defer it, an object can get created after the map 1469 * is clipped, and individual objects will be created for the split-up 1470 * map. This is a bit of a hack, but is also about the best place to 1471 * put this improvement. 
1472 */ 1473 1474 if (entry->object.vm_object == NULL && !map->system_map) { 1475 vm_map_entry_allocate_object(entry); 1476 } 1477 1478 /* 1479 * Create a new entry and insert it AFTER the specified entry 1480 */ 1481 1482 new_entry = vm_map_entry_create(map, countp); 1483 *new_entry = *entry; 1484 1485 new_entry->start = entry->end = end; 1486 new_entry->offset += (end - entry->start); 1487 1488 vm_map_entry_link(map, entry, new_entry); 1489 1490 switch(entry->maptype) { 1491 case VM_MAPTYPE_NORMAL: 1492 case VM_MAPTYPE_VPAGETABLE: 1493 if (new_entry->object.vm_object) { 1494 vm_object_hold(new_entry->object.vm_object); 1495 vm_object_chain_wait(new_entry->object.vm_object); 1496 vm_object_reference_locked(new_entry->object.vm_object); 1497 vm_object_drop(new_entry->object.vm_object); 1498 } 1499 break; 1500 default: 1501 break; 1502 } 1503 } 1504 1505 /* 1506 * Asserts that the starting and ending region addresses fall within the 1507 * valid range for the map. 1508 */ 1509 #define VM_MAP_RANGE_CHECK(map, start, end) \ 1510 { \ 1511 if (start < vm_map_min(map)) \ 1512 start = vm_map_min(map); \ 1513 if (end > vm_map_max(map)) \ 1514 end = vm_map_max(map); \ 1515 if (start > end) \ 1516 start = end; \ 1517 } 1518 1519 /* 1520 * Used to block when an in-transition collison occurs. The map 1521 * is unlocked for the sleep and relocked before the return. 1522 */ 1523 void 1524 vm_map_transition_wait(vm_map_t map) 1525 { 1526 tsleep_interlock(map, 0); 1527 vm_map_unlock(map); 1528 tsleep(map, PINTERLOCKED, "vment", 0); 1529 vm_map_lock(map); 1530 } 1531 1532 /* 1533 * When we do blocking operations with the map lock held it is 1534 * possible that a clip might have occured on our in-transit entry, 1535 * requiring an adjustment to the entry in our loop. These macros 1536 * help the pageable and clip_range code deal with the case. The 1537 * conditional costs virtually nothing if no clipping has occured. 1538 */ 1539 1540 #define CLIP_CHECK_BACK(entry, save_start) \ 1541 do { \ 1542 while (entry->start != save_start) { \ 1543 entry = entry->prev; \ 1544 KASSERT(entry != &map->header, ("bad entry clip")); \ 1545 } \ 1546 } while(0) 1547 1548 #define CLIP_CHECK_FWD(entry, save_end) \ 1549 do { \ 1550 while (entry->end != save_end) { \ 1551 entry = entry->next; \ 1552 KASSERT(entry != &map->header, ("bad entry clip")); \ 1553 } \ 1554 } while(0) 1555 1556 1557 /* 1558 * Clip the specified range and return the base entry. The 1559 * range may cover several entries starting at the returned base 1560 * and the first and last entry in the covering sequence will be 1561 * properly clipped to the requested start and end address. 1562 * 1563 * If no holes are allowed you should pass the MAP_CLIP_NO_HOLES 1564 * flag. 1565 * 1566 * The MAP_ENTRY_IN_TRANSITION flag will be set for the entries 1567 * covered by the requested range. 1568 * 1569 * The map must be exclusively locked on entry and will remain locked 1570 * on return. If no range exists or the range contains holes and you 1571 * specified that no holes were allowed, NULL will be returned. This 1572 * routine may temporarily unlock the map in order avoid a deadlock when 1573 * sleeping. 1574 */ 1575 static 1576 vm_map_entry_t 1577 vm_map_clip_range(vm_map_t map, vm_offset_t start, vm_offset_t end, 1578 int *countp, int flags) 1579 { 1580 vm_map_entry_t start_entry; 1581 vm_map_entry_t entry; 1582 1583 /* 1584 * Locate the entry and effect initial clipping. The in-transition 1585 * case does not occur very often so do not try to optimize it. 
1586 */ 1587 again: 1588 if (vm_map_lookup_entry(map, start, &start_entry) == FALSE) 1589 return (NULL); 1590 entry = start_entry; 1591 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { 1592 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; 1593 ++mycpu->gd_cnt.v_intrans_coll; 1594 ++mycpu->gd_cnt.v_intrans_wait; 1595 vm_map_transition_wait(map); 1596 /* 1597 * entry and/or start_entry may have been clipped while 1598 * we slept, or may have gone away entirely. We have 1599 * to restart from the lookup. 1600 */ 1601 goto again; 1602 } 1603 1604 /* 1605 * Since we hold an exclusive map lock we do not have to restart 1606 * after clipping, even though clipping may block in zalloc. 1607 */ 1608 vm_map_clip_start(map, entry, start, countp); 1609 vm_map_clip_end(map, entry, end, countp); 1610 entry->eflags |= MAP_ENTRY_IN_TRANSITION; 1611 1612 /* 1613 * Scan entries covered by the range. When working on the next 1614 * entry a restart need only re-loop on the current entry which 1615 * we have already locked, since 'next' may have changed. Also, 1616 * even though entry is safe, it may have been clipped so we 1617 * have to iterate forwards through the clip after sleeping. 1618 */ 1619 while (entry->next != &map->header && entry->next->start < end) { 1620 vm_map_entry_t next = entry->next; 1621 1622 if (flags & MAP_CLIP_NO_HOLES) { 1623 if (next->start > entry->end) { 1624 vm_map_unclip_range(map, start_entry, 1625 start, entry->end, countp, flags); 1626 return(NULL); 1627 } 1628 } 1629 1630 if (next->eflags & MAP_ENTRY_IN_TRANSITION) { 1631 vm_offset_t save_end = entry->end; 1632 next->eflags |= MAP_ENTRY_NEEDS_WAKEUP; 1633 ++mycpu->gd_cnt.v_intrans_coll; 1634 ++mycpu->gd_cnt.v_intrans_wait; 1635 vm_map_transition_wait(map); 1636 1637 /* 1638 * clips might have occured while we blocked. 1639 */ 1640 CLIP_CHECK_FWD(entry, save_end); 1641 CLIP_CHECK_BACK(start_entry, start); 1642 continue; 1643 } 1644 /* 1645 * No restart necessary even though clip_end may block, we 1646 * are holding the map lock. 1647 */ 1648 vm_map_clip_end(map, next, end, countp); 1649 next->eflags |= MAP_ENTRY_IN_TRANSITION; 1650 entry = next; 1651 } 1652 if (flags & MAP_CLIP_NO_HOLES) { 1653 if (entry->end != end) { 1654 vm_map_unclip_range(map, start_entry, 1655 start, entry->end, countp, flags); 1656 return(NULL); 1657 } 1658 } 1659 return(start_entry); 1660 } 1661 1662 /* 1663 * Undo the effect of vm_map_clip_range(). You should pass the same 1664 * flags and the same range that you passed to vm_map_clip_range(). 1665 * This code will clear the in-transition flag on the entries and 1666 * wake up anyone waiting. This code will also simplify the sequence 1667 * and attempt to merge it with entries before and after the sequence. 1668 * 1669 * The map must be locked on entry and will remain locked on return. 1670 * 1671 * Note that you should also pass the start_entry returned by 1672 * vm_map_clip_range(). However, if you block between the two calls 1673 * with the map unlocked please be aware that the start_entry may 1674 * have been clipped and you may need to scan it backwards to find 1675 * the entry corresponding with the original start address. You are 1676 * responsible for this, vm_map_unclip_range() expects the correct 1677 * start_entry to be passed to it and will KASSERT otherwise. 
1678 */ 1679 static 1680 void 1681 vm_map_unclip_range(vm_map_t map, vm_map_entry_t start_entry, 1682 vm_offset_t start, vm_offset_t end, 1683 int *countp, int flags) 1684 { 1685 vm_map_entry_t entry; 1686 1687 entry = start_entry; 1688 1689 KASSERT(entry->start == start, ("unclip_range: illegal base entry")); 1690 while (entry != &map->header && entry->start < end) { 1691 KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION, 1692 ("in-transition flag not set during unclip on: %p", 1693 entry)); 1694 KASSERT(entry->end <= end, 1695 ("unclip_range: tail wasn't clipped")); 1696 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION; 1697 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { 1698 entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; 1699 wakeup(map); 1700 } 1701 entry = entry->next; 1702 } 1703 1704 /* 1705 * Simplification does not block so there is no restart case. 1706 */ 1707 entry = start_entry; 1708 while (entry != &map->header && entry->start < end) { 1709 vm_map_simplify_entry(map, entry, countp); 1710 entry = entry->next; 1711 } 1712 } 1713 1714 /* 1715 * Mark the given range as handled by a subordinate map. 1716 * 1717 * This range must have been created with vm_map_find(), and no other 1718 * operations may have been performed on this range prior to calling 1719 * vm_map_submap(). 1720 * 1721 * Submappings cannot be removed. 1722 * 1723 * No requirements. 1724 */ 1725 int 1726 vm_map_submap(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap) 1727 { 1728 vm_map_entry_t entry; 1729 int result = KERN_INVALID_ARGUMENT; 1730 int count; 1731 1732 count = vm_map_entry_reserve(MAP_RESERVE_COUNT); 1733 vm_map_lock(map); 1734 1735 VM_MAP_RANGE_CHECK(map, start, end); 1736 1737 if (vm_map_lookup_entry(map, start, &entry)) { 1738 vm_map_clip_start(map, entry, start, &count); 1739 } else { 1740 entry = entry->next; 1741 } 1742 1743 vm_map_clip_end(map, entry, end, &count); 1744 1745 if ((entry->start == start) && (entry->end == end) && 1746 ((entry->eflags & MAP_ENTRY_COW) == 0) && 1747 (entry->object.vm_object == NULL)) { 1748 entry->object.sub_map = submap; 1749 entry->maptype = VM_MAPTYPE_SUBMAP; 1750 result = KERN_SUCCESS; 1751 } 1752 vm_map_unlock(map); 1753 vm_map_entry_release(count); 1754 1755 return (result); 1756 } 1757 1758 /* 1759 * Sets the protection of the specified address region in the target map. 1760 * If "set_max" is specified, the maximum protection is to be set; 1761 * otherwise, only the current protection is affected. 1762 * 1763 * The protection is not applicable to submaps, but is applicable to normal 1764 * maps and maps governed by virtual page tables. For example, when operating 1765 * on a virtual page table our protection basically controls how COW occurs 1766 * on the backing object, whereas the virtual page table abstraction itself 1767 * is an abstraction for userland. 1768 * 1769 * No requirements. 1770 */ 1771 int 1772 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end, 1773 vm_prot_t new_prot, boolean_t set_max) 1774 { 1775 vm_map_entry_t current; 1776 vm_map_entry_t entry; 1777 int count; 1778 1779 count = vm_map_entry_reserve(MAP_RESERVE_COUNT); 1780 vm_map_lock(map); 1781 1782 VM_MAP_RANGE_CHECK(map, start, end); 1783 1784 if (vm_map_lookup_entry(map, start, &entry)) { 1785 vm_map_clip_start(map, entry, start, &count); 1786 } else { 1787 entry = entry->next; 1788 } 1789 1790 /* 1791 * Make a first pass to check for protection violations. 
1792 */ 1793 current = entry; 1794 while ((current != &map->header) && (current->start < end)) { 1795 if (current->maptype == VM_MAPTYPE_SUBMAP) { 1796 vm_map_unlock(map); 1797 vm_map_entry_release(count); 1798 return (KERN_INVALID_ARGUMENT); 1799 } 1800 if ((new_prot & current->max_protection) != new_prot) { 1801 vm_map_unlock(map); 1802 vm_map_entry_release(count); 1803 return (KERN_PROTECTION_FAILURE); 1804 } 1805 current = current->next; 1806 } 1807 1808 /* 1809 * Go back and fix up protections. [Note that clipping is not 1810 * necessary the second time.] 1811 */ 1812 current = entry; 1813 1814 while ((current != &map->header) && (current->start < end)) { 1815 vm_prot_t old_prot; 1816 1817 vm_map_clip_end(map, current, end, &count); 1818 1819 old_prot = current->protection; 1820 if (set_max) { 1821 current->protection = 1822 (current->max_protection = new_prot) & 1823 old_prot; 1824 } else { 1825 current->protection = new_prot; 1826 } 1827 1828 /* 1829 * Update physical map if necessary. Worry about copy-on-write 1830 * here -- CHECK THIS XXX 1831 */ 1832 1833 if (current->protection != old_prot) { 1834 #define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \ 1835 VM_PROT_ALL) 1836 1837 pmap_protect(map->pmap, current->start, 1838 current->end, 1839 current->protection & MASK(current)); 1840 #undef MASK 1841 } 1842 1843 vm_map_simplify_entry(map, current, &count); 1844 1845 current = current->next; 1846 } 1847 1848 vm_map_unlock(map); 1849 vm_map_entry_release(count); 1850 return (KERN_SUCCESS); 1851 } 1852 1853 /* 1854 * This routine traverses a processes map handling the madvise 1855 * system call. Advisories are classified as either those effecting 1856 * the vm_map_entry structure, or those effecting the underlying 1857 * objects. 1858 * 1859 * The <value> argument is used for extended madvise calls. 1860 * 1861 * No requirements. 1862 */ 1863 int 1864 vm_map_madvise(vm_map_t map, vm_offset_t start, vm_offset_t end, 1865 int behav, off_t value) 1866 { 1867 vm_map_entry_t current, entry; 1868 int modify_map = 0; 1869 int error = 0; 1870 int count; 1871 1872 /* 1873 * Some madvise calls directly modify the vm_map_entry, in which case 1874 * we need to use an exclusive lock on the map and we need to perform 1875 * various clipping operations. Otherwise we only need a read-lock 1876 * on the map. 1877 */ 1878 1879 count = vm_map_entry_reserve(MAP_RESERVE_COUNT); 1880 1881 switch(behav) { 1882 case MADV_NORMAL: 1883 case MADV_SEQUENTIAL: 1884 case MADV_RANDOM: 1885 case MADV_NOSYNC: 1886 case MADV_AUTOSYNC: 1887 case MADV_NOCORE: 1888 case MADV_CORE: 1889 case MADV_SETMAP: 1890 case MADV_INVAL: 1891 modify_map = 1; 1892 vm_map_lock(map); 1893 break; 1894 case MADV_WILLNEED: 1895 case MADV_DONTNEED: 1896 case MADV_FREE: 1897 vm_map_lock_read(map); 1898 break; 1899 default: 1900 vm_map_entry_release(count); 1901 return (EINVAL); 1902 } 1903 1904 /* 1905 * Locate starting entry and clip if necessary. 1906 */ 1907 1908 VM_MAP_RANGE_CHECK(map, start, end); 1909 1910 if (vm_map_lookup_entry(map, start, &entry)) { 1911 if (modify_map) 1912 vm_map_clip_start(map, entry, start, &count); 1913 } else { 1914 entry = entry->next; 1915 } 1916 1917 if (modify_map) { 1918 /* 1919 * madvise behaviors that are implemented in the vm_map_entry. 1920 * 1921 * We clip the vm_map_entry so that behavioral changes are 1922 * limited to the specified address range. 
1923 */ 1924 for (current = entry; 1925 (current != &map->header) && (current->start < end); 1926 current = current->next 1927 ) { 1928 if (current->maptype == VM_MAPTYPE_SUBMAP) 1929 continue; 1930 1931 vm_map_clip_end(map, current, end, &count); 1932 1933 switch (behav) { 1934 case MADV_NORMAL: 1935 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL); 1936 break; 1937 case MADV_SEQUENTIAL: 1938 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL); 1939 break; 1940 case MADV_RANDOM: 1941 vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM); 1942 break; 1943 case MADV_NOSYNC: 1944 current->eflags |= MAP_ENTRY_NOSYNC; 1945 break; 1946 case MADV_AUTOSYNC: 1947 current->eflags &= ~MAP_ENTRY_NOSYNC; 1948 break; 1949 case MADV_NOCORE: 1950 current->eflags |= MAP_ENTRY_NOCOREDUMP; 1951 break; 1952 case MADV_CORE: 1953 current->eflags &= ~MAP_ENTRY_NOCOREDUMP; 1954 break; 1955 case MADV_INVAL: 1956 /* 1957 * Invalidate the related pmap entries, used 1958 * to flush portions of the real kernel's 1959 * pmap when the caller has removed or 1960 * modified existing mappings in a virtual 1961 * page table. 1962 */ 1963 pmap_remove(map->pmap, 1964 current->start, current->end); 1965 break; 1966 case MADV_SETMAP: 1967 /* 1968 * Set the page directory page for a map 1969 * governed by a virtual page table. Mark 1970 * the entry as being governed by a virtual 1971 * page table if it is not. 1972 * 1973 * XXX the page directory page is stored 1974 * in the avail_ssize field of the map_entry. 1975 * 1976 * XXX the map simplification code does not 1977 * compare this field so weird things may 1978 * happen if you do not apply this function 1979 * to the entire mapping governed by the 1980 * virtual page table. 1981 */ 1982 if (current->maptype != VM_MAPTYPE_VPAGETABLE) { 1983 error = EINVAL; 1984 break; 1985 } 1986 current->aux.master_pde = value; 1987 pmap_remove(map->pmap, 1988 current->start, current->end); 1989 break; 1990 default: 1991 error = EINVAL; 1992 break; 1993 } 1994 vm_map_simplify_entry(map, current, &count); 1995 } 1996 vm_map_unlock(map); 1997 } else { 1998 vm_pindex_t pindex; 1999 int count; 2000 2001 /* 2002 * madvise behaviors that are implemented in the underlying 2003 * vm_object. 2004 * 2005 * Since we don't clip the vm_map_entry, we have to clip 2006 * the vm_object pindex and count. 2007 * 2008 * NOTE! We currently do not support these functions on 2009 * virtual page tables. 2010 */ 2011 for (current = entry; 2012 (current != &map->header) && (current->start < end); 2013 current = current->next 2014 ) { 2015 vm_offset_t useStart; 2016 2017 if (current->maptype != VM_MAPTYPE_NORMAL) 2018 continue; 2019 2020 pindex = OFF_TO_IDX(current->offset); 2021 count = atop(current->end - current->start); 2022 useStart = current->start; 2023 2024 if (current->start < start) { 2025 pindex += atop(start - current->start); 2026 count -= atop(start - current->start); 2027 useStart = start; 2028 } 2029 if (current->end > end) 2030 count -= atop(current->end - end); 2031 2032 if (count <= 0) 2033 continue; 2034 2035 vm_object_madvise(current->object.vm_object, 2036 pindex, count, behav); 2037 2038 /* 2039 * Try to populate the page table. Mappings governed 2040 * by virtual page tables cannot be pre-populated 2041 * without a lot of work so don't try.
2042 */ 2043 if (behav == MADV_WILLNEED && 2044 current->maptype != VM_MAPTYPE_VPAGETABLE) { 2045 pmap_object_init_pt( 2046 map->pmap, 2047 useStart, 2048 current->protection, 2049 current->object.vm_object, 2050 pindex, 2051 (count << PAGE_SHIFT), 2052 MAP_PREFAULT_MADVISE 2053 ); 2054 } 2055 } 2056 vm_map_unlock_read(map); 2057 } 2058 vm_map_entry_release(count); 2059 return(error); 2060 } 2061 2062 2063 /* 2064 * Sets the inheritance of the specified address range in the target map. 2065 * Inheritance affects how the map will be shared with child maps at the 2066 * time of vm_map_fork. 2067 */ 2068 int 2069 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end, 2070 vm_inherit_t new_inheritance) 2071 { 2072 vm_map_entry_t entry; 2073 vm_map_entry_t temp_entry; 2074 int count; 2075 2076 switch (new_inheritance) { 2077 case VM_INHERIT_NONE: 2078 case VM_INHERIT_COPY: 2079 case VM_INHERIT_SHARE: 2080 break; 2081 default: 2082 return (KERN_INVALID_ARGUMENT); 2083 } 2084 2085 count = vm_map_entry_reserve(MAP_RESERVE_COUNT); 2086 vm_map_lock(map); 2087 2088 VM_MAP_RANGE_CHECK(map, start, end); 2089 2090 if (vm_map_lookup_entry(map, start, &temp_entry)) { 2091 entry = temp_entry; 2092 vm_map_clip_start(map, entry, start, &count); 2093 } else 2094 entry = temp_entry->next; 2095 2096 while ((entry != &map->header) && (entry->start < end)) { 2097 vm_map_clip_end(map, entry, end, &count); 2098 2099 entry->inheritance = new_inheritance; 2100 2101 vm_map_simplify_entry(map, entry, &count); 2102 2103 entry = entry->next; 2104 } 2105 vm_map_unlock(map); 2106 vm_map_entry_release(count); 2107 return (KERN_SUCCESS); 2108 } 2109 2110 /* 2111 * Implement the semantics of mlock 2112 */ 2113 int 2114 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t real_end, 2115 boolean_t new_pageable) 2116 { 2117 vm_map_entry_t entry; 2118 vm_map_entry_t start_entry; 2119 vm_offset_t end; 2120 int rv = KERN_SUCCESS; 2121 int count; 2122 2123 count = vm_map_entry_reserve(MAP_RESERVE_COUNT); 2124 vm_map_lock(map); 2125 VM_MAP_RANGE_CHECK(map, start, real_end); 2126 end = real_end; 2127 2128 start_entry = vm_map_clip_range(map, start, end, &count, 2129 MAP_CLIP_NO_HOLES); 2130 if (start_entry == NULL) { 2131 vm_map_unlock(map); 2132 vm_map_entry_release(count); 2133 return (KERN_INVALID_ADDRESS); 2134 } 2135 2136 if (new_pageable == 0) { 2137 entry = start_entry; 2138 while ((entry != &map->header) && (entry->start < end)) { 2139 vm_offset_t save_start; 2140 vm_offset_t save_end; 2141 2142 /* 2143 * Already user wired or hard wired (trivial cases) 2144 */ 2145 if (entry->eflags & MAP_ENTRY_USER_WIRED) { 2146 entry = entry->next; 2147 continue; 2148 } 2149 if (entry->wired_count != 0) { 2150 entry->wired_count++; 2151 entry->eflags |= MAP_ENTRY_USER_WIRED; 2152 entry = entry->next; 2153 continue; 2154 } 2155 2156 /* 2157 * A new wiring requires instantiation of appropriate 2158 * management structures and the faulting in of the 2159 * page. 2160 */ 2161 if (entry->maptype != VM_MAPTYPE_SUBMAP) { 2162 int copyflag = entry->eflags & 2163 MAP_ENTRY_NEEDS_COPY; 2164 if (copyflag && ((entry->protection & 2165 VM_PROT_WRITE) != 0)) { 2166 vm_map_entry_shadow(entry, 0); 2167 } else if (entry->object.vm_object == NULL && 2168 !map->system_map) { 2169 vm_map_entry_allocate_object(entry); 2170 } 2171 } 2172 entry->wired_count++; 2173 entry->eflags |= MAP_ENTRY_USER_WIRED; 2174 2175 /* 2176 * Now fault in the area. 
Note that vm_fault_wire() 2177 * may release the map lock temporarily; it will be 2178 * relocked on return. The in-transition 2179 * flag protects the entries. 2180 */ 2181 save_start = entry->start; 2182 save_end = entry->end; 2183 rv = vm_fault_wire(map, entry, TRUE); 2184 if (rv) { 2185 CLIP_CHECK_BACK(entry, save_start); 2186 for (;;) { 2187 KASSERT(entry->wired_count == 1, ("bad wired_count on entry")); 2188 entry->eflags &= ~MAP_ENTRY_USER_WIRED; 2189 entry->wired_count = 0; 2190 if (entry->end == save_end) 2191 break; 2192 entry = entry->next; 2193 KASSERT(entry != &map->header, ("bad entry clip during backout")); 2194 } 2195 end = save_start; /* unwire the rest */ 2196 break; 2197 } 2198 /* 2199 * note that even though the entry might have been 2200 * clipped, the USER_WIRED flag we set prevents 2201 * duplication so we do not have to do a 2202 * clip check. 2203 */ 2204 entry = entry->next; 2205 } 2206 2207 /* 2208 * If we failed fall through to the unwiring section to 2209 * unwire what we had wired so far. 'end' has already 2210 * been adjusted. 2211 */ 2212 if (rv) 2213 new_pageable = 1; 2214 2215 /* 2216 * start_entry might have been clipped if we unlocked the 2217 * map and blocked. No matter how clipped it has gotten 2218 * there should be a fragment that is on our start boundary. 2219 */ 2220 CLIP_CHECK_BACK(start_entry, start); 2221 } 2222 2223 /* 2224 * Deal with the unwiring case. 2225 */ 2226 if (new_pageable) { 2227 /* 2228 * This is the unwiring case. We must first ensure that the 2229 * range to be unwired is really wired down. We know there 2230 * are no holes. 2231 */ 2232 entry = start_entry; 2233 while ((entry != &map->header) && (entry->start < end)) { 2234 if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0) { 2235 rv = KERN_INVALID_ARGUMENT; 2236 goto done; 2237 } 2238 KASSERT(entry->wired_count != 0, ("wired count was 0 with USER_WIRED set! %p", entry)); 2239 entry = entry->next; 2240 } 2241 2242 /* 2243 * Now decrement the wiring count for each region. If a region 2244 * becomes completely unwired, unwire its physical pages and 2245 * mappings. 2246 */ 2247 /* 2248 * Note: this loop must re-initialize "entry" to start_entry 2249 * before walking the range. An earlier version of this code 2250 * picked up the "entry" loop variable left over from the 2251 * check loop above without resetting it, so this loop was 2252 * never entered and the pages backing the entries were never 2253 * unwired, which leaked wired pages. Re-initializing the loop 2254 * variable below (entry = start_entry) avoids repeating that 2255 * defect. 2256 */ 2257 entry = start_entry; 2258 while ((entry != &map->header) && (entry->start < end)) { 2259 KASSERT(entry->eflags & MAP_ENTRY_USER_WIRED, 2260 ("expected USER_WIRED on entry %p", entry)); 2261 entry->eflags &= ~MAP_ENTRY_USER_WIRED; 2262 entry->wired_count--; 2263 if (entry->wired_count == 0) 2264 vm_fault_unwire(map, entry); 2265 entry = entry->next; 2266 } 2267 } 2268 done: 2269 vm_map_unclip_range(map, start_entry, start, real_end, &count, 2270 MAP_CLIP_NO_HOLES); 2271 map->timestamp++; 2272 vm_map_unlock(map); 2273 vm_map_entry_release(count); 2274 return (rv); 2275 } 2276 2277 /* 2278 * Sets the pageability of the specified address range in the target map. 2279 * Regions specified as not pageable require locked-down physical 2280 * memory and physical page maps. 2281 * 2282 * The map must not be locked, but a reference must remain to the map 2283 * throughout the call.
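 *
 * Illustrative call sites (not part of the original source; a hedged
 * usage sketch only): a kernel allocation path that has pre-reserved
 * map entries would wire a new range with
 * vm_map_wire(map, addr, addr + size, KM_KRESERVE), and would later
 * unwire the same range by passing KM_PAGEABLE in kmflags instead.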
2284 * 2285 * This function may be called via the zalloc path and must properly 2286 * reserve map entries for kernel_map. 2287 * 2288 * No requirements. 2289 */ 2290 int 2291 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t real_end, int kmflags) 2292 { 2293 vm_map_entry_t entry; 2294 vm_map_entry_t start_entry; 2295 vm_offset_t end; 2296 int rv = KERN_SUCCESS; 2297 int count; 2298 2299 if (kmflags & KM_KRESERVE) 2300 count = vm_map_entry_kreserve(MAP_RESERVE_COUNT); 2301 else 2302 count = vm_map_entry_reserve(MAP_RESERVE_COUNT); 2303 vm_map_lock(map); 2304 VM_MAP_RANGE_CHECK(map, start, real_end); 2305 end = real_end; 2306 2307 start_entry = vm_map_clip_range(map, start, end, &count, 2308 MAP_CLIP_NO_HOLES); 2309 if (start_entry == NULL) { 2310 vm_map_unlock(map); 2311 rv = KERN_INVALID_ADDRESS; 2312 goto failure; 2313 } 2314 if ((kmflags & KM_PAGEABLE) == 0) { 2315 /* 2316 * Wiring. 2317 * 2318 * 1. Holding the write lock, we create any shadow or zero-fill 2319 * objects that need to be created. Then we clip each map 2320 * entry to the region to be wired and increment its wiring 2321 * count. We create objects before clipping the map entries 2322 * to avoid object proliferation. 2323 * 2324 * 2. We downgrade to a read lock, and call vm_fault_wire to 2325 * fault in the pages for any newly wired area (wired_count is 2326 * 1). 2327 * 2328 * Downgrading to a read lock for vm_fault_wire avoids a 2329 * possible deadlock with another process that may have faulted 2330 * on one of the pages to be wired (it would mark the page busy, 2331 * blocking us, then in turn block on the map lock that we 2332 * hold). Because of problems in the recursive lock package, 2333 * we cannot upgrade to a write lock in vm_map_lookup. Thus, 2334 * any actions that require the write lock must be done 2335 * beforehand. Because we keep the read lock on the map, the 2336 * copy-on-write status of the entries we modify here cannot 2337 * change. 2338 */ 2339 entry = start_entry; 2340 while ((entry != &map->header) && (entry->start < end)) { 2341 /* 2342 * Trivial case if the entry is already wired 2343 */ 2344 if (entry->wired_count) { 2345 entry->wired_count++; 2346 entry = entry->next; 2347 continue; 2348 } 2349 2350 /* 2351 * The entry is being newly wired, we have to setup 2352 * appropriate management structures. A shadow 2353 * object is required for a copy-on-write region, 2354 * or a normal object for a zero-fill region. We 2355 * do not have to do this for entries that point to sub 2356 * maps because we won't hold the lock on the sub map. 2357 */ 2358 if (entry->maptype != VM_MAPTYPE_SUBMAP) { 2359 int copyflag = entry->eflags & 2360 MAP_ENTRY_NEEDS_COPY; 2361 if (copyflag && ((entry->protection & 2362 VM_PROT_WRITE) != 0)) { 2363 vm_map_entry_shadow(entry, 0); 2364 } else if (entry->object.vm_object == NULL && 2365 !map->system_map) { 2366 vm_map_entry_allocate_object(entry); 2367 } 2368 } 2369 2370 entry->wired_count++; 2371 entry = entry->next; 2372 } 2373 2374 /* 2375 * Pass 2. 2376 */ 2377 2378 /* 2379 * HACK HACK HACK HACK 2380 * 2381 * vm_fault_wire() temporarily unlocks the map to avoid 2382 * deadlocks. The in-transition flag from the vm_map_clip_range 2383 * call should protect us from changes while the map is 2384 * unlocked. 2385 * 2386 * NOTE: Previously this comment stated that clipping might 2387 * still occur while the entry is unlocked, but from 2388 * what I can tell it actually cannot.
2389 * 2390 * It is unclear whether the CLIP_CHECK_*() calls 2391 * are still needed but we keep them in anyway. 2392 * 2393 * HACK HACK HACK HACK 2394 */ 2395 2396 entry = start_entry; 2397 while (entry != &map->header && entry->start < end) { 2398 /* 2399 * If vm_fault_wire fails for any page we need to undo 2400 * what has been done. We decrement the wiring count 2401 * for those pages which have not yet been wired (now) 2402 * and unwire those that have (later). 2403 */ 2404 vm_offset_t save_start = entry->start; 2405 vm_offset_t save_end = entry->end; 2406 2407 if (entry->wired_count == 1) 2408 rv = vm_fault_wire(map, entry, FALSE); 2409 if (rv) { 2410 CLIP_CHECK_BACK(entry, save_start); 2411 for (;;) { 2412 KASSERT(entry->wired_count == 1, ("wired_count changed unexpectedly")); 2413 entry->wired_count = 0; 2414 if (entry->end == save_end) 2415 break; 2416 entry = entry->next; 2417 KASSERT(entry != &map->header, ("bad entry clip during backout")); 2418 } 2419 end = save_start; 2420 break; 2421 } 2422 CLIP_CHECK_FWD(entry, save_end); 2423 entry = entry->next; 2424 } 2425 2426 /* 2427 * If a failure occurred, undo everything by falling through 2428 * to the unwiring code. 'end' has already been adjusted 2429 * appropriately. 2430 */ 2431 if (rv) 2432 kmflags |= KM_PAGEABLE; 2433 2434 /* 2435 * start_entry is still IN_TRANSITION but may have been 2436 * clipped since vm_fault_wire() unlocks and relocks the 2437 * map. No matter how clipped it has gotten there should 2438 * be a fragment that is on our start boundary. 2439 */ 2440 CLIP_CHECK_BACK(start_entry, start); 2441 } 2442 2443 if (kmflags & KM_PAGEABLE) { 2444 /* 2445 * This is the unwiring case. We must first ensure that the 2446 * range to be unwired is really wired down. We know there 2447 * are no holes. 2448 */ 2449 entry = start_entry; 2450 while ((entry != &map->header) && (entry->start < end)) { 2451 if (entry->wired_count == 0) { 2452 rv = KERN_INVALID_ARGUMENT; 2453 goto done; 2454 } 2455 entry = entry->next; 2456 } 2457 2458 /* 2459 * Now decrement the wiring count for each region. If a region 2460 * becomes completely unwired, unwire its physical pages and 2461 * mappings. 2462 */ 2463 entry = start_entry; 2464 while ((entry != &map->header) && (entry->start < end)) { 2465 entry->wired_count--; 2466 if (entry->wired_count == 0) 2467 vm_fault_unwire(map, entry); 2468 entry = entry->next; 2469 } 2470 } 2471 done: 2472 vm_map_unclip_range(map, start_entry, start, real_end, 2473 &count, MAP_CLIP_NO_HOLES); 2474 map->timestamp++; 2475 vm_map_unlock(map); 2476 failure: 2477 if (kmflags & KM_KRESERVE) 2478 vm_map_entry_krelease(count); 2479 else 2480 vm_map_entry_release(count); 2481 return (rv); 2482 } 2483 2484 /* 2485 * Mark a newly allocated address range as wired but do not fault in 2486 * the pages. The caller is expected to load the pages into the object. 2487 * 2488 * The map must be locked on entry and will remain locked on return. 2489 * No other requirements.
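 *
 * Illustrative caller pattern (not part of the original source; a
 * hedged sketch only, and every call shown is defined elsewhere in
 * this file):
 *
 *	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
 *	vm_map_lock(map);
 *	vm_map_set_wired_quick(map, addr, size, &count);
 *	vm_map_unlock(map);
 *	vm_map_entry_release(count);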
2490 */ 2491 void 2492 vm_map_set_wired_quick(vm_map_t map, vm_offset_t addr, vm_size_t size, 2493 int *countp) 2494 { 2495 vm_map_entry_t scan; 2496 vm_map_entry_t entry; 2497 2498 entry = vm_map_clip_range(map, addr, addr + size, 2499 countp, MAP_CLIP_NO_HOLES); 2500 for (scan = entry; 2501 scan != &map->header && scan->start < addr + size; 2502 scan = scan->next) { 2503 KKASSERT(scan->wired_count == 0); 2504 scan->wired_count = 1; 2505 } 2506 vm_map_unclip_range(map, entry, addr, addr + size, 2507 countp, MAP_CLIP_NO_HOLES); 2508 } 2509 2510 /* 2511 * Push any dirty cached pages in the address range to their pager. 2512 * If syncio is TRUE, dirty pages are written synchronously. 2513 * If invalidate is TRUE, any cached pages are freed as well. 2514 * 2515 * This routine is called by sys_msync() 2516 * 2517 * Returns an error if any part of the specified range is not mapped. 2518 * 2519 * No requirements. 2520 */ 2521 int 2522 vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end, 2523 boolean_t syncio, boolean_t invalidate) 2524 { 2525 vm_map_entry_t current; 2526 vm_map_entry_t entry; 2527 vm_size_t size; 2528 vm_object_t object; 2529 vm_object_t tobj; 2530 vm_ooffset_t offset; 2531 2532 vm_map_lock_read(map); 2533 VM_MAP_RANGE_CHECK(map, start, end); 2534 if (!vm_map_lookup_entry(map, start, &entry)) { 2535 vm_map_unlock_read(map); 2536 return (KERN_INVALID_ADDRESS); 2537 } 2538 lwkt_gettoken(&map->token); 2539 2540 /* 2541 * Make a first pass to check for holes. 2542 */ 2543 for (current = entry; current->start < end; current = current->next) { 2544 if (current->maptype == VM_MAPTYPE_SUBMAP) { 2545 lwkt_reltoken(&map->token); 2546 vm_map_unlock_read(map); 2547 return (KERN_INVALID_ARGUMENT); 2548 } 2549 if (end > current->end && 2550 (current->next == &map->header || 2551 current->end != current->next->start)) { 2552 lwkt_reltoken(&map->token); 2553 vm_map_unlock_read(map); 2554 return (KERN_INVALID_ADDRESS); 2555 } 2556 } 2557 2558 if (invalidate) 2559 pmap_remove(vm_map_pmap(map), start, end); 2560 2561 /* 2562 * Make a second pass, cleaning/uncaching pages from the indicated 2563 * objects as we go. 2564 */ 2565 for (current = entry; current->start < end; current = current->next) { 2566 offset = current->offset + (start - current->start); 2567 size = (end <= current->end ? end : current->end) - start; 2568 if (current->maptype == VM_MAPTYPE_SUBMAP) { 2569 vm_map_t smap; 2570 vm_map_entry_t tentry; 2571 vm_size_t tsize; 2572 2573 smap = current->object.sub_map; 2574 vm_map_lock_read(smap); 2575 vm_map_lookup_entry(smap, offset, &tentry); 2576 tsize = tentry->end - offset; 2577 if (tsize < size) 2578 size = tsize; 2579 object = tentry->object.vm_object; 2580 offset = tentry->offset + (offset - tentry->start); 2581 vm_map_unlock_read(smap); 2582 } else { 2583 object = current->object.vm_object; 2584 } 2585 2586 if (object) 2587 vm_object_hold(object); 2588 2589 /* 2590 * Note that there is absolutely no sense in writing out 2591 * anonymous objects, so we track down the vnode object 2592 * to write out. 2593 * We invalidate (remove) all pages from the address space 2594 * anyway, for semantic correctness. 2595 * 2596 * note: certain anonymous maps, such as MAP_NOSYNC maps, 2597 * may start out with a NULL object. 
2598 */ 2599 while (object && (tobj = object->backing_object) != NULL) { 2600 vm_object_hold(tobj); 2601 if (tobj == object->backing_object) { 2602 vm_object_lock_swap(); 2603 offset += object->backing_object_offset; 2604 vm_object_drop(object); 2605 object = tobj; 2606 if (object->size < OFF_TO_IDX(offset + size)) 2607 size = IDX_TO_OFF(object->size) - 2608 offset; 2609 break; 2610 } 2611 vm_object_drop(tobj); 2612 } 2613 if (object && (object->type == OBJT_VNODE) && 2614 (current->protection & VM_PROT_WRITE) && 2615 (object->flags & OBJ_NOMSYNC) == 0) { 2616 /* 2617 * Flush pages if writing is allowed, invalidate them 2618 * if invalidation requested. Pages undergoing I/O 2619 * will be ignored by vm_object_page_remove(). 2620 * 2621 * We cannot lock the vnode and then wait for paging 2622 * to complete without deadlocking against vm_fault. 2623 * Instead we simply call vm_object_page_remove() and 2624 * allow it to block internally on a page-by-page 2625 * basis when it encounters pages undergoing async 2626 * I/O. 2627 */ 2628 int flags; 2629 2630 /* no chain wait needed for vnode objects */ 2631 vm_object_reference_locked(object); 2632 vn_lock(object->handle, LK_EXCLUSIVE | LK_RETRY); 2633 flags = (syncio || invalidate) ? OBJPC_SYNC : 0; 2634 flags |= invalidate ? OBJPC_INVAL : 0; 2635 2636 /* 2637 * When operating on a virtual page table just 2638 * flush the whole object. XXX we probably ought 2639 * to 2640 */ 2641 switch(current->maptype) { 2642 case VM_MAPTYPE_NORMAL: 2643 vm_object_page_clean(object, 2644 OFF_TO_IDX(offset), 2645 OFF_TO_IDX(offset + size + PAGE_MASK), 2646 flags); 2647 break; 2648 case VM_MAPTYPE_VPAGETABLE: 2649 vm_object_page_clean(object, 0, 0, flags); 2650 break; 2651 } 2652 vn_unlock(((struct vnode *)object->handle)); 2653 vm_object_deallocate_locked(object); 2654 } 2655 if (object && invalidate && 2656 ((object->type == OBJT_VNODE) || 2657 (object->type == OBJT_DEVICE))) { 2658 int clean_only = 2659 (object->type == OBJT_DEVICE) ? FALSE : TRUE; 2660 /* no chain wait needed for vnode/device objects */ 2661 vm_object_reference_locked(object); 2662 switch(current->maptype) { 2663 case VM_MAPTYPE_NORMAL: 2664 vm_object_page_remove(object, 2665 OFF_TO_IDX(offset), 2666 OFF_TO_IDX(offset + size + PAGE_MASK), 2667 clean_only); 2668 break; 2669 case VM_MAPTYPE_VPAGETABLE: 2670 vm_object_page_remove(object, 0, 0, clean_only); 2671 break; 2672 } 2673 vm_object_deallocate_locked(object); 2674 } 2675 start += size; 2676 if (object) 2677 vm_object_drop(object); 2678 } 2679 2680 lwkt_reltoken(&map->token); 2681 vm_map_unlock_read(map); 2682 2683 return (KERN_SUCCESS); 2684 } 2685 2686 /* 2687 * Make the region specified by this entry pageable. 2688 * 2689 * The vm_map must be exclusively locked. 2690 */ 2691 static void 2692 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry) 2693 { 2694 entry->eflags &= ~MAP_ENTRY_USER_WIRED; 2695 entry->wired_count = 0; 2696 vm_fault_unwire(map, entry); 2697 } 2698 2699 /* 2700 * Deallocate the given entry from the target map. 2701 * 2702 * The vm_map must be exclusively locked. 
2703 */ 2704 static void 2705 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry, int *countp) 2706 { 2707 vm_map_entry_unlink(map, entry); 2708 map->size -= entry->end - entry->start; 2709 2710 switch(entry->maptype) { 2711 case VM_MAPTYPE_NORMAL: 2712 case VM_MAPTYPE_VPAGETABLE: 2713 vm_object_deallocate(entry->object.vm_object); 2714 break; 2715 default: 2716 break; 2717 } 2718 2719 vm_map_entry_dispose(map, entry, countp); 2720 } 2721 2722 /* 2723 * Deallocates the given address range from the target map. 2724 * 2725 * The vm_map must be exclusively locked. 2726 */ 2727 int 2728 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end, int *countp) 2729 { 2730 vm_object_t object; 2731 vm_map_entry_t entry; 2732 vm_map_entry_t first_entry; 2733 2734 ASSERT_VM_MAP_LOCKED(map); 2735 lwkt_gettoken(&map->token); 2736 again: 2737 /* 2738 * Find the start of the region, and clip it. Set entry to point 2739 * at the first record containing the requested address or, if no 2740 * such record exists, the next record with a greater address. The 2741 * loop will run from this point until a record beyond the termination 2742 * address is encountered. 2743 * 2744 * map->hint must be adjusted to not point to anything we delete, 2745 * so set it to the entry prior to the one being deleted. 2746 * 2747 * GGG see other GGG comment. 2748 */ 2749 if (vm_map_lookup_entry(map, start, &first_entry)) { 2750 entry = first_entry; 2751 vm_map_clip_start(map, entry, start, countp); 2752 map->hint = entry->prev; /* possible problem XXX */ 2753 } else { 2754 map->hint = first_entry; /* possible problem XXX */ 2755 entry = first_entry->next; 2756 } 2757 2758 /* 2759 * If a hole opens up prior to the current first_free then 2760 * adjust first_free. As with map->hint, map->first_free 2761 * cannot be left set to anything we might delete. 2762 */ 2763 if (entry == &map->header) { 2764 map->first_free = &map->header; 2765 } else if (map->first_free->start >= start) { 2766 map->first_free = entry->prev; 2767 } 2768 2769 /* 2770 * Step through all entries in this region 2771 */ 2772 while ((entry != &map->header) && (entry->start < end)) { 2773 vm_map_entry_t next; 2774 vm_offset_t s, e; 2775 vm_pindex_t offidxstart, offidxend, count; 2776 2777 /* 2778 * If we hit an in-transition entry we have to sleep and 2779 * retry. It's easier (and not really slower) to just retry 2780 * since this case occurs so rarely and the hint is already 2781 * pointing at the right place. We have to reset the 2782 * start offset so as not to accidentally delete an entry 2783 * another process just created in vacated space. 2784 */ 2785 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { 2786 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; 2787 start = entry->start; 2788 ++mycpu->gd_cnt.v_intrans_coll; 2789 ++mycpu->gd_cnt.v_intrans_wait; 2790 vm_map_transition_wait(map); 2791 goto again; 2792 } 2793 vm_map_clip_end(map, entry, end, countp); 2794 2795 s = entry->start; 2796 e = entry->end; 2797 next = entry->next; 2798 2799 offidxstart = OFF_TO_IDX(entry->offset); 2800 count = OFF_TO_IDX(e - s); 2801 object = entry->object.vm_object; 2802 2803 /* 2804 * Unwire before removing addresses from the pmap; otherwise, 2805 * unwiring will put the entries back in the pmap.
2806 */ 2807 if (entry->wired_count != 0) 2808 vm_map_entry_unwire(map, entry); 2809 2810 offidxend = offidxstart + count; 2811 2812 if (object == &kernel_object) { 2813 vm_object_hold(object); 2814 vm_object_page_remove(object, offidxstart, 2815 offidxend, FALSE); 2816 vm_object_drop(object); 2817 } else if (object && object->type != OBJT_DEFAULT && 2818 object->type != OBJT_SWAP) { 2819 /* 2820 * vnode object routines cannot be chain-locked, 2821 * but since we aren't removing pages from the 2822 * object here we can use a shared hold. 2823 */ 2824 vm_object_hold_shared(object); 2825 pmap_remove(map->pmap, s, e); 2826 vm_object_drop(object); 2827 } else if (object) { 2828 vm_object_hold(object); 2829 vm_object_chain_acquire(object); 2830 pmap_remove(map->pmap, s, e); 2831 2832 if (object != NULL && 2833 object->ref_count != 1 && 2834 (object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == 2835 OBJ_ONEMAPPING && 2836 (object->type == OBJT_DEFAULT || 2837 object->type == OBJT_SWAP)) { 2838 vm_object_collapse(object, NULL); 2839 vm_object_page_remove(object, offidxstart, 2840 offidxend, FALSE); 2841 if (object->type == OBJT_SWAP) { 2842 swap_pager_freespace(object, 2843 offidxstart, 2844 count); 2845 } 2846 if (offidxend >= object->size && 2847 offidxstart < object->size) { 2848 object->size = offidxstart; 2849 } 2850 } 2851 vm_object_chain_release(object); 2852 vm_object_drop(object); 2853 } 2854 2855 /* 2856 * Delete the entry (which may delete the object) only after 2857 * removing all pmap entries pointing to its pages. 2858 * (Otherwise, its page frames may be reallocated, and any 2859 * modify bits will be set in the wrong object!) 2860 */ 2861 vm_map_entry_delete(map, entry, countp); 2862 entry = next; 2863 } 2864 lwkt_reltoken(&map->token); 2865 return (KERN_SUCCESS); 2866 } 2867 2868 /* 2869 * Remove the given address range from the target map. 2870 * This is the exported form of vm_map_delete. 2871 * 2872 * No requirements. 2873 */ 2874 int 2875 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end) 2876 { 2877 int result; 2878 int count; 2879 2880 count = vm_map_entry_reserve(MAP_RESERVE_COUNT); 2881 vm_map_lock(map); 2882 VM_MAP_RANGE_CHECK(map, start, end); 2883 result = vm_map_delete(map, start, end, &count); 2884 vm_map_unlock(map); 2885 vm_map_entry_release(count); 2886 2887 return (result); 2888 } 2889 2890 /* 2891 * Assert that the target map allows the specified privilege on the 2892 * entire address region given. The entire region must be allocated. 2893 * 2894 * The caller must specify whether the vm_map is already locked or not. 2895 */ 2896 boolean_t 2897 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end, 2898 vm_prot_t protection, boolean_t have_lock) 2899 { 2900 vm_map_entry_t entry; 2901 vm_map_entry_t tmp_entry; 2902 boolean_t result; 2903 2904 if (have_lock == FALSE) 2905 vm_map_lock_read(map); 2906 2907 if (!vm_map_lookup_entry(map, start, &tmp_entry)) { 2908 if (have_lock == FALSE) 2909 vm_map_unlock_read(map); 2910 return (FALSE); 2911 } 2912 entry = tmp_entry; 2913 2914 result = TRUE; 2915 while (start < end) { 2916 if (entry == &map->header) { 2917 result = FALSE; 2918 break; 2919 } 2920 /* 2921 * No holes allowed! 2922 */ 2923 2924 if (start < entry->start) { 2925 result = FALSE; 2926 break; 2927 } 2928 /* 2929 * Check protection associated with entry. 
2930 */ 2931 2932 if ((entry->protection & protection) != protection) { 2933 result = FALSE; 2934 break; 2935 } 2936 /* go to next entry */ 2937 2938 start = entry->end; 2939 entry = entry->next; 2940 } 2941 if (have_lock == FALSE) 2942 vm_map_unlock_read(map); 2943 return (result); 2944 } 2945 2946 /* 2947 * If appropriate this function shadows the original object with a new object 2948 * and moves the VM pages from the original object to the new object. 2949 * The original object will also be collapsed, if possible. 2950 * 2951 * We can only do this for normal memory objects with a single mapping, and 2952 * it only makes sense to do it if there are 2 or more refs on the original 2953 * object. i.e. typically a memory object that has been extended into 2954 * multiple vm_map_entry's with non-overlapping ranges. 2955 * 2956 * This makes it easier to remove unused pages and keeps object inheritance 2957 * from being a negative impact on memory usage. 2958 * 2959 * On return the (possibly new) entry->object.vm_object will have an 2960 * additional ref on it for the caller to dispose of (usually by cloning 2961 * the vm_map_entry). The additional ref had to be done in this routine 2962 * to avoid racing a collapse. The object's ONEMAPPING flag will also be 2963 * cleared. 2964 * 2965 * The vm_map must be locked and its token held. 2966 */ 2967 static void 2968 vm_map_split(vm_map_entry_t entry) 2969 { 2970 #if 0 2971 /* UNOPTIMIZED */ 2972 vm_object_t oobject; 2973 2974 oobject = entry->object.vm_object; 2975 vm_object_hold(oobject); 2976 vm_object_chain_wait(oobject); 2977 vm_object_reference_locked(oobject); 2978 vm_object_clear_flag(oobject, OBJ_ONEMAPPING); 2979 vm_object_drop(oobject); 2980 #else 2981 /* OPTIMIZED */ 2982 vm_object_t oobject, nobject, bobject; 2983 vm_offset_t s, e; 2984 vm_page_t m; 2985 vm_pindex_t offidxstart, offidxend, idx; 2986 vm_size_t size; 2987 vm_ooffset_t offset; 2988 2989 /* 2990 * Setup. Chain lock the original object throughout the entire 2991 * routine to prevent new page faults from occurring. 2992 * 2993 * XXX can madvise WILLNEED interfere with us too? 2994 */ 2995 oobject = entry->object.vm_object; 2996 vm_object_hold(oobject); 2997 vm_object_chain_acquire(oobject); 2998 2999 /* 3000 * Original object cannot be split? 3001 */ 3002 if (oobject->handle == NULL || (oobject->type != OBJT_DEFAULT && 3003 oobject->type != OBJT_SWAP)) { 3004 vm_object_chain_release(oobject); 3005 vm_object_reference_locked(oobject); 3006 vm_object_clear_flag(oobject, OBJ_ONEMAPPING); 3007 vm_object_drop(oobject); 3008 return; 3009 } 3010 3011 /* 3012 * Collapse original object with its backing store as an 3013 * optimization to reduce chain lengths when possible. 3014 * 3015 * If ref_count <= 1 there aren't other non-overlapping vm_map_entry's 3016 * for oobject, so there's no point collapsing it. 3017 * 3018 * Then re-check whether the object can be split. 3019 */ 3020 vm_object_collapse(oobject, NULL); 3021 3022 if (oobject->ref_count <= 1 || 3023 (oobject->type != OBJT_DEFAULT && oobject->type != OBJT_SWAP) || 3024 (oobject->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) != OBJ_ONEMAPPING) { 3025 vm_object_chain_release(oobject); 3026 vm_object_reference_locked(oobject); 3027 vm_object_clear_flag(oobject, OBJ_ONEMAPPING); 3028 vm_object_drop(oobject); 3029 return; 3030 } 3031 3032 /* 3033 * Acquire the chain lock on the backing object. 3034 * 3035 * Give bobject an additional ref count for when it will be shadowed 3036 * by nobject.
3037 */ 3038 if ((bobject = oobject->backing_object) != NULL) { 3039 vm_object_hold(bobject); 3040 vm_object_chain_wait(bobject); 3041 vm_object_reference_locked(bobject); 3042 vm_object_chain_acquire(bobject); 3043 KKASSERT(bobject->backing_object == bobject); 3044 KKASSERT((bobject->flags & OBJ_DEAD) == 0); 3045 } 3046 3047 /* 3048 * Calculate the object page range and allocate the new object. 3049 */ 3050 offset = entry->offset; 3051 s = entry->start; 3052 e = entry->end; 3053 3054 offidxstart = OFF_TO_IDX(offset); 3055 offidxend = offidxstart + OFF_TO_IDX(e - s); 3056 size = offidxend - offidxstart; 3057 3058 switch(oobject->type) { 3059 case OBJT_DEFAULT: 3060 nobject = default_pager_alloc(NULL, IDX_TO_OFF(size), 3061 VM_PROT_ALL, 0); 3062 break; 3063 case OBJT_SWAP: 3064 nobject = swap_pager_alloc(NULL, IDX_TO_OFF(size), 3065 VM_PROT_ALL, 0); 3066 break; 3067 default: 3068 /* not reached */ 3069 nobject = NULL; 3070 KKASSERT(0); 3071 } 3072 3073 if (nobject == NULL) { 3074 if (bobject) { 3075 vm_object_chain_release(bobject); 3076 vm_object_deallocate(bobject); 3077 vm_object_drop(bobject); 3078 } 3079 vm_object_chain_release(oobject); 3080 vm_object_reference_locked(oobject); 3081 vm_object_clear_flag(oobject, OBJ_ONEMAPPING); 3082 vm_object_drop(oobject); 3083 return; 3084 } 3085 3086 /* 3087 * The new object will replace entry->object.vm_object so it needs 3088 * a second reference (the caller expects an additional ref). 3089 */ 3090 vm_object_hold(nobject); 3091 vm_object_reference_locked(nobject); 3092 vm_object_chain_acquire(nobject); 3093 3094 /* 3095 * nobject shadows bobject (oobject already shadows bobject). 3096 */ 3097 if (bobject) { 3098 nobject->backing_object_offset = 3099 oobject->backing_object_offset + IDX_TO_OFF(offidxstart); 3100 nobject->backing_object = bobject; 3101 bobject->shadow_count++; 3102 bobject->generation++; 3103 LIST_INSERT_HEAD(&bobject->shadow_head, nobject, shadow_list); 3104 vm_object_clear_flag(bobject, OBJ_ONEMAPPING); /* XXX? */ 3105 vm_object_chain_release(bobject); 3106 vm_object_drop(bobject); 3107 } 3108 3109 /* 3110 * Move the VM pages from oobject to nobject 3111 */ 3112 for (idx = 0; idx < size; idx++) { 3113 vm_page_t m; 3114 3115 m = vm_page_lookup_busy_wait(oobject, offidxstart + idx, 3116 TRUE, "vmpg"); 3117 if (m == NULL) 3118 continue; 3119 3120 /* 3121 * We must wait for pending I/O to complete before we can 3122 * rename the page. 3123 * 3124 * We do not have to VM_PROT_NONE the page as mappings should 3125 * not be changed by this operation. 3126 * 3127 * NOTE: The act of renaming a page updates chaingen for both 3128 * objects. 3129 */ 3130 vm_page_rename(m, nobject, idx); 3131 /* page automatically made dirty by rename and cache handled */ 3132 /* page remains busy */ 3133 } 3134 3135 if (oobject->type == OBJT_SWAP) { 3136 vm_object_pip_add(oobject, 1); 3137 /* 3138 * copy oobject pages into nobject and destroy unneeded 3139 * pages in shadow object. 3140 */ 3141 swap_pager_copy(oobject, nobject, offidxstart, 0); 3142 vm_object_pip_wakeup(oobject); 3143 } 3144 3145 /* 3146 * Wakeup the pages we played with. No spl protection is needed 3147 * for a simple wakeup. 
3148 */ 3149 for (idx = 0; idx < size; idx++) { 3150 m = vm_page_lookup(nobject, idx); 3151 if (m) { 3152 KKASSERT(m->flags & PG_BUSY); 3153 vm_page_wakeup(m); 3154 } 3155 } 3156 entry->object.vm_object = nobject; 3157 entry->offset = 0LL; 3158 3159 /* 3160 * Cleanup 3161 * 3162 * NOTE: There is no need to remove OBJ_ONEMAPPING from oobject, the 3163 * related pages were moved and are no longer applicable to the 3164 * original object. 3165 * 3166 * NOTE: Deallocate oobject (due to its entry->object.vm_object being 3167 * replaced by nobject). 3168 */ 3169 vm_object_chain_release(nobject); 3170 vm_object_drop(nobject); 3171 if (bobject) { 3172 vm_object_chain_release(bobject); 3173 vm_object_drop(bobject); 3174 } 3175 vm_object_chain_release(oobject); 3176 /*vm_object_clear_flag(oobject, OBJ_ONEMAPPING);*/ 3177 vm_object_deallocate_locked(oobject); 3178 vm_object_drop(oobject); 3179 #endif 3180 } 3181 3182 /* 3183 * Copies the contents of the source entry to the destination 3184 * entry. The entries *must* be aligned properly. 3185 * 3186 * The vm_maps must be exclusively locked. 3187 * The vm_map's token must be held. 3188 * 3189 * Because the maps are locked no faults can be in progress during the 3190 * operation. 3191 */ 3192 static void 3193 vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map, 3194 vm_map_entry_t src_entry, vm_map_entry_t dst_entry) 3195 { 3196 vm_object_t src_object; 3197 3198 if (dst_entry->maptype == VM_MAPTYPE_SUBMAP) 3199 return; 3200 if (src_entry->maptype == VM_MAPTYPE_SUBMAP) 3201 return; 3202 3203 if (src_entry->wired_count == 0) { 3204 /* 3205 * If the source entry is marked needs_copy, it is already 3206 * write-protected. 3207 */ 3208 if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) { 3209 pmap_protect(src_map->pmap, 3210 src_entry->start, 3211 src_entry->end, 3212 src_entry->protection & ~VM_PROT_WRITE); 3213 } 3214 3215 /* 3216 * Make a copy of the object. 3217 * 3218 * The object must be locked prior to checking the object type 3219 * and for the call to vm_object_collapse() and vm_map_split(). 3220 * We cannot use *_hold() here because the split code will 3221 * probably try to destroy the object. The lock is a pool 3222 * token and doesn't care. 3223 * 3224 * We must bump src_map->timestamp when setting 3225 * MAP_ENTRY_NEEDS_COPY to force any concurrent fault 3226 * to retry, otherwise the concurrent fault might improperly 3227 * install a RW pte when it's supposed to be a RO(COW) pte. 3228 * This race can occur because a vnode-backed fault may have 3229 * to temporarily release the map lock. 3230 */ 3231 if (src_entry->object.vm_object != NULL) { 3232 vm_map_split(src_entry); 3233 src_object = src_entry->object.vm_object; 3234 dst_entry->object.vm_object = src_object; 3235 src_entry->eflags |= (MAP_ENTRY_COW | 3236 MAP_ENTRY_NEEDS_COPY); 3237 dst_entry->eflags |= (MAP_ENTRY_COW | 3238 MAP_ENTRY_NEEDS_COPY); 3239 dst_entry->offset = src_entry->offset; 3240 ++src_map->timestamp; 3241 } else { 3242 dst_entry->object.vm_object = NULL; 3243 dst_entry->offset = 0; 3244 } 3245 3246 pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start, 3247 dst_entry->end - dst_entry->start, src_entry->start); 3248 } else { 3249 /* 3250 * Of course, wired down pages can't be set copy-on-write.
Cause wired pages to be copied into the new map by 3252 * simulating faults (the new pages are pageable) 3253 */ 3254 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry); 3255 } 3256 } 3257 3258 /* 3259 * vmspace_fork: 3260 * Create a new process vmspace structure and vm_map 3261 * based on those of an existing process. The new map 3262 * is based on the old map, according to the inheritance 3263 * values on the regions in that map. 3264 * 3265 * The source map must not be locked. 3266 * No requirements. 3267 */ 3268 struct vmspace * 3269 vmspace_fork(struct vmspace *vm1) 3270 { 3271 struct vmspace *vm2; 3272 vm_map_t old_map = &vm1->vm_map; 3273 vm_map_t new_map; 3274 vm_map_entry_t old_entry; 3275 vm_map_entry_t new_entry; 3276 vm_object_t object; 3277 int count; 3278 3279 lwkt_gettoken(&vm1->vm_map.token); 3280 vm_map_lock(old_map); 3281 3282 vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset); 3283 lwkt_gettoken(&vm2->vm_map.token); 3284 bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy, 3285 (caddr_t)&vm1->vm_endcopy - (caddr_t)&vm1->vm_startcopy); 3286 new_map = &vm2->vm_map; /* XXX */ 3287 new_map->timestamp = 1; 3288 3289 vm_map_lock(new_map); 3290 3291 count = 0; 3292 old_entry = old_map->header.next; 3293 while (old_entry != &old_map->header) { 3294 ++count; 3295 old_entry = old_entry->next; 3296 } 3297 3298 count = vm_map_entry_reserve(count + MAP_RESERVE_COUNT); 3299 3300 old_entry = old_map->header.next; 3301 while (old_entry != &old_map->header) { 3302 if (old_entry->maptype == VM_MAPTYPE_SUBMAP) 3303 panic("vm_map_fork: encountered a submap"); 3304 3305 switch (old_entry->inheritance) { 3306 case VM_INHERIT_NONE: 3307 break; 3308 case VM_INHERIT_SHARE: 3309 /* 3310 * Clone the entry, creating the shared object if 3311 * necessary. 3312 */ 3313 if (old_entry->object.vm_object == NULL) 3314 vm_map_entry_allocate_object(old_entry); 3315 3316 if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) { 3317 /* 3318 * Shadow a map_entry which needs a copy, 3319 * replacing its object with a new object 3320 * that points to the old one. Ask the 3321 * shadow code to automatically add an 3322 * additional ref. We can't do it afterwards 3323 * because we might race a collapse. The call 3324 * to vm_map_entry_shadow() will also clear 3325 * OBJ_ONEMAPPING. 3326 */ 3327 vm_map_entry_shadow(old_entry, 1); 3328 } else { 3329 /* 3330 * We will make a shared copy of the object, 3331 * and must clear OBJ_ONEMAPPING. 3332 * 3333 * XXX assert that object.vm_object != NULL 3334 * since we allocate it above. 3335 */ 3336 if (old_entry->object.vm_object) { 3337 object = old_entry->object.vm_object; 3338 vm_object_hold(object); 3339 vm_object_chain_wait(object); 3340 vm_object_reference_locked(object); 3341 vm_object_clear_flag(object, 3342 OBJ_ONEMAPPING); 3343 vm_object_drop(object); 3344 } 3345 } 3346 3347 /* 3348 * Clone the entry. We've already bumped the ref on 3349 * any vm_object. 3350 */ 3351 new_entry = vm_map_entry_create(new_map, &count); 3352 *new_entry = *old_entry; 3353 new_entry->eflags &= ~MAP_ENTRY_USER_WIRED; 3354 new_entry->wired_count = 0; 3355 3356 /* 3357 * Insert the entry into the new map -- we know we're 3358 * inserting at the end of the new map.
3359 */ 3360 3361 vm_map_entry_link(new_map, new_map->header.prev, 3362 new_entry); 3363 3364 /* 3365 * Update the physical map 3366 */ 3367 pmap_copy(new_map->pmap, old_map->pmap, 3368 new_entry->start, 3369 (old_entry->end - old_entry->start), 3370 old_entry->start); 3371 break; 3372 case VM_INHERIT_COPY: 3373 /* 3374 * Clone the entry and link into the map. 3375 */ 3376 new_entry = vm_map_entry_create(new_map, &count); 3377 *new_entry = *old_entry; 3378 new_entry->eflags &= ~MAP_ENTRY_USER_WIRED; 3379 new_entry->wired_count = 0; 3380 new_entry->object.vm_object = NULL; 3381 vm_map_entry_link(new_map, new_map->header.prev, 3382 new_entry); 3383 vm_map_copy_entry(old_map, new_map, old_entry, 3384 new_entry); 3385 break; 3386 } 3387 old_entry = old_entry->next; 3388 } 3389 3390 new_map->size = old_map->size; 3391 vm_map_unlock(old_map); 3392 vm_map_unlock(new_map); 3393 vm_map_entry_release(count); 3394 3395 lwkt_reltoken(&vm2->vm_map.token); 3396 lwkt_reltoken(&vm1->vm_map.token); 3397 3398 return (vm2); 3399 } 3400 3401 /* 3402 * Create an auto-grow stack entry 3403 * 3404 * No requirements. 3405 */ 3406 int 3407 vm_map_stack (vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, 3408 int flags, vm_prot_t prot, vm_prot_t max, int cow) 3409 { 3410 vm_map_entry_t prev_entry; 3411 vm_map_entry_t new_stack_entry; 3412 vm_size_t init_ssize; 3413 int rv; 3414 int count; 3415 vm_offset_t tmpaddr; 3416 3417 cow |= MAP_IS_STACK; 3418 3419 if (max_ssize < sgrowsiz) 3420 init_ssize = max_ssize; 3421 else 3422 init_ssize = sgrowsiz; 3423 3424 count = vm_map_entry_reserve(MAP_RESERVE_COUNT); 3425 vm_map_lock(map); 3426 3427 /* 3428 * Find space for the mapping 3429 */ 3430 if ((flags & (MAP_FIXED | MAP_TRYFIXED)) == 0) { 3431 if (vm_map_findspace(map, addrbos, max_ssize, 1, 3432 flags, &tmpaddr)) { 3433 vm_map_unlock(map); 3434 vm_map_entry_release(count); 3435 return (KERN_NO_SPACE); 3436 } 3437 addrbos = tmpaddr; 3438 } 3439 3440 /* If addr is already mapped, no go */ 3441 if (vm_map_lookup_entry(map, addrbos, &prev_entry)) { 3442 vm_map_unlock(map); 3443 vm_map_entry_release(count); 3444 return (KERN_NO_SPACE); 3445 } 3446 3447 #if 0 3448 /* XXX already handled by kern_mmap() */ 3449 /* If we would blow our VMEM resource limit, no go */ 3450 if (map->size + init_ssize > 3451 curproc->p_rlimit[RLIMIT_VMEM].rlim_cur) { 3452 vm_map_unlock(map); 3453 vm_map_entry_release(count); 3454 return (KERN_NO_SPACE); 3455 } 3456 #endif 3457 3458 /* 3459 * If we can't accommodate max_ssize in the current mapping, 3460 * no go. However, we need to be aware that subsequent user 3461 * mappings might map into the space we have reserved for 3462 * stack, and currently this space is not protected. 3463 * 3464 * Hopefully we will at least detect this condition 3465 * when we try to grow the stack. 3466 */ 3467 if ((prev_entry->next != &map->header) && 3468 (prev_entry->next->start < addrbos + max_ssize)) { 3469 vm_map_unlock(map); 3470 vm_map_entry_release(count); 3471 return (KERN_NO_SPACE); 3472 } 3473 3474 /* 3475 * We initially map a stack of only init_ssize. We will 3476 * grow as needed later. Since this is to be a grow 3477 * down stack, we map at the top of the range. 3478 * 3479 * Note: we would normally expect prot and max to be 3480 * VM_PROT_ALL, and cow to be 0. Possibly we should 3481 * eliminate these as input parameters, and just 3482 * pass these values here in the insert call.
3483 */ 3484 rv = vm_map_insert(map, &count, 3485 NULL, 0, addrbos + max_ssize - init_ssize, 3486 addrbos + max_ssize, 3487 VM_MAPTYPE_NORMAL, 3488 prot, max, 3489 cow); 3490 3491 /* Now set the avail_ssize amount */ 3492 if (rv == KERN_SUCCESS) { 3493 if (prev_entry != &map->header) 3494 vm_map_clip_end(map, prev_entry, addrbos + max_ssize - init_ssize, &count); 3495 new_stack_entry = prev_entry->next; 3496 if (new_stack_entry->end != addrbos + max_ssize || 3497 new_stack_entry->start != addrbos + max_ssize - init_ssize) 3498 panic ("Bad entry start/end for new stack entry"); 3499 else 3500 new_stack_entry->aux.avail_ssize = max_ssize - init_ssize; 3501 } 3502 3503 vm_map_unlock(map); 3504 vm_map_entry_release(count); 3505 return (rv); 3506 } 3507 3508 /* 3509 * Attempts to grow a vm stack entry. Returns KERN_SUCCESS if the 3510 * desired address is already mapped, or if we successfully grow 3511 * the stack. Also returns KERN_SUCCESS if addr is outside the 3512 * stack range (this is strange, but preserves compatibility with 3513 * the grow function in vm_machdep.c). 3514 * 3515 * No requirements. 3516 */ 3517 int 3518 vm_map_growstack (struct proc *p, vm_offset_t addr) 3519 { 3520 vm_map_entry_t prev_entry; 3521 vm_map_entry_t stack_entry; 3522 vm_map_entry_t new_stack_entry; 3523 struct vmspace *vm = p->p_vmspace; 3524 vm_map_t map = &vm->vm_map; 3525 vm_offset_t end; 3526 int grow_amount; 3527 int rv = KERN_SUCCESS; 3528 int is_procstack; 3529 int use_read_lock = 1; 3530 int count; 3531 3532 count = vm_map_entry_reserve(MAP_RESERVE_COUNT); 3533 Retry: 3534 if (use_read_lock) 3535 vm_map_lock_read(map); 3536 else 3537 vm_map_lock(map); 3538 3539 /* If addr is already in the entry range, no need to grow.*/ 3540 if (vm_map_lookup_entry(map, addr, &prev_entry)) 3541 goto done; 3542 3543 if ((stack_entry = prev_entry->next) == &map->header) 3544 goto done; 3545 if (prev_entry == &map->header) 3546 end = stack_entry->start - stack_entry->aux.avail_ssize; 3547 else 3548 end = prev_entry->end; 3549 3550 /* 3551 * This next test mimics the old grow function in vm_machdep.c. 3552 * It really doesn't quite make sense, but we do it anyway 3553 * for compatibility. 3554 * 3555 * If the stack is not growable, return success. This signals 3556 * the caller to proceed as it normally would with normal vm. 3557 */ 3558 if (stack_entry->aux.avail_ssize < 1 || 3559 addr >= stack_entry->start || 3560 addr < stack_entry->start - stack_entry->aux.avail_ssize) { 3561 goto done; 3562 } 3563 3564 /* Find the minimum grow amount */ 3565 grow_amount = roundup (stack_entry->start - addr, PAGE_SIZE); 3566 if (grow_amount > stack_entry->aux.avail_ssize) { 3567 rv = KERN_NO_SPACE; 3568 goto done; 3569 } 3570 3571 /* 3572 * If there is no longer enough space between the entries, the 3573 * grow fails (no go) and we adjust the available space. Note: 3574 * this should only happen if the user has mapped into the 3575 * stack area after the stack was created, and is 3576 * probably an error. 3577 * 3578 * This also effectively destroys any guard page the user 3579 * might have intended by limiting the stack size.
3580 */ 3581 if (grow_amount > stack_entry->start - end) { 3582 if (use_read_lock && vm_map_lock_upgrade(map)) { 3583 /* lost lock */ 3584 use_read_lock = 0; 3585 goto Retry; 3586 } 3587 use_read_lock = 0; 3588 stack_entry->aux.avail_ssize = stack_entry->start - end; 3589 rv = KERN_NO_SPACE; 3590 goto done; 3591 } 3592 3593 is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr; 3594 3595 /* If this is the main process stack, see if we're over the 3596 * stack limit. 3597 */ 3598 if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > 3599 p->p_rlimit[RLIMIT_STACK].rlim_cur)) { 3600 rv = KERN_NO_SPACE; 3601 goto done; 3602 } 3603 3604 /* Round up the grow amount modulo SGROWSIZ */ 3605 grow_amount = roundup (grow_amount, sgrowsiz); 3606 if (grow_amount > stack_entry->aux.avail_ssize) { 3607 grow_amount = stack_entry->aux.avail_ssize; 3608 } 3609 if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > 3610 p->p_rlimit[RLIMIT_STACK].rlim_cur)) { 3611 grow_amount = p->p_rlimit[RLIMIT_STACK].rlim_cur - 3612 ctob(vm->vm_ssize); 3613 } 3614 3615 /* If we would blow our VMEM resource limit, no go */ 3616 if (map->size + grow_amount > p->p_rlimit[RLIMIT_VMEM].rlim_cur) { 3617 rv = KERN_NO_SPACE; 3618 goto done; 3619 } 3620 3621 if (use_read_lock && vm_map_lock_upgrade(map)) { 3622 /* lost lock */ 3623 use_read_lock = 0; 3624 goto Retry; 3625 } 3626 use_read_lock = 0; 3627 3628 /* Get the preliminary new entry start value */ 3629 addr = stack_entry->start - grow_amount; 3630 3631 /* If this puts us into the previous entry, cut back our growth 3632 * to the available space. Also, see the note above. 3633 */ 3634 if (addr < end) { 3635 stack_entry->aux.avail_ssize = stack_entry->start - end; 3636 addr = end; 3637 } 3638 3639 rv = vm_map_insert(map, &count, 3640 NULL, 0, addr, stack_entry->start, 3641 VM_MAPTYPE_NORMAL, 3642 VM_PROT_ALL, VM_PROT_ALL, 3643 0); 3644 3645 /* Adjust the available stack space by the amount we grew. */ 3646 if (rv == KERN_SUCCESS) { 3647 if (prev_entry != &map->header) 3648 vm_map_clip_end(map, prev_entry, addr, &count); 3649 new_stack_entry = prev_entry->next; 3650 if (new_stack_entry->end != stack_entry->start || 3651 new_stack_entry->start != addr) 3652 panic ("Bad stack grow start/end in new stack entry"); 3653 else { 3654 new_stack_entry->aux.avail_ssize = 3655 stack_entry->aux.avail_ssize - 3656 (new_stack_entry->end - new_stack_entry->start); 3657 if (is_procstack) 3658 vm->vm_ssize += btoc(new_stack_entry->end - 3659 new_stack_entry->start); 3660 } 3661 3662 if (map->flags & MAP_WIREFUTURE) 3663 vm_map_unwire(map, new_stack_entry->start, 3664 new_stack_entry->end, FALSE); 3665 } 3666 3667 done: 3668 if (use_read_lock) 3669 vm_map_unlock_read(map); 3670 else 3671 vm_map_unlock(map); 3672 vm_map_entry_release(count); 3673 return (rv); 3674 } 3675 3676 /* 3677 * Unshare the specified VM space for exec. If other processes are 3678 * mapped to it, then create a new one. The new vmspace is null. 3679 * 3680 * No requirements. 3681 */ 3682 void 3683 vmspace_exec(struct proc *p, struct vmspace *vmcopy) 3684 { 3685 struct vmspace *oldvmspace = p->p_vmspace; 3686 struct vmspace *newvmspace; 3687 vm_map_t map = &p->p_vmspace->vm_map; 3688 3689 /* 3690 * If we are execing a resident vmspace we fork it, otherwise 3691 * we create a new vmspace. Note that exitingcnt is not 3692 * copied to the new vmspace. 
3693 */ 3694 lwkt_gettoken(&oldvmspace->vm_map.token); 3695 if (vmcopy) { 3696 newvmspace = vmspace_fork(vmcopy); 3697 lwkt_gettoken(&newvmspace->vm_map.token); 3698 } else { 3699 newvmspace = vmspace_alloc(map->min_offset, map->max_offset); 3700 lwkt_gettoken(&newvmspace->vm_map.token); 3701 bcopy(&oldvmspace->vm_startcopy, &newvmspace->vm_startcopy, 3702 (caddr_t)&oldvmspace->vm_endcopy - 3703 (caddr_t)&oldvmspace->vm_startcopy); 3704 } 3705 3706 /* 3707 * Finish initializing the vmspace before assigning it 3708 * to the process. The vmspace will become the current vmspace 3709 * if p == curproc. 3710 */ 3711 pmap_pinit2(vmspace_pmap(newvmspace)); 3712 pmap_replacevm(p, newvmspace, 0); 3713 lwkt_reltoken(&newvmspace->vm_map.token); 3714 lwkt_reltoken(&oldvmspace->vm_map.token); 3715 vmspace_free(oldvmspace); 3716 } 3717 3718 /* 3719 * Unshare the specified VM space for forcing COW. This 3720 * is called by rfork, for the (RFMEM|RFPROC) == 0 case. 3721 */ 3722 void 3723 vmspace_unshare(struct proc *p) 3724 { 3725 struct vmspace *oldvmspace = p->p_vmspace; 3726 struct vmspace *newvmspace; 3727 3728 lwkt_gettoken(&oldvmspace->vm_map.token); 3729 if (oldvmspace->vm_sysref.refcnt == 1) { 3730 lwkt_reltoken(&oldvmspace->vm_map.token); 3731 return; 3732 } 3733 newvmspace = vmspace_fork(oldvmspace); 3734 lwkt_gettoken(&newvmspace->vm_map.token); 3735 pmap_pinit2(vmspace_pmap(newvmspace)); 3736 pmap_replacevm(p, newvmspace, 0); 3737 lwkt_reltoken(&newvmspace->vm_map.token); 3738 lwkt_reltoken(&oldvmspace->vm_map.token); 3739 vmspace_free(oldvmspace); 3740 } 3741 3742 /* 3743 * vm_map_hint: return the beginning of the best area suitable for 3744 * creating a new mapping with "prot" protection. 3745 * 3746 * No requirements. 3747 */ 3748 vm_offset_t 3749 vm_map_hint(struct proc *p, vm_offset_t addr, vm_prot_t prot) 3750 { 3751 struct vmspace *vms = p->p_vmspace; 3752 3753 if (!randomize_mmap) { 3754 /* 3755 * Set a reasonable start point for the hint if it was 3756 * not specified or if it falls within the heap space. 3757 * Hinted mmap()s do not allocate out of the heap space. 3758 */ 3759 if (addr == 0 || 3760 (addr >= round_page((vm_offset_t)vms->vm_taddr) && 3761 addr < round_page((vm_offset_t)vms->vm_daddr + maxdsiz))) { 3762 addr = round_page((vm_offset_t)vms->vm_daddr + maxdsiz); 3763 } 3764 3765 return addr; 3766 } 3767 3768 if (addr != 0 && addr >= (vm_offset_t)vms->vm_daddr) 3769 return addr; 3770 3771 #ifdef notyet 3772 #ifdef __i386__ 3773 /* 3774 * If executable skip first two pages, otherwise start 3775 * after data + heap region. 3776 */ 3777 if ((prot & VM_PROT_EXECUTE) && 3778 ((vm_offset_t)vms->vm_daddr >= I386_MAX_EXE_ADDR)) { 3779 addr = (PAGE_SIZE * 2) + 3780 (karc4random() & (I386_MAX_EXE_ADDR / 2 - 1)); 3781 return (round_page(addr)); 3782 } 3783 #endif /* __i386__ */ 3784 #endif /* notyet */ 3785 3786 addr = (vm_offset_t)vms->vm_daddr + MAXDSIZ; 3787 addr += karc4random() & (MIN((256 * 1024 * 1024), MAXDSIZ) - 1); 3788 3789 return (round_page(addr)); 3790 } 3791 3792 /* 3793 * Finds the VM object, offset, and protection for a given virtual address 3794 * in the specified map, assuming a page fault of the type specified. 3795 * 3796 * Leaves the map in question locked for read; return values are guaranteed 3797 * until a vm_map_lookup_done call is performed. Note that the map argument 3798 * is in/out; the returned map must be used in the call to vm_map_lookup_done. 3799 * 3800 * A handle (out_entry) is returned for use in vm_map_lookup_done, to make 3801 * that fast. 
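 *
 * Illustrative pairing (not part of the original source; a hedged
 * sketch of how a fault-handling caller is expected to consume this
 * routine):
 *
 *	rv = vm_map_lookup(&map, va, VM_PROT_READ, &entry, &object,
 *			   &pindex, &prot, &wired);
 *	if (rv == KERN_SUCCESS) {
 *		... use object and pindex while the read lock is held ...
 *		vm_map_lookup_done(map, entry, 0);
 *	}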
3802 * 3803 * If a lookup is requested with "write protection" specified, the map may 3804 * be changed to perform virtual copying operations, although the data 3805 * referenced will remain the same. 3806 * 3807 * No requirements. 3808 */ 3809 int 3810 vm_map_lookup(vm_map_t *var_map, /* IN/OUT */ 3811 vm_offset_t vaddr, 3812 vm_prot_t fault_typea, 3813 vm_map_entry_t *out_entry, /* OUT */ 3814 vm_object_t *object, /* OUT */ 3815 vm_pindex_t *pindex, /* OUT */ 3816 vm_prot_t *out_prot, /* OUT */ 3817 boolean_t *wired) /* OUT */ 3818 { 3819 vm_map_entry_t entry; 3820 vm_map_t map = *var_map; 3821 vm_prot_t prot; 3822 vm_prot_t fault_type = fault_typea; 3823 int use_read_lock = 1; 3824 int rv = KERN_SUCCESS; 3825 3826 RetryLookup: 3827 if (use_read_lock) 3828 vm_map_lock_read(map); 3829 else 3830 vm_map_lock(map); 3831 3832 /* 3833 * If the map has an interesting hint, try it before calling full 3834 * blown lookup routine. 3835 */ 3836 entry = map->hint; 3837 cpu_ccfence(); 3838 *out_entry = entry; 3839 *object = NULL; 3840 3841 if ((entry == &map->header) || 3842 (vaddr < entry->start) || (vaddr >= entry->end)) { 3843 vm_map_entry_t tmp_entry; 3844 3845 /* 3846 * Entry was either not a valid hint, or the vaddr was not 3847 * contained in the entry, so do a full lookup. 3848 */ 3849 if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) { 3850 rv = KERN_INVALID_ADDRESS; 3851 goto done; 3852 } 3853 3854 entry = tmp_entry; 3855 *out_entry = entry; 3856 } 3857 3858 /* 3859 * Handle submaps. 3860 */ 3861 if (entry->maptype == VM_MAPTYPE_SUBMAP) { 3862 vm_map_t old_map = map; 3863 3864 *var_map = map = entry->object.sub_map; 3865 if (use_read_lock) 3866 vm_map_unlock_read(old_map); 3867 else 3868 vm_map_unlock(old_map); 3869 use_read_lock = 1; 3870 goto RetryLookup; 3871 } 3872 3873 /* 3874 * Check whether this task is allowed to have this page. 3875 * Note the special case for MAP_ENTRY_COW 3876 * pages with an override. This is to implement a forced 3877 * COW for debuggers. 3878 */ 3879 3880 if (fault_type & VM_PROT_OVERRIDE_WRITE) 3881 prot = entry->max_protection; 3882 else 3883 prot = entry->protection; 3884 3885 fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); 3886 if ((fault_type & prot) != fault_type) { 3887 rv = KERN_PROTECTION_FAILURE; 3888 goto done; 3889 } 3890 3891 if ((entry->eflags & MAP_ENTRY_USER_WIRED) && 3892 (entry->eflags & MAP_ENTRY_COW) && 3893 (fault_type & VM_PROT_WRITE) && 3894 (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) { 3895 rv = KERN_PROTECTION_FAILURE; 3896 goto done; 3897 } 3898 3899 /* 3900 * If this page is not pageable, we have to get it for all possible 3901 * accesses. 3902 */ 3903 *wired = (entry->wired_count != 0); 3904 if (*wired) 3905 prot = fault_type = entry->protection; 3906 3907 /* 3908 * Virtual page tables may need to update the accessed (A) bit 3909 * in a page table entry. Upgrade the fault to a write fault for 3910 * that case if the map will support it. If the map does not support 3911 * it the page table entry simply will not be updated. 3912 */ 3913 if (entry->maptype == VM_MAPTYPE_VPAGETABLE) { 3914 if (prot & VM_PROT_WRITE) 3915 fault_type |= VM_PROT_WRITE; 3916 } 3917 3918 /* 3919 * If the entry was copy-on-write, we either ... 3920 */ 3921 if (entry->eflags & MAP_ENTRY_NEEDS_COPY) { 3922 /* 3923 * If we want to write the page, we may as well handle that 3924 * now since we've got the map locked. 3925 * 3926 * If we don't need to write the page, we just demote the 3927 * permissions allowed. 
3928 */ 3929 3930 if (fault_type & VM_PROT_WRITE) { 3931 /* 3932 * Make a new object, and place it in the object 3933 * chain. Note that no new references have appeared 3934 * -- one just moved from the map to the new 3935 * object. 3936 */ 3937 3938 if (use_read_lock && vm_map_lock_upgrade(map)) { 3939 /* lost lock */ 3940 use_read_lock = 0; 3941 goto RetryLookup; 3942 } 3943 use_read_lock = 0; 3944 3945 vm_map_entry_shadow(entry, 0); 3946 } else { 3947 /* 3948 * We're attempting to read a copy-on-write page -- 3949 * don't allow writes. 3950 */ 3951 3952 prot &= ~VM_PROT_WRITE; 3953 } 3954 } 3955 3956 /* 3957 * Create an object if necessary. 3958 */ 3959 if (entry->object.vm_object == NULL && !map->system_map) { 3960 if (use_read_lock && vm_map_lock_upgrade(map)) { 3961 /* lost lock */ 3962 use_read_lock = 0; 3963 goto RetryLookup; 3964 } 3965 use_read_lock = 0; 3966 vm_map_entry_allocate_object(entry); 3967 } 3968 3969 /* 3970 * Return the object/offset from this entry. If the entry was 3971 * copy-on-write or empty, it has been fixed up. 3972 */ 3973 3974 *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset); 3975 *object = entry->object.vm_object; 3976 3977 /* 3978 * Return whether this is the only map sharing this data. On 3979 * success we return with a read lock held on the map. On failure 3980 * we return with the map unlocked. 3981 */ 3982 *out_prot = prot; 3983 done: 3984 if (rv == KERN_SUCCESS) { 3985 if (use_read_lock == 0) 3986 vm_map_lock_downgrade(map); 3987 } else if (use_read_lock) { 3988 vm_map_unlock_read(map); 3989 } else { 3990 vm_map_unlock(map); 3991 } 3992 return (rv); 3993 } 3994 3995 /* 3996 * Releases locks acquired by a vm_map_lookup() 3997 * (according to the handle returned by that lookup). 3998 * 3999 * No other requirements. 4000 */ 4001 void 4002 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry, int count) 4003 { 4004 /* 4005 * Unlock the main-level map 4006 */ 4007 vm_map_unlock_read(map); 4008 if (count) 4009 vm_map_entry_release(count); 4010 } 4011 4012 #include "opt_ddb.h" 4013 #ifdef DDB 4014 #include <sys/kernel.h> 4015 4016 #include <ddb/ddb.h> 4017 4018 /* 4019 * Debugging only 4020 */ 4021 DB_SHOW_COMMAND(map, vm_map_print) 4022 { 4023 static int nlines; 4024 /* XXX convert args. */ 4025 vm_map_t map = (vm_map_t)addr; 4026 boolean_t full = have_addr; 4027 4028 vm_map_entry_t entry; 4029 4030 db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n", 4031 (void *)map, 4032 (void *)map->pmap, map->nentries, map->timestamp); 4033 nlines++; 4034 4035 if (!full && db_indent) 4036 return; 4037 4038 db_indent += 2; 4039 for (entry = map->header.next; entry != &map->header; 4040 entry = entry->next) { 4041 db_iprintf("map entry %p: start=%p, end=%p\n", 4042 (void *)entry, (void *)entry->start, (void *)entry->end); 4043 nlines++; 4044 { 4045 static char *inheritance_name[4] = 4046 {"share", "copy", "none", "donate_copy"}; 4047 4048 db_iprintf(" prot=%x/%x/%s", 4049 entry->protection, 4050 entry->max_protection, 4051 inheritance_name[(int)(unsigned char)entry->inheritance]); 4052 if (entry->wired_count != 0) 4053 db_printf(", wired"); 4054 } 4055 if (entry->maptype == VM_MAPTYPE_SUBMAP) { 4056 /* XXX no %qd in kernel. Truncate entry->offset. 

#include "opt_ddb.h"
#ifdef DDB
#include <sys/kernel.h>

#include <ddb/ddb.h>

/*
 * Debugging only
 */
DB_SHOW_COMMAND(map, vm_map_print)
{
	static int nlines;
	/* XXX convert args. */
	vm_map_t map = (vm_map_t)addr;
	boolean_t full = have_addr;

	vm_map_entry_t entry;

	db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
		   (void *)map,
		   (void *)map->pmap, map->nentries, map->timestamp);
	nlines++;

	if (!full && db_indent)
		return;

	db_indent += 2;
	for (entry = map->header.next; entry != &map->header;
	     entry = entry->next) {
		db_iprintf("map entry %p: start=%p, end=%p\n",
			   (void *)entry,
			   (void *)entry->start, (void *)entry->end);
		nlines++;
		{
			static char *inheritance_name[4] =
			    {"share", "copy", "none", "donate_copy"};

			db_iprintf(" prot=%x/%x/%s",
				   entry->protection,
				   entry->max_protection,
				   inheritance_name[(int)(unsigned char)
						    entry->inheritance]);
			if (entry->wired_count != 0)
				db_printf(", wired");
		}
		if (entry->maptype == VM_MAPTYPE_SUBMAP) {
			/* XXX no %qd in kernel.  Truncate entry->offset. */
			db_printf(", share=%p, offset=0x%lx\n",
				  (void *)entry->object.sub_map,
				  (long)entry->offset);
			nlines++;
			if ((entry->prev == &map->header) ||
			    (entry->prev->object.sub_map !=
			     entry->object.sub_map)) {
				db_indent += 2;
				vm_map_print((db_expr_t)(intptr_t)
					     entry->object.sub_map,
					     full, 0, NULL);
				db_indent -= 2;
			}
		} else {
			/* XXX no %qd in kernel.  Truncate entry->offset. */
			db_printf(", object=%p, offset=0x%lx",
				  (void *)entry->object.vm_object,
				  (long)entry->offset);
			if (entry->eflags & MAP_ENTRY_COW)
				db_printf(", copy (%s)",
					  (entry->eflags & MAP_ENTRY_NEEDS_COPY) ?
					  "needed" : "done");
			db_printf("\n");
			nlines++;

			if ((entry->prev == &map->header) ||
			    (entry->prev->object.vm_object !=
			     entry->object.vm_object)) {
				db_indent += 2;
				vm_object_print((db_expr_t)(intptr_t)
						entry->object.vm_object,
						full, 0, NULL);
				nlines += 4;
				db_indent -= 2;
			}
		}
	}
	db_indent -= 2;
	if (db_indent == 0)
		nlines = 0;
}

/*
 * Debugging only
 */
DB_SHOW_COMMAND(procvm, procvm)
{
	struct proc *p;

	if (have_addr) {
		p = (struct proc *) addr;
	} else {
		p = curproc;
	}

	db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
		  (void *)p, (void *)p->p_vmspace,
		  (void *)&p->p_vmspace->vm_map,
		  (void *)vmspace_pmap(p->p_vmspace));

	vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
}

#endif /* DDB */
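
/*
 * Illustrative usage sketch, not part of the original source: with a
 * DDB-enabled kernel the two DB_SHOW_COMMAND() entries above are reached
 * through the debugger's "show" command table.  The addresses below are
 * placeholders, and the exact prompt depends on the ddb configuration:
 *
 *	db> show procvm			dump curproc's vmspace, map and pmap
 *	db> show procvm <proc-addr>	same, for an explicit struct proc
 *	db> show map <map-addr>		walk the entries of a vm_map
 *					(supplying an address also selects
 *					full output)
 */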