xref: /dragonfly/sys/vm/vm_map.c (revision e4adeac1)
1 /*
2  * Copyright (c) 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * Copyright (c) 2003-2019 The DragonFly Project.  All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * The Mach Operating System project at Carnegie-Mellon University.
8  *
9  * This code is derived from software contributed to The DragonFly Project
10  * by Matthew Dillon <dillon@backplane.com>
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	from: @(#)vm_map.c	8.3 (Berkeley) 1/12/94
37  *
38  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
39  * All rights reserved.
40  *
41  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
42  *
43  * Permission to use, copy, modify and distribute this software and
44  * its documentation is hereby granted, provided that both the copyright
45  * notice and this permission notice appear in all copies of the
46  * software, derivative works or modified versions, and any portions
47  * thereof, and that both notices appear in supporting documentation.
48  *
49  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
50  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
51  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
52  *
53  * Carnegie Mellon requests users of this software to return to
54  *
55  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
56  *  School of Computer Science
57  *  Carnegie Mellon University
58  *  Pittsburgh PA 15213-3890
59  *
60  * any improvements or extensions that they make and grant Carnegie the
61  * rights to redistribute these changes.
62  */
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/kernel.h>
66 #include <sys/proc.h>
67 #include <sys/serialize.h>
68 #include <sys/lock.h>
69 #include <sys/vmmeter.h>
70 #include <sys/mman.h>
71 #include <sys/vnode.h>
72 #include <sys/resourcevar.h>
73 #include <sys/shm.h>
74 #include <sys/tree.h>
75 #include <sys/malloc.h>
76 #include <sys/objcache.h>
77 #include <sys/kern_syscall.h>
78 
79 #include <vm/vm.h>
80 #include <vm/vm_param.h>
81 #include <vm/pmap.h>
82 #include <vm/vm_map.h>
83 #include <vm/vm_page.h>
84 #include <vm/vm_object.h>
85 #include <vm/vm_pager.h>
86 #include <vm/vm_kern.h>
87 #include <vm/vm_extern.h>
88 #include <vm/swap_pager.h>
89 #include <vm/vm_zone.h>
90 
91 #include <sys/random.h>
92 #include <sys/sysctl.h>
93 #include <sys/spinlock.h>
94 
95 #include <sys/thread2.h>
96 #include <sys/spinlock2.h>
97 
98 /*
99  * Virtual memory maps provide for the mapping, protection, and sharing
100  * of virtual memory objects.  In addition, this module provides for an
101  * efficient virtual copy of memory from one map to another.
102  *
103  * Synchronization is required prior to most operations.
104  *
105  * Maps consist of an ordered doubly-linked list of simple entries.
106  * A hint and a RB tree is used to speed-up lookups.
107  *
108  * Callers looking to modify maps specify start/end addresses which cause
109  * the related map entry to be clipped if necessary, and then later
110  * recombined if the pieces remained compatible.
111  *
112  * Virtual copy operations are performed by copying VM object references
113  * from one map to another, and then marking both regions as copy-on-write.
114  */
115 static boolean_t vmspace_ctor(void *obj, void *privdata, int ocflags);
116 static void vmspace_dtor(void *obj, void *privdata);
117 static void vmspace_terminate(struct vmspace *vm, int final);
118 
119 MALLOC_DEFINE(M_VMSPACE, "vmspace", "vmspace objcache backingstore");
120 MALLOC_DEFINE(M_MAP_BACKING, "map_backing", "vm_map_backing to entry");
121 static struct objcache *vmspace_cache;
122 
123 /*
124  * per-cpu page table cross mappings are initialized in early boot
125  * and might require a considerable number of vm_map_entry structures.
126  */
127 #define MAPENTRYBSP_CACHE	(MAXCPU+1)
128 #define MAPENTRYAP_CACHE	8
129 
130 /*
131  * Partioning threaded programs with large anonymous memory areas can
132  * improve concurrent fault performance.
133  */
134 #define MAP_ENTRY_PARTITION_SIZE	((vm_offset_t)(32 * 1024 * 1024))
135 #define MAP_ENTRY_PARTITION_MASK	(MAP_ENTRY_PARTITION_SIZE - 1)
136 
137 #define VM_MAP_ENTRY_WITHIN_PARTITION(entry)	\
138 	((((entry)->ba.start ^ (entry)->ba.end) & ~MAP_ENTRY_PARTITION_MASK) == 0)
139 
140 static struct vm_zone mapentzone_store;
141 __read_mostly static vm_zone_t mapentzone;
142 
143 static struct vm_map_entry map_entry_init[MAX_MAPENT];
144 static struct vm_map_entry cpu_map_entry_init_bsp[MAPENTRYBSP_CACHE];
145 static struct vm_map_entry cpu_map_entry_init_ap[MAXCPU][MAPENTRYAP_CACHE];
146 
147 __read_mostly static int randomize_mmap;
148 SYSCTL_INT(_vm, OID_AUTO, randomize_mmap, CTLFLAG_RW, &randomize_mmap, 0,
149     "Randomize mmap offsets");
150 __read_mostly static int vm_map_relock_enable = 1;
151 SYSCTL_INT(_vm, OID_AUTO, map_relock_enable, CTLFLAG_RW,
152 	   &vm_map_relock_enable, 0, "insert pop pgtable optimization");
153 __read_mostly static int vm_map_partition_enable = 1;
154 SYSCTL_INT(_vm, OID_AUTO, map_partition_enable, CTLFLAG_RW,
155 	   &vm_map_partition_enable, 0, "Break up larger vm_map_entry's");
156 __read_mostly static int vm_map_backing_limit = 5;
157 SYSCTL_INT(_vm, OID_AUTO, map_backing_limit, CTLFLAG_RW,
158 	   &vm_map_backing_limit, 0, "ba.backing_ba link depth");
159 __read_mostly static int vm_map_backing_shadow_test = 1;
160 SYSCTL_INT(_vm, OID_AUTO, map_backing_shadow_test, CTLFLAG_RW,
161 	   &vm_map_backing_shadow_test, 0, "ba.object shadow test");
162 
163 static void vmspace_drop_notoken(struct vmspace *vm);
164 static void vm_map_entry_shadow(vm_map_entry_t entry);
165 static vm_map_entry_t vm_map_entry_create(int *);
166 static void vm_map_entry_dispose (vm_map_t map, vm_map_entry_t entry, int *);
167 static void vm_map_entry_dispose_ba (vm_map_entry_t entry, vm_map_backing_t ba);
168 static void vm_map_backing_replicated(vm_map_t map,
169 		vm_map_entry_t entry, int flags);
170 static void vm_map_backing_adjust_start(vm_map_entry_t entry,
171 		vm_ooffset_t start);
172 static void vm_map_backing_adjust_end(vm_map_entry_t entry,
173 		vm_ooffset_t end);
174 static void vm_map_backing_attach (vm_map_entry_t entry, vm_map_backing_t ba);
175 static void vm_map_backing_detach (vm_map_entry_t entry, vm_map_backing_t ba);
176 static void _vm_map_clip_end (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
177 static void _vm_map_clip_start (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
178 static void vm_map_entry_delete (vm_map_t, vm_map_entry_t, int *);
179 static void vm_map_entry_unwire (vm_map_t, vm_map_entry_t);
180 static void vm_map_copy_entry (vm_map_t, vm_map_t, vm_map_entry_t,
181 		vm_map_entry_t);
182 static void vm_map_unclip_range (vm_map_t map, vm_map_entry_t start_entry,
183 		vm_offset_t start, vm_offset_t end, int *countp, int flags);
184 static void vm_map_entry_partition(vm_map_t map, vm_map_entry_t entry,
185 		vm_offset_t vaddr, int *countp);
186 
187 #define MAP_BACK_CLIPPED	0x0001
188 #define MAP_BACK_BASEOBJREFD	0x0002
189 
190 /*
191  * Initialize the vm_map module.  Must be called before any other vm_map
192  * routines.
193  *
194  * Map and entry structures are allocated from the general purpose
195  * memory pool with some exceptions:
196  *
197  *	- The kernel map is allocated statically.
198  *	- Initial kernel map entries are allocated out of a static pool.
199  *	- We must set ZONE_SPECIAL here or the early boot code can get
200  *	  stuck if there are >63 cores.
201  *
202  *	These restrictions are necessary since malloc() uses the
203  *	maps and requires map entries.
204  *
205  * Called from the low level boot code only.
206  */
207 void
208 vm_map_startup(void)
209 {
210 	mapentzone = &mapentzone_store;
211 	zbootinit(mapentzone, "MAP ENTRY", sizeof (struct vm_map_entry),
212 		  map_entry_init, MAX_MAPENT);
213 	mapentzone_store.zflags |= ZONE_SPECIAL;
214 }
215 
216 /*
217  * Called prior to any vmspace allocations.
218  *
219  * Called from the low level boot code only.
220  */
221 void
222 vm_init2(void)
223 {
224 	vmspace_cache = objcache_create_mbacked(M_VMSPACE,
225 						sizeof(struct vmspace),
226 						0, ncpus * 4,
227 						vmspace_ctor, vmspace_dtor,
228 						NULL);
229 	zinitna(mapentzone, NULL, 0, 0, ZONE_USE_RESERVE | ZONE_SPECIAL);
230 	pmap_init2();
231 	vm_object_init2();
232 }
233 
234 /*
235  * objcache support.  We leave the pmap root cached as long as possible
236  * for performance reasons.
237  */
238 static
239 boolean_t
240 vmspace_ctor(void *obj, void *privdata, int ocflags)
241 {
242 	struct vmspace *vm = obj;
243 
244 	bzero(vm, sizeof(*vm));
245 	vm->vm_refcnt = VM_REF_DELETED;
246 
247 	return 1;
248 }
249 
250 static
251 void
252 vmspace_dtor(void *obj, void *privdata)
253 {
254 	struct vmspace *vm = obj;
255 
256 	KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
257 	pmap_puninit(vmspace_pmap(vm));
258 }
259 
260 /*
261  * Red black tree functions
262  *
263  * The caller must hold the related map lock.
264  */
265 static int rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b);
266 RB_GENERATE(vm_map_rb_tree, vm_map_entry, rb_entry, rb_vm_map_compare);
267 
268 /* a->ba.start is address, and the only field which must be initialized */
269 static int
270 rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b)
271 {
272 	if (a->ba.start < b->ba.start)
273 		return(-1);
274 	else if (a->ba.start > b->ba.start)
275 		return(1);
276 	return(0);
277 }
278 
279 /*
280  * Initialize vmspace ref/hold counts vmspace0.  There is a holdcnt for
281  * every refcnt.
282  */
283 void
284 vmspace_initrefs(struct vmspace *vm)
285 {
286 	vm->vm_refcnt = 1;
287 	vm->vm_holdcnt = 1;
288 }
289 
290 /*
291  * Allocate a vmspace structure, including a vm_map and pmap.
292  * Initialize numerous fields.  While the initial allocation is zerod,
293  * subsequence reuse from the objcache leaves elements of the structure
294  * intact (particularly the pmap), so portions must be zerod.
295  *
296  * Returns a referenced vmspace.
297  *
298  * No requirements.
299  */
300 struct vmspace *
301 vmspace_alloc(vm_offset_t min, vm_offset_t max)
302 {
303 	struct vmspace *vm;
304 
305 	vm = objcache_get(vmspace_cache, M_WAITOK);
306 
307 	bzero(&vm->vm_startcopy,
308 	      (char *)&vm->vm_endcopy - (char *)&vm->vm_startcopy);
309 	vm_map_init(&vm->vm_map, min, max, NULL);	/* initializes token */
310 
311 	/*
312 	 * NOTE: hold to acquires token for safety.
313 	 *
314 	 * On return vmspace is referenced (refs=1, hold=1).  That is,
315 	 * each refcnt also has a holdcnt.  There can be additional holds
316 	 * (holdcnt) above and beyond the refcnt.  Finalization is handled in
317 	 * two stages, one on refs 1->0, and the the second on hold 1->0.
318 	 */
319 	KKASSERT(vm->vm_holdcnt == 0);
320 	KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
321 	vmspace_initrefs(vm);
322 	vmspace_hold(vm);
323 	pmap_pinit(vmspace_pmap(vm));		/* (some fields reused) */
324 	vm->vm_map.pmap = vmspace_pmap(vm);	/* XXX */
325 	vm->vm_shm = NULL;
326 	vm->vm_flags = 0;
327 	cpu_vmspace_alloc(vm);
328 	vmspace_drop(vm);
329 
330 	return (vm);
331 }
332 
333 /*
334  * NOTE: Can return 0 if the vmspace is exiting.
335  */
336 int
337 vmspace_getrefs(struct vmspace *vm)
338 {
339 	int32_t n;
340 
341 	n = vm->vm_refcnt;
342 	cpu_ccfence();
343 	if (n & VM_REF_DELETED)
344 		n = -1;
345 	return n;
346 }
347 
348 void
349 vmspace_hold(struct vmspace *vm)
350 {
351 	atomic_add_int(&vm->vm_holdcnt, 1);
352 	lwkt_gettoken(&vm->vm_map.token);
353 }
354 
355 /*
356  * Drop with final termination interlock.
357  */
358 void
359 vmspace_drop(struct vmspace *vm)
360 {
361 	lwkt_reltoken(&vm->vm_map.token);
362 	vmspace_drop_notoken(vm);
363 }
364 
365 static void
366 vmspace_drop_notoken(struct vmspace *vm)
367 {
368 	if (atomic_fetchadd_int(&vm->vm_holdcnt, -1) == 1) {
369 		if (vm->vm_refcnt & VM_REF_DELETED)
370 			vmspace_terminate(vm, 1);
371 	}
372 }
373 
374 /*
375  * A vmspace object must not be in a terminated state to be able to obtain
376  * additional refs on it.
377  *
378  * These are official references to the vmspace, the count is used to check
379  * for vmspace sharing.  Foreign accessors should use 'hold' and not 'ref'.
380  *
381  * XXX we need to combine hold & ref together into one 64-bit field to allow
382  * holds to prevent stage-1 termination.
383  */
384 void
385 vmspace_ref(struct vmspace *vm)
386 {
387 	uint32_t n;
388 
389 	atomic_add_int(&vm->vm_holdcnt, 1);
390 	n = atomic_fetchadd_int(&vm->vm_refcnt, 1);
391 	KKASSERT((n & VM_REF_DELETED) == 0);
392 }
393 
394 /*
395  * Release a ref on the vmspace.  On the 1->0 transition we do stage-1
396  * termination of the vmspace.  Then, on the final drop of the hold we
397  * will do stage-2 final termination.
398  */
399 void
400 vmspace_rel(struct vmspace *vm)
401 {
402 	uint32_t n;
403 
404 	/*
405 	 * Drop refs.  Each ref also has a hold which is also dropped.
406 	 *
407 	 * When refs hits 0 compete to get the VM_REF_DELETED flag (hold
408 	 * prevent finalization) to start termination processing.
409 	 * Finalization occurs when the last hold count drops to 0.
410 	 */
411 	n = atomic_fetchadd_int(&vm->vm_refcnt, -1) - 1;
412 	while (n == 0) {
413 		if (atomic_cmpset_int(&vm->vm_refcnt, 0, VM_REF_DELETED)) {
414 			vmspace_terminate(vm, 0);
415 			break;
416 		}
417 		n = vm->vm_refcnt;
418 		cpu_ccfence();
419 	}
420 	vmspace_drop_notoken(vm);
421 }
422 
423 /*
424  * This is called during exit indicating that the vmspace is no
425  * longer in used by an exiting process, but the process has not yet
426  * been reaped.
427  *
428  * We drop refs, allowing for stage-1 termination, but maintain a holdcnt
429  * to prevent stage-2 until the process is reaped.  Note hte order of
430  * operation, we must hold first.
431  *
432  * No requirements.
433  */
434 void
435 vmspace_relexit(struct vmspace *vm)
436 {
437 	atomic_add_int(&vm->vm_holdcnt, 1);
438 	vmspace_rel(vm);
439 }
440 
441 /*
442  * Called during reap to disconnect the remainder of the vmspace from
443  * the process.  On the hold drop the vmspace termination is finalized.
444  *
445  * No requirements.
446  */
447 void
448 vmspace_exitfree(struct proc *p)
449 {
450 	struct vmspace *vm;
451 
452 	vm = p->p_vmspace;
453 	p->p_vmspace = NULL;
454 	vmspace_drop_notoken(vm);
455 }
456 
457 /*
458  * Called in two cases:
459  *
460  * (1) When the last refcnt is dropped and the vmspace becomes inactive,
461  *     called with final == 0.  refcnt will be (u_int)-1 at this point,
462  *     and holdcnt will still be non-zero.
463  *
464  * (2) When holdcnt becomes 0, called with final == 1.  There should no
465  *     longer be anyone with access to the vmspace.
466  *
467  * VMSPACE_EXIT1 flags the primary deactivation
468  * VMSPACE_EXIT2 flags the last reap
469  */
470 static void
471 vmspace_terminate(struct vmspace *vm, int final)
472 {
473 	int count;
474 
475 	lwkt_gettoken(&vm->vm_map.token);
476 	if (final == 0) {
477 		KKASSERT((vm->vm_flags & VMSPACE_EXIT1) == 0);
478 		vm->vm_flags |= VMSPACE_EXIT1;
479 
480 		/*
481 		 * Get rid of most of the resources.  Leave the kernel pmap
482 		 * intact.
483 		 *
484 		 * If the pmap does not contain wired pages we can bulk-delete
485 		 * the pmap as a performance optimization before removing the
486 		 * related mappings.
487 		 *
488 		 * If the pmap contains wired pages we cannot do this
489 		 * pre-optimization because currently vm_fault_unwire()
490 		 * expects the pmap pages to exist and will not decrement
491 		 * p->wire_count if they do not.
492 		 */
493 		shmexit(vm);
494 		if (vmspace_pmap(vm)->pm_stats.wired_count) {
495 			vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
496 				      VM_MAX_USER_ADDRESS);
497 			pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
498 					  VM_MAX_USER_ADDRESS);
499 		} else {
500 			pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
501 					  VM_MAX_USER_ADDRESS);
502 			vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
503 				      VM_MAX_USER_ADDRESS);
504 		}
505 		lwkt_reltoken(&vm->vm_map.token);
506 	} else {
507 		KKASSERT((vm->vm_flags & VMSPACE_EXIT1) != 0);
508 		KKASSERT((vm->vm_flags & VMSPACE_EXIT2) == 0);
509 
510 		/*
511 		 * Get rid of remaining basic resources.
512 		 */
513 		vm->vm_flags |= VMSPACE_EXIT2;
514 		shmexit(vm);
515 
516 		count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
517 		vm_map_lock(&vm->vm_map);
518 		cpu_vmspace_free(vm);
519 
520 		/*
521 		 * Lock the map, to wait out all other references to it.
522 		 * Delete all of the mappings and pages they hold, then call
523 		 * the pmap module to reclaim anything left.
524 		 */
525 		vm_map_delete(&vm->vm_map,
526 			      vm_map_min(&vm->vm_map),
527 			      vm_map_max(&vm->vm_map),
528 			      &count);
529 		vm_map_unlock(&vm->vm_map);
530 		vm_map_entry_release(count);
531 
532 		pmap_release(vmspace_pmap(vm));
533 		lwkt_reltoken(&vm->vm_map.token);
534 		objcache_put(vmspace_cache, vm);
535 	}
536 }
537 
538 /*
539  * Swap useage is determined by taking the proportional swap used by
540  * VM objects backing the VM map.  To make up for fractional losses,
541  * if the VM object has any swap use at all the associated map entries
542  * count for at least 1 swap page.
543  *
544  * No requirements.
545  */
546 vm_offset_t
547 vmspace_swap_count(struct vmspace *vm)
548 {
549 	vm_map_t map = &vm->vm_map;
550 	vm_map_entry_t cur;
551 	vm_object_t object;
552 	vm_offset_t count = 0;
553 	vm_offset_t n;
554 
555 	vmspace_hold(vm);
556 
557 	RB_FOREACH(cur, vm_map_rb_tree, &map->rb_root) {
558 		switch(cur->maptype) {
559 		case VM_MAPTYPE_NORMAL:
560 			if ((object = cur->ba.object) == NULL)
561 				break;
562 			if (object->swblock_count) {
563 				n = (cur->ba.end - cur->ba.start) / PAGE_SIZE;
564 				count += object->swblock_count *
565 				    SWAP_META_PAGES * n / object->size + 1;
566 			}
567 			break;
568 		default:
569 			break;
570 		}
571 	}
572 	vmspace_drop(vm);
573 
574 	return(count);
575 }
576 
577 /*
578  * Calculate the approximate number of anonymous pages in use by
579  * this vmspace.  To make up for fractional losses, we count each
580  * VM object as having at least 1 anonymous page.
581  *
582  * No requirements.
583  */
584 vm_offset_t
585 vmspace_anonymous_count(struct vmspace *vm)
586 {
587 	vm_map_t map = &vm->vm_map;
588 	vm_map_entry_t cur;
589 	vm_object_t object;
590 	vm_offset_t count = 0;
591 
592 	vmspace_hold(vm);
593 	RB_FOREACH(cur, vm_map_rb_tree, &map->rb_root) {
594 		switch(cur->maptype) {
595 		case VM_MAPTYPE_NORMAL:
596 			if ((object = cur->ba.object) == NULL)
597 				break;
598 			if (object->type != OBJT_DEFAULT &&
599 			    object->type != OBJT_SWAP) {
600 				break;
601 			}
602 			count += object->resident_page_count;
603 			break;
604 		default:
605 			break;
606 		}
607 	}
608 	vmspace_drop(vm);
609 
610 	return(count);
611 }
612 
613 /*
614  * Initialize an existing vm_map structure such as that in the vmspace
615  * structure.  The pmap is initialized elsewhere.
616  *
617  * No requirements.
618  */
619 void
620 vm_map_init(struct vm_map *map, vm_offset_t min_addr, vm_offset_t max_addr,
621 	    pmap_t pmap)
622 {
623 	RB_INIT(&map->rb_root);
624 	spin_init(&map->ilock_spin, "ilock");
625 	map->ilock_base = NULL;
626 	map->nentries = 0;
627 	map->size = 0;
628 	map->system_map = 0;
629 	vm_map_min(map) = min_addr;
630 	vm_map_max(map) = max_addr;
631 	map->pmap = pmap;
632 	map->timestamp = 0;
633 	map->flags = 0;
634 	bzero(&map->freehint, sizeof(map->freehint));
635 	lwkt_token_init(&map->token, "vm_map");
636 	lockinit(&map->lock, "vm_maplk", (hz + 9) / 10, 0);
637 }
638 
639 /*
640  * Find the first possible free address for the specified request length.
641  * Returns 0 if we don't have one cached.
642  */
643 static
644 vm_offset_t
645 vm_map_freehint_find(vm_map_t map, vm_size_t length, vm_size_t align)
646 {
647 	vm_map_freehint_t *scan;
648 
649 	scan = &map->freehint[0];
650 	while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
651 		if (scan->length == length && scan->align == align)
652 			return(scan->start);
653 		++scan;
654 	}
655 	return 0;
656 }
657 
658 /*
659  * Unconditionally set the freehint.  Called by vm_map_findspace() after
660  * it finds an address.  This will help us iterate optimally on the next
661  * similar findspace.
662  */
663 static
664 void
665 vm_map_freehint_update(vm_map_t map, vm_offset_t start,
666 		       vm_size_t length, vm_size_t align)
667 {
668 	vm_map_freehint_t *scan;
669 
670 	scan = &map->freehint[0];
671 	while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
672 		if (scan->length == length && scan->align == align) {
673 			scan->start = start;
674 			return;
675 		}
676 		++scan;
677 	}
678 	scan = &map->freehint[map->freehint_newindex & VM_MAP_FFMASK];
679 	scan->start = start;
680 	scan->align = align;
681 	scan->length = length;
682 	++map->freehint_newindex;
683 }
684 
685 /*
686  * Update any existing freehints (for any alignment), for the hole we just
687  * added.
688  */
689 static
690 void
691 vm_map_freehint_hole(vm_map_t map, vm_offset_t start, vm_size_t length)
692 {
693 	vm_map_freehint_t *scan;
694 
695 	scan = &map->freehint[0];
696 	while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
697 		if (scan->length <= length && scan->start > start)
698 			scan->start = start;
699 		++scan;
700 	}
701 }
702 
703 /*
704  * This function handles MAP_ENTRY_NEEDS_COPY by inserting a fronting
705  * object in the entry for COW faults.
706  *
707  * The entire chain including entry->ba (prior to inserting the fronting
708  * object) essentially becomes set in stone... elements of it can be paged
709  * in or out, but cannot be further modified.
710  *
711  * NOTE: If we do not optimize the backing chain then a unique copy is not
712  *	 needed.  Note, however, that because portions of the chain are
713  *	 shared across pmaps we cannot make any changes to the vm_map_backing
714  *	 elements themselves.
715  *
716  * If the map segment is governed by a virtual page table then it is
717  * possible to address offsets beyond the mapped area.  Just allocate
718  * a maximally sized object for this case.
719  *
720  * If addref is non-zero an additional reference is added to the returned
721  * entry.  This mechanic exists because the additional reference might have
722  * to be added atomically and not after return to prevent a premature
723  * collapse.  XXX currently there is no collapse code.
724  *
725  * The vm_map must be exclusively locked.
726  * No other requirements.
727  */
728 static
729 void
730 vm_map_entry_shadow(vm_map_entry_t entry)
731 {
732 	vm_map_backing_t ba;
733 	vm_size_t length;
734 	vm_object_t source;
735 	vm_object_t result;
736 
737 	/*
738 	 * Number of bytes we have to shadow
739 	 */
740 	length = atop(entry->ba.end - entry->ba.start);
741 
742 	/*
743 	 * Don't create the new object if the old object isn't shared.
744 	 * This case occurs quite often when programs fork/exec/wait.
745 	 *
746 	 * Caller ensures source exists (all backing_ba's must have objects),
747 	 * typically indirectly by virtue of the NEEDS_COPY flag being set.
748 	 * We have a ref on source by virtue of the entry and do not need
749 	 * to lock it to do this test.
750 	 */
751 	source = entry->ba.object;
752 	KKASSERT(source);
753 
754 	if (source->type != OBJT_VNODE) {
755 		if (source->ref_count == 1 &&
756 		    source->handle == NULL &&
757 		    (source->type == OBJT_DEFAULT ||
758 		     source->type == OBJT_SWAP)) {
759 			goto done;
760 		}
761 	}
762 	ba = kmalloc(sizeof(*ba), M_MAP_BACKING, M_INTWAIT); /* copied later */
763 	vm_object_hold_shared(source);
764 
765 	/*
766 	 * Once it becomes part of a backing_ba chain it can wind up anywhere,
767 	 * drop the ONEMAPPING flag now.
768 	 */
769 	vm_object_clear_flag(source, OBJ_ONEMAPPING);
770 
771 	/*
772 	 * Allocate a new object with the given length.  The new object
773 	 * is returned referenced but we may have to add another one.
774 	 * If we are adding a second reference we must clear OBJ_ONEMAPPING.
775 	 * (typically because the caller is about to clone a vm_map_entry).
776 	 *
777 	 * The source object currently has an extra reference to prevent
778 	 * collapses into it while we mess with its shadow list, which
779 	 * we will remove later in this routine.
780 	 *
781 	 * The target object may require a second reference if asked for one
782 	 * by the caller.
783 	 */
784 	result = vm_object_allocate_hold(OBJT_DEFAULT, length);
785 	if (result == NULL)
786 		panic("vm_object_shadow: no object for shadowing");
787 
788 	/*
789 	 * The new object shadows the source object.
790 	 *
791 	 * Try to optimize the result object's page color when shadowing
792 	 * in order to maintain page coloring consistency in the combined
793 	 * shadowed object.
794 	 *
795 	 * The source object is moved to ba, retaining its existing ref-count.
796 	 * No additional ref is needed.
797 	 *
798 	 * SHADOWING IS NOT APPLICABLE TO OBJT_VNODE OBJECTS
799 	 */
800 	vm_map_backing_detach(entry, &entry->ba);
801 	*ba = entry->ba;		/* previous ba */
802 	entry->ba.object = result;	/* new ba (at head of entry) */
803 	entry->ba.backing_ba = ba;
804 	entry->ba.backing_count = ba->backing_count + 1;
805 	entry->ba.offset = 0;
806 
807 	/* cpu localization twist */
808 	result->pg_color = vm_quickcolor();
809 
810 	vm_map_backing_attach(entry, &entry->ba);
811 	vm_map_backing_attach(entry, ba);
812 
813 	/*
814 	 * Adjust the return storage.  Drop the ref on source before
815 	 * returning.
816 	 */
817 	vm_object_drop(result);
818 	vm_object_drop(source);
819 done:
820 	entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
821 }
822 
823 /*
824  * Allocate an object for a vm_map_entry.
825  *
826  * Object allocation for anonymous mappings is defered as long as possible.
827  * This function is called when we can defer no longer, generally when a map
828  * entry might be split or forked or takes a page fault.
829  *
830  * If the map segment is governed by a virtual page table then it is
831  * possible to address offsets beyond the mapped area.  Just allocate
832  * a maximally sized object for this case.
833  *
834  * The vm_map must be exclusively locked.
835  * No other requirements.
836  */
837 void
838 vm_map_entry_allocate_object(vm_map_entry_t entry)
839 {
840 	vm_object_t obj;
841 
842 	/*
843 	 * ba.offset is NOT cumulatively added in the backing_ba scan like
844 	 * it was in the old object chain, so we can assign whatever offset
845 	 * we like to the new object.
846 	 *
847 	 * For now assign a value of 0 to make debugging object sizes
848 	 * easier.
849 	 */
850 	entry->ba.offset = 0;
851 
852 	obj = vm_object_allocate(OBJT_DEFAULT,
853 				 atop(entry->ba.end - entry->ba.start) +
854 				 entry->ba.offset);
855 	entry->ba.object = obj;
856 	vm_map_backing_attach(entry, &entry->ba);
857 }
858 
859 /*
860  * Set an initial negative count so the first attempt to reserve
861  * space preloads a bunch of vm_map_entry's for this cpu.  Also
862  * pre-allocate 2 vm_map_entries which will be needed by zalloc() to
863  * map a new page for vm_map_entry structures.  SMP systems are
864  * particularly sensitive.
865  *
866  * This routine is called in early boot so we cannot just call
867  * vm_map_entry_reserve().
868  *
869  * Called from the low level boot code only (for each cpu)
870  *
871  * WARNING! Take care not to have too-big a static/BSS structure here
872  *	    as MAXCPU can be 256+, otherwise the loader's 64MB heap
873  *	    can get blown out by the kernel plus the initrd image.
874  */
875 void
876 vm_map_entry_reserve_cpu_init(globaldata_t gd)
877 {
878 	vm_map_entry_t entry;
879 	int count;
880 	int i;
881 
882 	atomic_add_int(&gd->gd_vme_avail, -MAP_RESERVE_COUNT * 2);
883 	if (gd->gd_cpuid == 0) {
884 		entry = &cpu_map_entry_init_bsp[0];
885 		count = MAPENTRYBSP_CACHE;
886 	} else {
887 		entry = &cpu_map_entry_init_ap[gd->gd_cpuid][0];
888 		count = MAPENTRYAP_CACHE;
889 	}
890 	for (i = 0; i < count; ++i, ++entry) {
891 		MAPENT_FREELIST(entry) = gd->gd_vme_base;
892 		gd->gd_vme_base = entry;
893 	}
894 }
895 
896 /*
897  * Reserves vm_map_entry structures so code later-on can manipulate
898  * map_entry structures within a locked map without blocking trying
899  * to allocate a new vm_map_entry.
900  *
901  * No requirements.
902  *
903  * WARNING!  We must not decrement gd_vme_avail until after we have
904  *	     ensured that sufficient entries exist, otherwise we can
905  *	     get into an endless call recursion in the zalloc code
906  *	     itself.
907  */
908 int
909 vm_map_entry_reserve(int count)
910 {
911 	struct globaldata *gd = mycpu;
912 	vm_map_entry_t entry;
913 
914 	/*
915 	 * Make sure we have enough structures in gd_vme_base to handle
916 	 * the reservation request.
917 	 *
918 	 * Use a critical section to protect against VM faults.  It might
919 	 * not be needed, but we have to be careful here.
920 	 */
921 	if (gd->gd_vme_avail < count) {
922 		crit_enter();
923 		while (gd->gd_vme_avail < count) {
924 			entry = zalloc(mapentzone);
925 			MAPENT_FREELIST(entry) = gd->gd_vme_base;
926 			gd->gd_vme_base = entry;
927 			atomic_add_int(&gd->gd_vme_avail, 1);
928 		}
929 		crit_exit();
930 	}
931 	atomic_add_int(&gd->gd_vme_avail, -count);
932 
933 	return(count);
934 }
935 
936 /*
937  * Releases previously reserved vm_map_entry structures that were not
938  * used.  If we have too much junk in our per-cpu cache clean some of
939  * it out.
940  *
941  * No requirements.
942  */
943 void
944 vm_map_entry_release(int count)
945 {
946 	struct globaldata *gd = mycpu;
947 	vm_map_entry_t entry;
948 	vm_map_entry_t efree;
949 
950 	count = atomic_fetchadd_int(&gd->gd_vme_avail, count) + count;
951 	if (gd->gd_vme_avail > MAP_RESERVE_SLOP) {
952 		efree = NULL;
953 		crit_enter();
954 		while (gd->gd_vme_avail > MAP_RESERVE_HYST) {
955 			entry = gd->gd_vme_base;
956 			KKASSERT(entry != NULL);
957 			gd->gd_vme_base = MAPENT_FREELIST(entry);
958 			atomic_add_int(&gd->gd_vme_avail, -1);
959 			MAPENT_FREELIST(entry) = efree;
960 			efree = entry;
961 		}
962 		crit_exit();
963 		while ((entry = efree) != NULL) {
964 			efree = MAPENT_FREELIST(efree);
965 			zfree(mapentzone, entry);
966 		}
967 	}
968 }
969 
970 /*
971  * Reserve map entry structures for use in kernel_map itself.  These
972  * entries have *ALREADY* been reserved on a per-cpu basis when the map
973  * was inited.  This function is used by zalloc() to avoid a recursion
974  * when zalloc() itself needs to allocate additional kernel memory.
975  *
976  * This function works like the normal reserve but does not load the
977  * vm_map_entry cache (because that would result in an infinite
978  * recursion).  Note that gd_vme_avail may go negative.  This is expected.
979  *
980  * Any caller of this function must be sure to renormalize after
981  * potentially eating entries to ensure that the reserve supply
982  * remains intact.
983  *
984  * No requirements.
985  */
986 int
987 vm_map_entry_kreserve(int count)
988 {
989 	struct globaldata *gd = mycpu;
990 
991 	atomic_add_int(&gd->gd_vme_avail, -count);
992 	KASSERT(gd->gd_vme_base != NULL,
993 		("no reserved entries left, gd_vme_avail = %d",
994 		gd->gd_vme_avail));
995 	return(count);
996 }
997 
998 /*
999  * Release previously reserved map entries for kernel_map.  We do not
1000  * attempt to clean up like the normal release function as this would
1001  * cause an unnecessary (but probably not fatal) deep procedure call.
1002  *
1003  * No requirements.
1004  */
1005 void
1006 vm_map_entry_krelease(int count)
1007 {
1008 	struct globaldata *gd = mycpu;
1009 
1010 	atomic_add_int(&gd->gd_vme_avail, count);
1011 }
1012 
1013 /*
1014  * Allocates a VM map entry for insertion.  No entry fields are filled in.
1015  *
1016  * The entries should have previously been reserved.  The reservation count
1017  * is tracked in (*countp).
1018  *
1019  * No requirements.
1020  */
1021 static vm_map_entry_t
1022 vm_map_entry_create(int *countp)
1023 {
1024 	struct globaldata *gd = mycpu;
1025 	vm_map_entry_t entry;
1026 
1027 	KKASSERT(*countp > 0);
1028 	--*countp;
1029 	crit_enter();
1030 	entry = gd->gd_vme_base;
1031 	KASSERT(entry != NULL, ("gd_vme_base NULL! count %d", *countp));
1032 	gd->gd_vme_base = MAPENT_FREELIST(entry);
1033 	crit_exit();
1034 
1035 	return(entry);
1036 }
1037 
1038 /*
1039  * Attach and detach backing store elements
1040  */
1041 static void
1042 vm_map_backing_attach(vm_map_entry_t entry, vm_map_backing_t ba)
1043 {
1044 	vm_object_t obj;
1045 
1046 	switch(entry->maptype) {
1047 	case VM_MAPTYPE_NORMAL:
1048 		obj = ba->object;
1049 		lockmgr(&obj->backing_lk, LK_EXCLUSIVE);
1050 		TAILQ_INSERT_TAIL(&obj->backing_list, ba, entry);
1051 		lockmgr(&obj->backing_lk, LK_RELEASE);
1052 		break;
1053 	case VM_MAPTYPE_UKSMAP:
1054 		ba->uksmap(ba, UKSMAPOP_ADD, entry->aux.dev, NULL);
1055 		break;
1056 	}
1057 }
1058 
1059 static void
1060 vm_map_backing_detach(vm_map_entry_t entry, vm_map_backing_t ba)
1061 {
1062 	vm_object_t obj;
1063 
1064 	switch(entry->maptype) {
1065 	case VM_MAPTYPE_NORMAL:
1066 		obj = ba->object;
1067 		lockmgr(&obj->backing_lk, LK_EXCLUSIVE);
1068 		TAILQ_REMOVE(&obj->backing_list, ba, entry);
1069 		lockmgr(&obj->backing_lk, LK_RELEASE);
1070 		break;
1071 	case VM_MAPTYPE_UKSMAP:
1072 		ba->uksmap(ba, UKSMAPOP_REM, entry->aux.dev, NULL);
1073 		break;
1074 	}
1075 }
1076 
1077 /*
1078  * Dispose of the dynamically allocated backing_ba chain associated
1079  * with a vm_map_entry.
1080  *
1081  * We decrement the (possibly shared) element and kfree() on the
1082  * 1->0 transition.  We only iterate to the next backing_ba when
1083  * the previous one went through a 1->0 transition.
1084  *
1085  * These can only be normal vm_object based backings.
1086  */
1087 static void
1088 vm_map_entry_dispose_ba(vm_map_entry_t entry, vm_map_backing_t ba)
1089 {
1090 	vm_map_backing_t next;
1091 
1092 	while (ba) {
1093 		if (ba->map_object) {
1094 			vm_map_backing_detach(entry, ba);
1095 			vm_object_deallocate(ba->object);
1096 		}
1097 		next = ba->backing_ba;
1098 		kfree(ba, M_MAP_BACKING);
1099 		ba = next;
1100 	}
1101 }
1102 
1103 /*
1104  * Dispose of a vm_map_entry that is no longer being referenced.
1105  *
1106  * No requirements.
1107  */
1108 static void
1109 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry, int *countp)
1110 {
1111 	struct globaldata *gd = mycpu;
1112 
1113 	/*
1114 	 * Dispose of the base object and the backing link.
1115 	 */
1116 	switch(entry->maptype) {
1117 	case VM_MAPTYPE_NORMAL:
1118 		if (entry->ba.map_object) {
1119 			vm_map_backing_detach(entry, &entry->ba);
1120 			vm_object_deallocate(entry->ba.object);
1121 		}
1122 		break;
1123 	case VM_MAPTYPE_SUBMAP:
1124 		break;
1125 	case VM_MAPTYPE_UKSMAP:
1126 		vm_map_backing_detach(entry, &entry->ba);
1127 		break;
1128 	default:
1129 		break;
1130 	}
1131 	vm_map_entry_dispose_ba(entry, entry->ba.backing_ba);
1132 
1133 	/*
1134 	 * Cleanup for safety.
1135 	 */
1136 	entry->ba.backing_ba = NULL;
1137 	entry->ba.object = NULL;
1138 	entry->ba.offset = 0;
1139 
1140 	++*countp;
1141 	crit_enter();
1142 	MAPENT_FREELIST(entry) = gd->gd_vme_base;
1143 	gd->gd_vme_base = entry;
1144 	crit_exit();
1145 }
1146 
1147 
1148 /*
1149  * Insert/remove entries from maps.
1150  *
1151  * The related map must be exclusively locked.
1152  * The caller must hold map->token
1153  * No other requirements.
1154  */
1155 static __inline void
1156 vm_map_entry_link(vm_map_t map, vm_map_entry_t entry)
1157 {
1158 	ASSERT_VM_MAP_LOCKED(map);
1159 
1160 	map->nentries++;
1161 	if (vm_map_rb_tree_RB_INSERT(&map->rb_root, entry))
1162 		panic("vm_map_entry_link: dup addr map %p ent %p", map, entry);
1163 }
1164 
1165 static __inline void
1166 vm_map_entry_unlink(vm_map_t map,
1167 		    vm_map_entry_t entry)
1168 {
1169 	ASSERT_VM_MAP_LOCKED(map);
1170 
1171 	if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1172 		panic("vm_map_entry_unlink: attempt to mess with "
1173 		      "locked entry! %p", entry);
1174 	}
1175 	vm_map_rb_tree_RB_REMOVE(&map->rb_root, entry);
1176 	map->nentries--;
1177 }
1178 
1179 /*
1180  * Finds the map entry containing (or immediately preceding) the specified
1181  * address in the given map.  The entry is returned in (*entry).
1182  *
1183  * The boolean result indicates whether the address is actually contained
1184  * in the map.
1185  *
1186  * The related map must be locked.
1187  * No other requirements.
1188  */
1189 boolean_t
1190 vm_map_lookup_entry(vm_map_t map, vm_offset_t address, vm_map_entry_t *entry)
1191 {
1192 	vm_map_entry_t tmp;
1193 	vm_map_entry_t last;
1194 
1195 	ASSERT_VM_MAP_LOCKED(map);
1196 
1197 	/*
1198 	 * Locate the record from the top of the tree.  'last' tracks the
1199 	 * closest prior record and is returned if no match is found, which
1200 	 * in binary tree terms means tracking the most recent right-branch
1201 	 * taken.  If there is no prior record, *entry is set to NULL.
1202 	 */
1203 	last = NULL;
1204 	tmp = RB_ROOT(&map->rb_root);
1205 
1206 	while (tmp) {
1207 		if (address >= tmp->ba.start) {
1208 			if (address < tmp->ba.end) {
1209 				*entry = tmp;
1210 				return(TRUE);
1211 			}
1212 			last = tmp;
1213 			tmp = RB_RIGHT(tmp, rb_entry);
1214 		} else {
1215 			tmp = RB_LEFT(tmp, rb_entry);
1216 		}
1217 	}
1218 	*entry = last;
1219 	return (FALSE);
1220 }
1221 
1222 /*
1223  * Inserts the given whole VM object into the target map at the specified
1224  * address range.  The object's size should match that of the address range.
1225  *
1226  * The map must be exclusively locked.
1227  * The object must be held.
1228  * The caller must have reserved sufficient vm_map_entry structures.
1229  *
1230  * If object is non-NULL, ref count must be bumped by caller prior to
1231  * making call to account for the new entry.  XXX API is a bit messy.
1232  */
1233 int
1234 vm_map_insert(vm_map_t map, int *countp,
1235 	      void *map_object, void *map_aux,
1236 	      vm_ooffset_t offset, void *aux_info,
1237 	      vm_offset_t start, vm_offset_t end,
1238 	      vm_maptype_t maptype, vm_subsys_t id,
1239 	      vm_prot_t prot, vm_prot_t max, int cow)
1240 {
1241 	vm_map_entry_t new_entry;
1242 	vm_map_entry_t prev_entry;
1243 	vm_map_entry_t next;
1244 	vm_map_entry_t temp_entry;
1245 	vm_eflags_t protoeflags;
1246 	vm_object_t object;
1247 	int must_drop = 0;
1248 
1249 	if (maptype == VM_MAPTYPE_UKSMAP)
1250 		object = NULL;
1251 	else
1252 		object = map_object;
1253 
1254 	ASSERT_VM_MAP_LOCKED(map);
1255 	if (object)
1256 		ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1257 
1258 	/*
1259 	 * Check that the start and end points are not bogus.
1260 	 */
1261 	if ((start < vm_map_min(map)) || (end > vm_map_max(map)) ||
1262 	    (start >= end)) {
1263 		return (KERN_INVALID_ADDRESS);
1264 	}
1265 
1266 	/*
1267 	 * Find the entry prior to the proposed starting address; if it's part
1268 	 * of an existing entry, this range is bogus.
1269 	 */
1270 	if (vm_map_lookup_entry(map, start, &temp_entry))
1271 		return (KERN_NO_SPACE);
1272 	prev_entry = temp_entry;
1273 
1274 	/*
1275 	 * Assert that the next entry doesn't overlap the end point.
1276 	 */
1277 	if (prev_entry)
1278 		next = vm_map_rb_tree_RB_NEXT(prev_entry);
1279 	else
1280 		next = RB_MIN(vm_map_rb_tree, &map->rb_root);
1281 	if (next && next->ba.start < end)
1282 		return (KERN_NO_SPACE);
1283 
1284 	protoeflags = 0;
1285 
1286 	if (cow & MAP_COPY_ON_WRITE)
1287 		protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY;
1288 
1289 	if (cow & MAP_NOFAULT) {
1290 		protoeflags |= MAP_ENTRY_NOFAULT;
1291 
1292 		KASSERT(object == NULL,
1293 			("vm_map_insert: paradoxical MAP_NOFAULT request"));
1294 	}
1295 	if (cow & MAP_DISABLE_SYNCER)
1296 		protoeflags |= MAP_ENTRY_NOSYNC;
1297 	if (cow & MAP_DISABLE_COREDUMP)
1298 		protoeflags |= MAP_ENTRY_NOCOREDUMP;
1299 	if (cow & MAP_IS_STACK)
1300 		protoeflags |= MAP_ENTRY_STACK;
1301 	if (cow & MAP_IS_KSTACK)
1302 		protoeflags |= MAP_ENTRY_KSTACK;
1303 
1304 	lwkt_gettoken(&map->token);
1305 
1306 	if (object) {
1307 		;
1308 	} else if (prev_entry &&
1309 		 (prev_entry->eflags == protoeflags) &&
1310 		 (prev_entry->ba.end == start) &&
1311 		 (prev_entry->wired_count == 0) &&
1312 		 (prev_entry->id == id) &&
1313 		 prev_entry->maptype == maptype &&
1314 		 maptype == VM_MAPTYPE_NORMAL &&
1315 		 prev_entry->ba.backing_ba == NULL &&	/* not backed */
1316 		 ((prev_entry->ba.object == NULL) ||
1317 		  vm_object_coalesce(prev_entry->ba.object,
1318 				     OFF_TO_IDX(prev_entry->ba.offset),
1319 				     (vm_size_t)(prev_entry->ba.end - prev_entry->ba.start),
1320 				     (vm_size_t)(end - prev_entry->ba.end)))) {
1321 		/*
1322 		 * We were able to extend the object.  Determine if we
1323 		 * can extend the previous map entry to include the
1324 		 * new range as well.
1325 		 */
1326 		if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) &&
1327 		    (prev_entry->protection == prot) &&
1328 		    (prev_entry->max_protection == max)) {
1329 			map->size += (end - prev_entry->ba.end);
1330 			vm_map_backing_adjust_end(prev_entry, end);
1331 			vm_map_simplify_entry(map, prev_entry, countp);
1332 			lwkt_reltoken(&map->token);
1333 			return (KERN_SUCCESS);
1334 		}
1335 
1336 		/*
1337 		 * If we can extend the object but cannot extend the
1338 		 * map entry, we have to create a new map entry.  We
1339 		 * must bump the ref count on the extended object to
1340 		 * account for it.  object may be NULL.
1341 		 */
1342 		object = prev_entry->ba.object;
1343 		offset = prev_entry->ba.offset +
1344 			(prev_entry->ba.end - prev_entry->ba.start);
1345 		if (object) {
1346 			vm_object_hold(object);
1347 			vm_object_lock_swap(); /* map->token order */
1348 			vm_object_reference_locked(object);
1349 			map_object = object;
1350 			must_drop = 1;
1351 		}
1352 	}
1353 
1354 	/*
1355 	 * NOTE: if conditionals fail, object can be NULL here.  This occurs
1356 	 * in things like the buffer map where we manage kva but do not manage
1357 	 * backing objects.
1358 	 */
1359 
1360 	/*
1361 	 * Create a new entry
1362 	 */
1363 	new_entry = vm_map_entry_create(countp);
1364 	new_entry->ba.pmap = map->pmap;
1365 	new_entry->ba.start = start;
1366 	new_entry->ba.end = end;
1367 	new_entry->id = id;
1368 
1369 	new_entry->maptype = maptype;
1370 	new_entry->eflags = protoeflags;
1371 	new_entry->aux.master_pde = 0;		/* in case size is different */
1372 	new_entry->aux.map_aux = map_aux;
1373 	new_entry->ba.map_object = map_object;
1374 	new_entry->ba.backing_ba = NULL;
1375 	new_entry->ba.backing_count = 0;
1376 	new_entry->ba.offset = offset;
1377 	new_entry->ba.aux_info = aux_info;
1378 	new_entry->ba.flags = 0;
1379 	new_entry->ba.pmap = map->pmap;
1380 
1381 	new_entry->inheritance = VM_INHERIT_DEFAULT;
1382 	new_entry->protection = prot;
1383 	new_entry->max_protection = max;
1384 	new_entry->wired_count = 0;
1385 
1386 	/*
1387 	 * Insert the new entry into the list
1388 	 */
1389 	vm_map_backing_replicated(map, new_entry, MAP_BACK_BASEOBJREFD);
1390 	vm_map_entry_link(map, new_entry);
1391 	map->size += new_entry->ba.end - new_entry->ba.start;
1392 
1393 	/*
1394 	 * Don't worry about updating freehint[] when inserting, allow
1395 	 * addresses to be lower than the actual first free spot.
1396 	 */
1397 #if 0
1398 	/*
1399 	 * Temporarily removed to avoid MAP_STACK panic, due to
1400 	 * MAP_STACK being a huge hack.  Will be added back in
1401 	 * when MAP_STACK (and the user stack mapping) is fixed.
1402 	 */
1403 	/*
1404 	 * It may be possible to simplify the entry
1405 	 */
1406 	vm_map_simplify_entry(map, new_entry, countp);
1407 #endif
1408 
1409 	/*
1410 	 * Try to pre-populate the page table.  Mappings governed by virtual
1411 	 * page tables cannot be prepopulated without a lot of work, so
1412 	 * don't try.
1413 	 */
1414 	if ((cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) &&
1415 	    maptype != VM_MAPTYPE_UKSMAP) {
1416 		int dorelock = 0;
1417 		if (vm_map_relock_enable && (cow & MAP_PREFAULT_RELOCK)) {
1418 			dorelock = 1;
1419 			vm_object_lock_swap();
1420 			vm_object_drop(object);
1421 		}
1422 		pmap_object_init_pt(map->pmap, new_entry,
1423 				    new_entry->ba.start,
1424 				    new_entry->ba.end - new_entry->ba.start,
1425 				    cow & MAP_PREFAULT_PARTIAL);
1426 		if (dorelock) {
1427 			vm_object_hold(object);
1428 			vm_object_lock_swap();
1429 		}
1430 	}
1431 	lwkt_reltoken(&map->token);
1432 	if (must_drop)
1433 		vm_object_drop(object);
1434 
1435 	return (KERN_SUCCESS);
1436 }
1437 
1438 /*
1439  * Find sufficient space for `length' bytes in the given map, starting at
1440  * `start'.  Returns 0 on success, 1 on no space.
1441  *
1442  * This function will returned an arbitrarily aligned pointer.  If no
1443  * particular alignment is required you should pass align as 1.  Note that
1444  * the map may return PAGE_SIZE aligned pointers if all the lengths used in
1445  * the map are a multiple of PAGE_SIZE, even if you pass a smaller align
1446  * argument.
1447  *
1448  * 'align' should be a power of 2 but is not required to be.
1449  *
1450  * The map must be exclusively locked.
1451  * No other requirements.
1452  */
1453 int
1454 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length,
1455 		 vm_size_t align, int flags, vm_offset_t *addr)
1456 {
1457 	vm_map_entry_t entry;
1458 	vm_map_entry_t tmp;
1459 	vm_offset_t hole_start;
1460 	vm_offset_t end;
1461 	vm_offset_t align_mask;
1462 
1463 	if (start < vm_map_min(map))
1464 		start = vm_map_min(map);
1465 	if (start > vm_map_max(map))
1466 		return (1);
1467 
1468 	/*
1469 	 * If the alignment is not a power of 2 we will have to use
1470 	 * a mod/division, set align_mask to a special value.
1471 	 */
1472 	if ((align | (align - 1)) + 1 != (align << 1))
1473 		align_mask = (vm_offset_t)-1;
1474 	else
1475 		align_mask = align - 1;
1476 
1477 	/*
1478 	 * Use freehint to adjust the start point, hopefully reducing
1479 	 * the iteration to O(1).
1480 	 */
1481 	hole_start = vm_map_freehint_find(map, length, align);
1482 	if (start < hole_start)
1483 		start = hole_start;
1484 	if (vm_map_lookup_entry(map, start, &tmp))
1485 		start = tmp->ba.end;
1486 	entry = tmp;	/* may be NULL */
1487 
1488 	/*
1489 	 * Look through the rest of the map, trying to fit a new region in the
1490 	 * gap between existing regions, or after the very last region.
1491 	 */
1492 	for (;;) {
1493 		/*
1494 		 * Adjust the proposed start by the requested alignment,
1495 		 * be sure that we didn't wrap the address.
1496 		 */
1497 		if (align_mask == (vm_offset_t)-1)
1498 			end = roundup(start, align);
1499 		else
1500 			end = (start + align_mask) & ~align_mask;
1501 		if (end < start)
1502 			return (1);
1503 		start = end;
1504 
1505 		/*
1506 		 * Find the end of the proposed new region.  Be sure we didn't
1507 		 * go beyond the end of the map, or wrap around the address.
1508 		 * Then check to see if this is the last entry or if the
1509 		 * proposed end fits in the gap between this and the next
1510 		 * entry.
1511 		 */
1512 		end = start + length;
1513 		if (end > vm_map_max(map) || end < start)
1514 			return (1);
1515 
1516 		/*
1517 		 * Locate the next entry, we can stop if this is the
1518 		 * last entry (we know we are in-bounds so that would
1519 		 * be a sucess).
1520 		 */
1521 		if (entry)
1522 			entry = vm_map_rb_tree_RB_NEXT(entry);
1523 		else
1524 			entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
1525 		if (entry == NULL)
1526 			break;
1527 
1528 		/*
1529 		 * Determine if the proposed area would overlap the
1530 		 * next entry.
1531 		 *
1532 		 * When matching against a STACK entry, only allow the
1533 		 * memory map to intrude on the ungrown portion of the
1534 		 * STACK entry when MAP_TRYFIXED is set.
1535 		 */
1536 		if (entry->ba.start >= end) {
1537 			if ((entry->eflags & MAP_ENTRY_STACK) == 0)
1538 				break;
1539 			if (flags & MAP_TRYFIXED)
1540 				break;
1541 			if (entry->ba.start - entry->aux.avail_ssize >= end)
1542 				break;
1543 		}
1544 		start = entry->ba.end;
1545 	}
1546 
1547 	/*
1548 	 * Update the freehint
1549 	 */
1550 	vm_map_freehint_update(map, start, length, align);
1551 
1552 	/*
1553 	 * Grow the kernel_map if necessary.  pmap_growkernel() will panic
1554 	 * if it fails.  The kernel_map is locked and nothing can steal
1555 	 * our address space if pmap_growkernel() blocks.
1556 	 *
1557 	 * NOTE: This may be unconditionally called for kldload areas on
1558 	 *	 x86_64 because these do not bump kernel_vm_end (which would
1559 	 *	 fill 128G worth of page tables!).  Therefore we must not
1560 	 *	 retry.
1561 	 */
1562 	if (map == &kernel_map) {
1563 		vm_offset_t kstop;
1564 
1565 		kstop = round_page(start + length);
1566 		if (kstop > kernel_vm_end)
1567 			pmap_growkernel(start, kstop);
1568 	}
1569 	*addr = start;
1570 	return (0);
1571 }
1572 
1573 /*
1574  * vm_map_find finds an unallocated region in the target address map with
1575  * the given length and allocates it.  The search is defined to be first-fit
1576  * from the specified address; the region found is returned in the same
1577  * parameter.
1578  *
1579  * If object is non-NULL, ref count must be bumped by caller
1580  * prior to making call to account for the new entry.
1581  *
1582  * No requirements.  This function will lock the map temporarily.
1583  */
1584 int
1585 vm_map_find(vm_map_t map, void *map_object, void *map_aux,
1586 	    vm_ooffset_t offset, vm_offset_t *addr,
1587 	    vm_size_t length, vm_size_t align, boolean_t fitit,
1588 	    vm_maptype_t maptype, vm_subsys_t id,
1589 	    vm_prot_t prot, vm_prot_t max, int cow)
1590 {
1591 	vm_offset_t start;
1592 	vm_object_t object;
1593 	void *aux_info;
1594 	int result;
1595 	int count;
1596 
1597 	/*
1598 	 * Certain UKSMAPs may need aux_info.
1599 	 *
1600 	 * (map_object is the callback function, aux_info is the process
1601 	 *  or thread, if necessary).
1602 	 */
1603 	aux_info = NULL;
1604 	if (maptype == VM_MAPTYPE_UKSMAP) {
1605 		KKASSERT(map_aux != NULL && map_object != NULL);
1606 
1607 		switch(minor(((struct cdev *)map_aux))) {
1608 		case 5:
1609 			/*
1610 			 * /dev/upmap
1611 			 */
1612 			aux_info = curproc;
1613 			break;
1614 		case 6:
1615 			/*
1616 			 * /dev/kpmap
1617 			 */
1618 			break;
1619 		case 7:
1620 			/*
1621 			 * /dev/lpmap
1622 			 */
1623 			aux_info = curthread->td_lwp;
1624 			break;
1625 		}
1626 		object = NULL;
1627 	} else {
1628 		object = map_object;
1629 	}
1630 
1631 	start = *addr;
1632 
1633 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1634 	vm_map_lock(map);
1635 	if (object)
1636 		vm_object_hold_shared(object);
1637 	if (fitit) {
1638 		if (vm_map_findspace(map, start, length, align, 0, addr)) {
1639 			if (object)
1640 				vm_object_drop(object);
1641 			vm_map_unlock(map);
1642 			vm_map_entry_release(count);
1643 			return (KERN_NO_SPACE);
1644 		}
1645 		start = *addr;
1646 	}
1647 	result = vm_map_insert(map, &count,
1648 			       map_object, map_aux,
1649 			       offset, aux_info,
1650 			       start, start + length,
1651 			       maptype, id, prot, max, cow);
1652 	if (object)
1653 		vm_object_drop(object);
1654 	vm_map_unlock(map);
1655 	vm_map_entry_release(count);
1656 
1657 	return (result);
1658 }
1659 
1660 /*
1661  * Simplify the given map entry by merging with either neighbor.  This
1662  * routine also has the ability to merge with both neighbors.
1663  *
1664  * This routine guarentees that the passed entry remains valid (though
1665  * possibly extended).  When merging, this routine may delete one or
1666  * both neighbors.  No action is taken on entries which have their
1667  * in-transition flag set.
1668  *
1669  * The map must be exclusively locked.
1670  */
1671 void
1672 vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry, int *countp)
1673 {
1674 	vm_map_entry_t next, prev;
1675 	vm_size_t prevsize, esize;
1676 
1677 	if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1678 		++mycpu->gd_cnt.v_intrans_coll;
1679 		return;
1680 	}
1681 
1682 	if (entry->maptype == VM_MAPTYPE_SUBMAP)
1683 		return;
1684 	if (entry->maptype == VM_MAPTYPE_UKSMAP)
1685 		return;
1686 
1687 	prev = vm_map_rb_tree_RB_PREV(entry);
1688 	if (prev) {
1689 		prevsize = prev->ba.end - prev->ba.start;
1690 		if ( (prev->ba.end == entry->ba.start) &&
1691 		     (prev->maptype == entry->maptype) &&
1692 		     (prev->ba.object == entry->ba.object) &&
1693 		     (prev->ba.backing_ba == entry->ba.backing_ba) &&
1694 		     (!prev->ba.object ||
1695 			(prev->ba.offset + prevsize == entry->ba.offset)) &&
1696 		     (prev->eflags == entry->eflags) &&
1697 		     (prev->protection == entry->protection) &&
1698 		     (prev->max_protection == entry->max_protection) &&
1699 		     (prev->inheritance == entry->inheritance) &&
1700 		     (prev->id == entry->id) &&
1701 		     (prev->wired_count == entry->wired_count)) {
1702 			/*
1703 			 * NOTE: order important.  Unlink before gumming up
1704 			 *	 the RBTREE w/adjust, adjust before disposal
1705 			 *	 of prior entry, to avoid pmap snafus.
1706 			 */
1707 			vm_map_entry_unlink(map, prev);
1708 			vm_map_backing_adjust_start(entry, prev->ba.start);
1709 			if (entry->ba.object == NULL)
1710 				entry->ba.offset = 0;
1711 			vm_map_entry_dispose(map, prev, countp);
1712 		}
1713 	}
1714 
1715 	next = vm_map_rb_tree_RB_NEXT(entry);
1716 	if (next) {
1717 		esize = entry->ba.end - entry->ba.start;
1718 		if ((entry->ba.end == next->ba.start) &&
1719 		    (next->maptype == entry->maptype) &&
1720 		    (next->ba.object == entry->ba.object) &&
1721 		     (prev->ba.backing_ba == entry->ba.backing_ba) &&
1722 		     (!entry->ba.object ||
1723 			(entry->ba.offset + esize == next->ba.offset)) &&
1724 		    (next->eflags == entry->eflags) &&
1725 		    (next->protection == entry->protection) &&
1726 		    (next->max_protection == entry->max_protection) &&
1727 		    (next->inheritance == entry->inheritance) &&
1728 		    (next->id == entry->id) &&
1729 		    (next->wired_count == entry->wired_count)) {
1730 			/*
1731 			 * NOTE: order important.  Unlink before gumming up
1732 			 *	 the RBTREE w/adjust, adjust before disposal
1733 			 *	 of prior entry, to avoid pmap snafus.
1734 			 */
1735 			vm_map_entry_unlink(map, next);
1736 			vm_map_backing_adjust_end(entry, next->ba.end);
1737 			vm_map_entry_dispose(map, next, countp);
1738 	        }
1739 	}
1740 }
1741 
1742 /*
1743  * Asserts that the given entry begins at or after the specified address.
1744  * If necessary, it splits the entry into two.
1745  */
1746 #define vm_map_clip_start(map, entry, startaddr, countp)		\
1747 {									\
1748 	if (startaddr > entry->ba.start)				\
1749 		_vm_map_clip_start(map, entry, startaddr, countp);	\
1750 }
1751 
1752 /*
1753  * This routine is called only when it is known that the entry must be split.
1754  *
1755  * The map must be exclusively locked.
1756  */
1757 static void
1758 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start,
1759 		   int *countp)
1760 {
1761 	vm_map_entry_t new_entry;
1762 
1763 	/*
1764 	 * Split off the front portion -- note that we must insert the new
1765 	 * entry BEFORE this one, so that this entry has the specified
1766 	 * starting address.
1767 	 */
1768 
1769 	vm_map_simplify_entry(map, entry, countp);
1770 
1771 	/*
1772 	 * If there is no object backing this entry, we might as well create
1773 	 * one now.  If we defer it, an object can get created after the map
1774 	 * is clipped, and individual objects will be created for the split-up
1775 	 * map.  This is a bit of a hack, but is also about the best place to
1776 	 * put this improvement.
1777 	 */
1778 	if (entry->ba.object == NULL && !map->system_map &&
1779 	    VM_MAP_ENTRY_WITHIN_PARTITION(entry)) {
1780 		vm_map_entry_allocate_object(entry);
1781 	}
1782 
1783 	/*
1784 	 * NOTE: The replicated function will adjust start, end, and offset
1785 	 *	 for the remainder of the backing_ba linkages.  We must fixup
1786 	 *	 the embedded ba.
1787 	 */
1788 	new_entry = vm_map_entry_create(countp);
1789 	*new_entry = *entry;
1790 	new_entry->ba.end = start;
1791 
1792 	/*
1793 	 * Ordering is important, make sure the new entry is replicated
1794 	 * before we cut the exiting entry.
1795 	 */
1796 	vm_map_backing_replicated(map, new_entry, MAP_BACK_CLIPPED);
1797 	vm_map_backing_adjust_start(entry, start);
1798 	vm_map_entry_link(map, new_entry);
1799 }
1800 
1801 /*
1802  * Asserts that the given entry ends at or before the specified address.
1803  * If necessary, it splits the entry into two.
1804  *
1805  * The map must be exclusively locked.
1806  */
1807 #define vm_map_clip_end(map, entry, endaddr, countp)		\
1808 {								\
1809 	if (endaddr < entry->ba.end)				\
1810 		_vm_map_clip_end(map, entry, endaddr, countp);	\
1811 }
1812 
1813 /*
1814  * This routine is called only when it is known that the entry must be split.
1815  *
1816  * The map must be exclusively locked.
1817  */
1818 static void
1819 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end,
1820 		 int *countp)
1821 {
1822 	vm_map_entry_t new_entry;
1823 
1824 	/*
1825 	 * If there is no object backing this entry, we might as well create
1826 	 * one now.  If we defer it, an object can get created after the map
1827 	 * is clipped, and individual objects will be created for the split-up
1828 	 * map.  This is a bit of a hack, but is also about the best place to
1829 	 * put this improvement.
1830 	 */
1831 
1832 	if (entry->ba.object == NULL && !map->system_map &&
1833 	    VM_MAP_ENTRY_WITHIN_PARTITION(entry)) {
1834 		vm_map_entry_allocate_object(entry);
1835 	}
1836 
1837 	/*
1838 	 * Create a new entry and insert it AFTER the specified entry
1839 	 *
1840 	 * NOTE: The replicated function will adjust start, end, and offset
1841 	 *	 for the remainder of the backing_ba linkages.  We must fixup
1842 	 *	 the embedded ba.
1843 	 */
1844 	new_entry = vm_map_entry_create(countp);
1845 	*new_entry = *entry;
1846 	new_entry->ba.start = end;
1847 	new_entry->ba.offset += (new_entry->ba.start - entry->ba.start);
1848 
1849 	/*
1850 	 * Ordering is important, make sure the new entry is replicated
1851 	 * before we cut the exiting entry.
1852 	 */
1853 	vm_map_backing_replicated(map, new_entry, MAP_BACK_CLIPPED);
1854 	vm_map_backing_adjust_end(entry, end);
1855 	vm_map_entry_link(map, new_entry);
1856 }
1857 
1858 /*
1859  * Asserts that the starting and ending region addresses fall within the
1860  * valid range for the map.
1861  */
1862 #define	VM_MAP_RANGE_CHECK(map, start, end)	\
1863 {						\
1864 	if (start < vm_map_min(map))		\
1865 		start = vm_map_min(map);	\
1866 	if (end > vm_map_max(map))		\
1867 		end = vm_map_max(map);		\
1868 	if (start > end)			\
1869 		start = end;			\
1870 }
1871 
1872 /*
1873  * Used to block when an in-transition collison occurs.  The map
1874  * is unlocked for the sleep and relocked before the return.
1875  */
1876 void
1877 vm_map_transition_wait(vm_map_t map, int relock)
1878 {
1879 	tsleep_interlock(map, 0);
1880 	vm_map_unlock(map);
1881 	tsleep(map, PINTERLOCKED, "vment", 0);
1882 	if (relock)
1883 		vm_map_lock(map);
1884 }
1885 
1886 /*
1887  * When we do blocking operations with the map lock held it is
1888  * possible that a clip might have occured on our in-transit entry,
1889  * requiring an adjustment to the entry in our loop.  These macros
1890  * help the pageable and clip_range code deal with the case.  The
1891  * conditional costs virtually nothing if no clipping has occured.
1892  */
1893 
1894 #define CLIP_CHECK_BACK(entry, save_start)			\
1895     do {							\
1896 	    while (entry->ba.start != save_start) {		\
1897 		    entry = vm_map_rb_tree_RB_PREV(entry);	\
1898 		    KASSERT(entry, ("bad entry clip")); 	\
1899 	    }							\
1900     } while(0)
1901 
1902 #define CLIP_CHECK_FWD(entry, save_end)				\
1903     do {							\
1904 	    while (entry->ba.end != save_end) {			\
1905 		    entry = vm_map_rb_tree_RB_NEXT(entry);	\
1906 		    KASSERT(entry, ("bad entry clip")); 	\
1907 	    }							\
1908     } while(0)
1909 
1910 
1911 /*
1912  * Clip the specified range and return the base entry.  The
1913  * range may cover several entries starting at the returned base
1914  * and the first and last entry in the covering sequence will be
1915  * properly clipped to the requested start and end address.
1916  *
1917  * If no holes are allowed you should pass the MAP_CLIP_NO_HOLES
1918  * flag.
1919  *
1920  * The MAP_ENTRY_IN_TRANSITION flag will be set for the entries
1921  * covered by the requested range.
1922  *
1923  * The map must be exclusively locked on entry and will remain locked
1924  * on return. If no range exists or the range contains holes and you
1925  * specified that no holes were allowed, NULL will be returned.  This
1926  * routine may temporarily unlock the map in order avoid a deadlock when
1927  * sleeping.
1928  */
1929 static
1930 vm_map_entry_t
1931 vm_map_clip_range(vm_map_t map, vm_offset_t start, vm_offset_t end,
1932 		  int *countp, int flags)
1933 {
1934 	vm_map_entry_t start_entry;
1935 	vm_map_entry_t entry;
1936 	vm_map_entry_t next;
1937 
1938 	/*
1939 	 * Locate the entry and effect initial clipping.  The in-transition
1940 	 * case does not occur very often so do not try to optimize it.
1941 	 */
1942 again:
1943 	if (vm_map_lookup_entry(map, start, &start_entry) == FALSE)
1944 		return (NULL);
1945 	entry = start_entry;
1946 	if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1947 		entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1948 		++mycpu->gd_cnt.v_intrans_coll;
1949 		++mycpu->gd_cnt.v_intrans_wait;
1950 		vm_map_transition_wait(map, 1);
1951 		/*
1952 		 * entry and/or start_entry may have been clipped while
1953 		 * we slept, or may have gone away entirely.  We have
1954 		 * to restart from the lookup.
1955 		 */
1956 		goto again;
1957 	}
1958 
1959 	/*
1960 	 * Since we hold an exclusive map lock we do not have to restart
1961 	 * after clipping, even though clipping may block in zalloc.
1962 	 */
1963 	vm_map_clip_start(map, entry, start, countp);
1964 	vm_map_clip_end(map, entry, end, countp);
1965 	entry->eflags |= MAP_ENTRY_IN_TRANSITION;
1966 
1967 	/*
1968 	 * Scan entries covered by the range.  When working on the next
1969 	 * entry a restart need only re-loop on the current entry which
1970 	 * we have already locked, since 'next' may have changed.  Also,
1971 	 * even though entry is safe, it may have been clipped so we
1972 	 * have to iterate forwards through the clip after sleeping.
1973 	 */
1974 	for (;;) {
1975 		next = vm_map_rb_tree_RB_NEXT(entry);
1976 		if (next == NULL || next->ba.start >= end)
1977 			break;
1978 		if (flags & MAP_CLIP_NO_HOLES) {
1979 			if (next->ba.start > entry->ba.end) {
1980 				vm_map_unclip_range(map, start_entry,
1981 					start, entry->ba.end, countp, flags);
1982 				return(NULL);
1983 			}
1984 		}
1985 
1986 		if (next->eflags & MAP_ENTRY_IN_TRANSITION) {
1987 			vm_offset_t save_end = entry->ba.end;
1988 			next->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1989 			++mycpu->gd_cnt.v_intrans_coll;
1990 			++mycpu->gd_cnt.v_intrans_wait;
1991 			vm_map_transition_wait(map, 1);
1992 
1993 			/*
1994 			 * clips might have occured while we blocked.
1995 			 */
1996 			CLIP_CHECK_FWD(entry, save_end);
1997 			CLIP_CHECK_BACK(start_entry, start);
1998 			continue;
1999 		}
2000 
2001 		/*
2002 		 * No restart necessary even though clip_end may block, we
2003 		 * are holding the map lock.
2004 		 */
2005 		vm_map_clip_end(map, next, end, countp);
2006 		next->eflags |= MAP_ENTRY_IN_TRANSITION;
2007 		entry = next;
2008 	}
2009 	if (flags & MAP_CLIP_NO_HOLES) {
2010 		if (entry->ba.end != end) {
2011 			vm_map_unclip_range(map, start_entry,
2012 				start, entry->ba.end, countp, flags);
2013 			return(NULL);
2014 		}
2015 	}
2016 	return(start_entry);
2017 }
2018 
2019 /*
2020  * Undo the effect of vm_map_clip_range().  You should pass the same
2021  * flags and the same range that you passed to vm_map_clip_range().
2022  * This code will clear the in-transition flag on the entries and
2023  * wake up anyone waiting.  This code will also simplify the sequence
2024  * and attempt to merge it with entries before and after the sequence.
2025  *
2026  * The map must be locked on entry and will remain locked on return.
2027  *
2028  * Note that you should also pass the start_entry returned by
2029  * vm_map_clip_range().  However, if you block between the two calls
2030  * with the map unlocked please be aware that the start_entry may
2031  * have been clipped and you may need to scan it backwards to find
2032  * the entry corresponding with the original start address.  You are
2033  * responsible for this, vm_map_unclip_range() expects the correct
2034  * start_entry to be passed to it and will KASSERT otherwise.
2035  */
2036 static
2037 void
2038 vm_map_unclip_range(vm_map_t map, vm_map_entry_t start_entry,
2039 		    vm_offset_t start, vm_offset_t end,
2040 		    int *countp, int flags)
2041 {
2042 	vm_map_entry_t entry;
2043 
2044 	entry = start_entry;
2045 
2046 	KASSERT(entry->ba.start == start, ("unclip_range: illegal base entry"));
2047 	while (entry && entry->ba.start < end) {
2048 		KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
2049 			("in-transition flag not set during unclip on: %p",
2050 			entry));
2051 		KASSERT(entry->ba.end <= end,
2052 			("unclip_range: tail wasn't clipped"));
2053 		entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
2054 		if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
2055 			entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
2056 			wakeup(map);
2057 		}
2058 		entry = vm_map_rb_tree_RB_NEXT(entry);
2059 	}
2060 
2061 	/*
2062 	 * Simplification does not block so there is no restart case.
2063 	 */
2064 	entry = start_entry;
2065 	while (entry && entry->ba.start < end) {
2066 		vm_map_simplify_entry(map, entry, countp);
2067 		entry = vm_map_rb_tree_RB_NEXT(entry);
2068 	}
2069 }
2070 
2071 /*
2072  * Mark the given range as handled by a subordinate map.
2073  *
2074  * This range must have been created with vm_map_find(), and no other
2075  * operations may have been performed on this range prior to calling
2076  * vm_map_submap().
2077  *
2078  * Submappings cannot be removed.
2079  *
2080  * No requirements.
2081  */
2082 int
2083 vm_map_submap(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap)
2084 {
2085 	vm_map_entry_t entry;
2086 	int result = KERN_INVALID_ARGUMENT;
2087 	int count;
2088 
2089 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2090 	vm_map_lock(map);
2091 
2092 	VM_MAP_RANGE_CHECK(map, start, end);
2093 
2094 	if (vm_map_lookup_entry(map, start, &entry)) {
2095 		vm_map_clip_start(map, entry, start, &count);
2096 	} else if (entry) {
2097 		entry = vm_map_rb_tree_RB_NEXT(entry);
2098 	} else {
2099 		entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2100 	}
2101 
2102 	vm_map_clip_end(map, entry, end, &count);
2103 
2104 	if ((entry->ba.start == start) && (entry->ba.end == end) &&
2105 	    ((entry->eflags & MAP_ENTRY_COW) == 0) &&
2106 	    (entry->ba.object == NULL)) {
2107 		entry->ba.sub_map = submap;
2108 		entry->maptype = VM_MAPTYPE_SUBMAP;
2109 		result = KERN_SUCCESS;
2110 	}
2111 	vm_map_unlock(map);
2112 	vm_map_entry_release(count);
2113 
2114 	return (result);
2115 }
2116 
2117 /*
2118  * Sets the protection of the specified address region in the target map.
2119  * If "set_max" is specified, the maximum protection is to be set;
2120  * otherwise, only the current protection is affected.
2121  *
2122  * The protection is not applicable to submaps, but is applicable to normal
2123  * maps and maps governed by virtual page tables.  For example, when operating
2124  * on a virtual page table our protection basically controls how COW occurs
2125  * on the backing object, whereas the virtual page table abstraction itself
2126  * is an abstraction for userland.
2127  *
2128  * No requirements.
2129  */
2130 int
2131 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
2132 	       vm_prot_t new_prot, boolean_t set_max)
2133 {
2134 	vm_map_entry_t current;
2135 	vm_map_entry_t entry;
2136 	int count;
2137 
2138 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2139 	vm_map_lock(map);
2140 
2141 	VM_MAP_RANGE_CHECK(map, start, end);
2142 
2143 	if (vm_map_lookup_entry(map, start, &entry)) {
2144 		vm_map_clip_start(map, entry, start, &count);
2145 	} else if (entry) {
2146 		entry = vm_map_rb_tree_RB_NEXT(entry);
2147 	} else {
2148 		entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2149 	}
2150 
2151 	/*
2152 	 * Make a first pass to check for protection violations.
2153 	 */
2154 	current = entry;
2155 	while (current && current->ba.start < end) {
2156 		if (current->maptype == VM_MAPTYPE_SUBMAP) {
2157 			vm_map_unlock(map);
2158 			vm_map_entry_release(count);
2159 			return (KERN_INVALID_ARGUMENT);
2160 		}
2161 		if ((new_prot & current->max_protection) != new_prot) {
2162 			vm_map_unlock(map);
2163 			vm_map_entry_release(count);
2164 			return (KERN_PROTECTION_FAILURE);
2165 		}
2166 
2167 		/*
2168 		 * When making a SHARED+RW file mmap writable, update
2169 		 * v_lastwrite_ts.
2170 		 */
2171 		if (new_prot & PROT_WRITE &&
2172 		    (current->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
2173 		    current->maptype == VM_MAPTYPE_NORMAL &&
2174 		    current->ba.object &&
2175 		    current->ba.object->type == OBJT_VNODE) {
2176 			struct vnode *vp;
2177 
2178 			vp = current->ba.object->handle;
2179 			if (vp && vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT) == 0) {
2180 				vfs_timestamp(&vp->v_lastwrite_ts);
2181 				vsetflags(vp, VLASTWRITETS);
2182 				vn_unlock(vp);
2183 			}
2184 		}
2185 		current = vm_map_rb_tree_RB_NEXT(current);
2186 	}
2187 
2188 	/*
2189 	 * Go back and fix up protections. [Note that clipping is not
2190 	 * necessary the second time.]
2191 	 */
2192 	current = entry;
2193 
2194 	while (current && current->ba.start < end) {
2195 		vm_prot_t old_prot;
2196 
2197 		vm_map_clip_end(map, current, end, &count);
2198 
2199 		old_prot = current->protection;
2200 		if (set_max) {
2201 			current->max_protection = new_prot;
2202 			current->protection = new_prot & old_prot;
2203 		} else {
2204 			current->protection = new_prot;
2205 		}
2206 
2207 		/*
2208 		 * Update physical map if necessary. Worry about copy-on-write
2209 		 * here -- CHECK THIS XXX
2210 		 */
2211 		if (current->protection != old_prot) {
2212 #define MASK(entry)	(((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
2213 							VM_PROT_ALL)
2214 
2215 			pmap_protect(map->pmap, current->ba.start,
2216 			    current->ba.end,
2217 			    current->protection & MASK(current));
2218 #undef	MASK
2219 		}
2220 
2221 		vm_map_simplify_entry(map, current, &count);
2222 
2223 		current = vm_map_rb_tree_RB_NEXT(current);
2224 	}
2225 	vm_map_unlock(map);
2226 	vm_map_entry_release(count);
2227 	return (KERN_SUCCESS);
2228 }
2229 
2230 /*
2231  * This routine traverses a processes map handling the madvise
2232  * system call.  Advisories are classified as either those effecting
2233  * the vm_map_entry structure, or those effecting the underlying
2234  * objects.
2235  *
2236  * The <value> argument is used for extended madvise calls.
2237  *
2238  * No requirements.
2239  */
2240 int
2241 vm_map_madvise(vm_map_t map, vm_offset_t start, vm_offset_t end,
2242 	       int behav, off_t value)
2243 {
2244 	vm_map_entry_t current, entry;
2245 	int modify_map = 0;
2246 	int error = 0;
2247 	int count;
2248 
2249 	/*
2250 	 * Some madvise calls directly modify the vm_map_entry, in which case
2251 	 * we need to use an exclusive lock on the map and we need to perform
2252 	 * various clipping operations.  Otherwise we only need a read-lock
2253 	 * on the map.
2254 	 */
2255 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2256 
2257 	switch(behav) {
2258 	case MADV_NORMAL:
2259 	case MADV_SEQUENTIAL:
2260 	case MADV_RANDOM:
2261 	case MADV_NOSYNC:
2262 	case MADV_AUTOSYNC:
2263 	case MADV_NOCORE:
2264 	case MADV_CORE:
2265 	case MADV_SETMAP:
2266 		modify_map = 1;
2267 		vm_map_lock(map);
2268 		break;
2269 	case MADV_INVAL:
2270 	case MADV_WILLNEED:
2271 	case MADV_DONTNEED:
2272 	case MADV_FREE:
2273 		vm_map_lock_read(map);
2274 		break;
2275 	default:
2276 		vm_map_entry_release(count);
2277 		return (EINVAL);
2278 	}
2279 
2280 	/*
2281 	 * Locate starting entry and clip if necessary.
2282 	 */
2283 
2284 	VM_MAP_RANGE_CHECK(map, start, end);
2285 
2286 	if (vm_map_lookup_entry(map, start, &entry)) {
2287 		if (modify_map)
2288 			vm_map_clip_start(map, entry, start, &count);
2289 	} else if (entry) {
2290 		entry = vm_map_rb_tree_RB_NEXT(entry);
2291 	} else {
2292 		entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2293 	}
2294 
2295 	if (modify_map) {
2296 		/*
2297 		 * madvise behaviors that are implemented in the vm_map_entry.
2298 		 *
2299 		 * We clip the vm_map_entry so that behavioral changes are
2300 		 * limited to the specified address range.
2301 		 */
2302 		for (current = entry;
2303 		     current && current->ba.start < end;
2304 		     current = vm_map_rb_tree_RB_NEXT(current)) {
2305 			/*
2306 			 * Ignore submaps
2307 			 */
2308 			if (current->maptype == VM_MAPTYPE_SUBMAP)
2309 				continue;
2310 
2311 			vm_map_clip_end(map, current, end, &count);
2312 
2313 			switch (behav) {
2314 			case MADV_NORMAL:
2315 				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
2316 				break;
2317 			case MADV_SEQUENTIAL:
2318 				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
2319 				break;
2320 			case MADV_RANDOM:
2321 				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
2322 				break;
2323 			case MADV_NOSYNC:
2324 				current->eflags |= MAP_ENTRY_NOSYNC;
2325 				break;
2326 			case MADV_AUTOSYNC:
2327 				current->eflags &= ~MAP_ENTRY_NOSYNC;
2328 				break;
2329 			case MADV_NOCORE:
2330 				current->eflags |= MAP_ENTRY_NOCOREDUMP;
2331 				break;
2332 			case MADV_CORE:
2333 				current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
2334 				break;
2335 			case MADV_SETMAP:
2336 				/*
2337 				 * Set the page directory page for a map
2338 				 * governed by a virtual page table.
2339 				 *
2340 				 * Software virtual page table support has
2341 				 * been removed, this MADV is no longer
2342 				 * supported.
2343 				 */
2344 				error = EINVAL;
2345 				break;
2346 			case MADV_INVAL:
2347 				/*
2348 				 * Invalidate the related pmap entries, used
2349 				 * to flush portions of the real kernel's
2350 				 * pmap when the caller has removed or
2351 				 * modified existing mappings in a virtual
2352 				 * page table.
2353 				 *
2354 				 * (exclusive locked map version does not
2355 				 * need the range interlock).
2356 				 */
2357 				pmap_remove(map->pmap,
2358 					    current->ba.start, current->ba.end);
2359 				break;
2360 			default:
2361 				error = EINVAL;
2362 				break;
2363 			}
2364 			vm_map_simplify_entry(map, current, &count);
2365 		}
2366 		vm_map_unlock(map);
2367 	} else {
2368 		vm_pindex_t pindex;
2369 		vm_pindex_t delta;
2370 
2371 		/*
2372 		 * madvise behaviors that are implemented in the underlying
2373 		 * vm_object.
2374 		 *
2375 		 * Since we don't clip the vm_map_entry, we have to clip
2376 		 * the vm_object pindex and count.
2377 		 *
2378 		 * NOTE!  These functions are only supported on normal maps.
2379 		 *
2380 		 * NOTE!  These functions only apply to the top-most object.
2381 		 *	  It is not applicable to backing objects.
2382 		 */
2383 		for (current = entry;
2384 		     current && current->ba.start < end;
2385 		     current = vm_map_rb_tree_RB_NEXT(current)) {
2386 			vm_offset_t useStart;
2387 
2388 			if (current->maptype != VM_MAPTYPE_NORMAL)
2389 				continue;
2390 
2391 			pindex = OFF_TO_IDX(current->ba.offset);
2392 			delta = atop(current->ba.end - current->ba.start);
2393 			useStart = current->ba.start;
2394 
2395 			if (current->ba.start < start) {
2396 				pindex += atop(start - current->ba.start);
2397 				delta -= atop(start - current->ba.start);
2398 				useStart = start;
2399 			}
2400 			if (current->ba.end > end)
2401 				delta -= atop(current->ba.end - end);
2402 
2403 			if ((vm_spindex_t)delta <= 0)
2404 				continue;
2405 
2406 			if (behav == MADV_INVAL) {
2407 				/*
2408 				 * Invalidate the related pmap entries, used
2409 				 * to flush portions of the real kernel's
2410 				 * pmap when the caller has removed or
2411 				 * modified existing mappings in a virtual
2412 				 * page table.
2413 				 *
2414 				 * (shared locked map version needs the
2415 				 * interlock, see vm_fault()).
2416 				 */
2417 				struct vm_map_ilock ilock;
2418 
2419 				KASSERT(useStart >= VM_MIN_USER_ADDRESS &&
2420 					    useStart + ptoa(delta) <=
2421 					    VM_MAX_USER_ADDRESS,
2422 					 ("Bad range %016jx-%016jx (%016jx)",
2423 					 useStart, useStart + ptoa(delta),
2424 					 delta));
2425 				vm_map_interlock(map, &ilock,
2426 						 useStart,
2427 						 useStart + ptoa(delta));
2428 				pmap_remove(map->pmap,
2429 					    useStart,
2430 					    useStart + ptoa(delta));
2431 				vm_map_deinterlock(map, &ilock);
2432 			} else {
2433 				vm_object_madvise(current->ba.object,
2434 						  pindex, delta, behav);
2435 			}
2436 
2437 			/*
2438 			 * Try to pre-populate the page table.
2439 			 */
2440 			if (behav == MADV_WILLNEED) {
2441 				pmap_object_init_pt(
2442 				    map->pmap, current,
2443 				    useStart,
2444 				    (delta << PAGE_SHIFT),
2445 				    MAP_PREFAULT_MADVISE
2446 				);
2447 			}
2448 		}
2449 		vm_map_unlock_read(map);
2450 	}
2451 	vm_map_entry_release(count);
2452 	return(error);
2453 }
2454 
2455 
2456 /*
2457  * Sets the inheritance of the specified address range in the target map.
2458  * Inheritance affects how the map will be shared with child maps at the
2459  * time of vm_map_fork.
2460  */
2461 int
2462 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
2463 	       vm_inherit_t new_inheritance)
2464 {
2465 	vm_map_entry_t entry;
2466 	vm_map_entry_t temp_entry;
2467 	int count;
2468 
2469 	switch (new_inheritance) {
2470 	case VM_INHERIT_NONE:
2471 	case VM_INHERIT_COPY:
2472 	case VM_INHERIT_SHARE:
2473 		break;
2474 	default:
2475 		return (KERN_INVALID_ARGUMENT);
2476 	}
2477 
2478 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2479 	vm_map_lock(map);
2480 
2481 	VM_MAP_RANGE_CHECK(map, start, end);
2482 
2483 	if (vm_map_lookup_entry(map, start, &temp_entry)) {
2484 		entry = temp_entry;
2485 		vm_map_clip_start(map, entry, start, &count);
2486 	} else if (temp_entry) {
2487 		entry = vm_map_rb_tree_RB_NEXT(temp_entry);
2488 	} else {
2489 		entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2490 	}
2491 
2492 	while (entry && entry->ba.start < end) {
2493 		vm_map_clip_end(map, entry, end, &count);
2494 
2495 		entry->inheritance = new_inheritance;
2496 
2497 		vm_map_simplify_entry(map, entry, &count);
2498 
2499 		entry = vm_map_rb_tree_RB_NEXT(entry);
2500 	}
2501 	vm_map_unlock(map);
2502 	vm_map_entry_release(count);
2503 	return (KERN_SUCCESS);
2504 }
2505 
2506 /*
2507  * Implement the semantics of mlock
2508  */
2509 int
2510 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t real_end,
2511 	      boolean_t new_pageable)
2512 {
2513 	vm_map_entry_t entry;
2514 	vm_map_entry_t start_entry;
2515 	vm_offset_t end;
2516 	int rv = KERN_SUCCESS;
2517 	int count;
2518 
2519 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2520 	vm_map_lock(map);
2521 	VM_MAP_RANGE_CHECK(map, start, real_end);
2522 	end = real_end;
2523 
2524 	start_entry = vm_map_clip_range(map, start, end, &count,
2525 					MAP_CLIP_NO_HOLES);
2526 	if (start_entry == NULL) {
2527 		vm_map_unlock(map);
2528 		vm_map_entry_release(count);
2529 		return (KERN_INVALID_ADDRESS);
2530 	}
2531 
2532 	if (new_pageable == 0) {
2533 		entry = start_entry;
2534 		while (entry && entry->ba.start < end) {
2535 			vm_offset_t save_start;
2536 			vm_offset_t save_end;
2537 
2538 			/*
2539 			 * Already user wired or hard wired (trivial cases)
2540 			 */
2541 			if (entry->eflags & MAP_ENTRY_USER_WIRED) {
2542 				entry = vm_map_rb_tree_RB_NEXT(entry);
2543 				continue;
2544 			}
2545 			if (entry->wired_count != 0) {
2546 				entry->wired_count++;
2547 				entry->eflags |= MAP_ENTRY_USER_WIRED;
2548 				entry = vm_map_rb_tree_RB_NEXT(entry);
2549 				continue;
2550 			}
2551 
2552 			/*
2553 			 * A new wiring requires instantiation of appropriate
2554 			 * management structures and the faulting in of the
2555 			 * page.
2556 			 */
2557 			if (entry->maptype == VM_MAPTYPE_NORMAL) {
2558 				int copyflag = entry->eflags &
2559 					       MAP_ENTRY_NEEDS_COPY;
2560 				if (copyflag && ((entry->protection &
2561 						  VM_PROT_WRITE) != 0)) {
2562 					vm_map_entry_shadow(entry);
2563 				} else if (entry->ba.object == NULL &&
2564 					   !map->system_map) {
2565 					vm_map_entry_allocate_object(entry);
2566 				}
2567 			}
2568 			entry->wired_count++;
2569 			entry->eflags |= MAP_ENTRY_USER_WIRED;
2570 
2571 			/*
2572 			 * Now fault in the area.  Note that vm_fault_wire()
2573 			 * may release the map lock temporarily, it will be
2574 			 * relocked on return.  The in-transition
2575 			 * flag protects the entries.
2576 			 */
2577 			save_start = entry->ba.start;
2578 			save_end = entry->ba.end;
2579 			rv = vm_fault_wire(map, entry, TRUE, 0);
2580 			if (rv) {
2581 				CLIP_CHECK_BACK(entry, save_start);
2582 				for (;;) {
2583 					KASSERT(entry->wired_count == 1, ("bad wired_count on entry"));
2584 					entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2585 					entry->wired_count = 0;
2586 					if (entry->ba.end == save_end)
2587 						break;
2588 					entry = vm_map_rb_tree_RB_NEXT(entry);
2589 					KASSERT(entry,
2590 					     ("bad entry clip during backout"));
2591 				}
2592 				end = save_start;	/* unwire the rest */
2593 				break;
2594 			}
2595 			/*
2596 			 * note that even though the entry might have been
2597 			 * clipped, the USER_WIRED flag we set prevents
2598 			 * duplication so we do not have to do a
2599 			 * clip check.
2600 			 */
2601 			entry = vm_map_rb_tree_RB_NEXT(entry);
2602 		}
2603 
2604 		/*
2605 		 * If we failed fall through to the unwiring section to
2606 		 * unwire what we had wired so far.  'end' has already
2607 		 * been adjusted.
2608 		 */
2609 		if (rv)
2610 			new_pageable = 1;
2611 
2612 		/*
2613 		 * start_entry might have been clipped if we unlocked the
2614 		 * map and blocked.  No matter how clipped it has gotten
2615 		 * there should be a fragment that is on our start boundary.
2616 		 */
2617 		CLIP_CHECK_BACK(start_entry, start);
2618 	}
2619 
2620 	/*
2621 	 * Deal with the unwiring case.
2622 	 */
2623 	if (new_pageable) {
2624 		/*
2625 		 * This is the unwiring case.  We must first ensure that the
2626 		 * range to be unwired is really wired down.  We know there
2627 		 * are no holes.
2628 		 */
2629 		entry = start_entry;
2630 		while (entry && entry->ba.start < end) {
2631 			if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
2632 				rv = KERN_INVALID_ARGUMENT;
2633 				goto done;
2634 			}
2635 			KASSERT(entry->wired_count != 0,
2636 				("wired count was 0 with USER_WIRED set! %p",
2637 				 entry));
2638 			entry = vm_map_rb_tree_RB_NEXT(entry);
2639 		}
2640 
2641 		/*
2642 		 * Now decrement the wiring count for each region. If a region
2643 		 * becomes completely unwired, unwire its physical pages and
2644 		 * mappings.
2645 		 */
2646 		/*
2647 		 * The map entries are processed in a loop, checking to
2648 		 * make sure the entry is wired and asserting it has a wired
2649 		 * count. However, another loop was inserted more-or-less in
2650 		 * the middle of the unwiring path. This loop picks up the
2651 		 * "entry" loop variable from the first loop without first
2652 		 * setting it to start_entry. Naturally, the secound loop
2653 		 * is never entered and the pages backing the entries are
2654 		 * never unwired. This can lead to a leak of wired pages.
2655 		 */
2656 		entry = start_entry;
2657 		while (entry && entry->ba.start < end) {
2658 			KASSERT(entry->eflags & MAP_ENTRY_USER_WIRED,
2659 				("expected USER_WIRED on entry %p", entry));
2660 			entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2661 			entry->wired_count--;
2662 			if (entry->wired_count == 0)
2663 				vm_fault_unwire(map, entry);
2664 			entry = vm_map_rb_tree_RB_NEXT(entry);
2665 		}
2666 	}
2667 done:
2668 	vm_map_unclip_range(map, start_entry, start, real_end, &count,
2669 		MAP_CLIP_NO_HOLES);
2670 	vm_map_unlock(map);
2671 	vm_map_entry_release(count);
2672 
2673 	return (rv);
2674 }
2675 
2676 /*
2677  * Sets the pageability of the specified address range in the target map.
2678  * Regions specified as not pageable require locked-down physical
2679  * memory and physical page maps.
2680  *
2681  * The map must not be locked, but a reference must remain to the map
2682  * throughout the call.
2683  *
2684  * This function may be called via the zalloc path and must properly
2685  * reserve map entries for kernel_map.
2686  *
2687  * No requirements.
2688  */
2689 int
2690 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t real_end, int kmflags)
2691 {
2692 	vm_map_entry_t entry;
2693 	vm_map_entry_t start_entry;
2694 	vm_offset_t end;
2695 	int rv = KERN_SUCCESS;
2696 	int count;
2697 
2698 	if (kmflags & KM_KRESERVE)
2699 		count = vm_map_entry_kreserve(MAP_RESERVE_COUNT);
2700 	else
2701 		count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2702 	vm_map_lock(map);
2703 	VM_MAP_RANGE_CHECK(map, start, real_end);
2704 	end = real_end;
2705 
2706 	start_entry = vm_map_clip_range(map, start, end, &count,
2707 					MAP_CLIP_NO_HOLES);
2708 	if (start_entry == NULL) {
2709 		vm_map_unlock(map);
2710 		rv = KERN_INVALID_ADDRESS;
2711 		goto failure;
2712 	}
2713 	if ((kmflags & KM_PAGEABLE) == 0) {
2714 		/*
2715 		 * Wiring.
2716 		 *
2717 		 * 1.  Holding the write lock, we create any shadow or zero-fill
2718 		 * objects that need to be created. Then we clip each map
2719 		 * entry to the region to be wired and increment its wiring
2720 		 * count.  We create objects before clipping the map entries
2721 		 * to avoid object proliferation.
2722 		 *
2723 		 * 2.  We downgrade to a read lock, and call vm_fault_wire to
2724 		 * fault in the pages for any newly wired area (wired_count is
2725 		 * 1).
2726 		 *
2727 		 * Downgrading to a read lock for vm_fault_wire avoids a
2728 		 * possible deadlock with another process that may have faulted
2729 		 * on one of the pages to be wired (it would mark the page busy,
2730 		 * blocking us, then in turn block on the map lock that we
2731 		 * hold).  Because of problems in the recursive lock package,
2732 		 * we cannot upgrade to a write lock in vm_map_lookup.  Thus,
2733 		 * any actions that require the write lock must be done
2734 		 * beforehand.  Because we keep the read lock on the map, the
2735 		 * copy-on-write status of the entries we modify here cannot
2736 		 * change.
2737 		 */
2738 		entry = start_entry;
2739 		while (entry && entry->ba.start < end) {
2740 			/*
2741 			 * Trivial case if the entry is already wired
2742 			 */
2743 			if (entry->wired_count) {
2744 				entry->wired_count++;
2745 				entry = vm_map_rb_tree_RB_NEXT(entry);
2746 				continue;
2747 			}
2748 
2749 			/*
2750 			 * The entry is being newly wired, we have to setup
2751 			 * appropriate management structures.  A shadow
2752 			 * object is required for a copy-on-write region,
2753 			 * or a normal object for a zero-fill region.  We
2754 			 * do not have to do this for entries that point to sub
2755 			 * maps because we won't hold the lock on the sub map.
2756 			 */
2757 			if (entry->maptype == VM_MAPTYPE_NORMAL) {
2758 				int copyflag = entry->eflags &
2759 					       MAP_ENTRY_NEEDS_COPY;
2760 				if (copyflag && ((entry->protection &
2761 						  VM_PROT_WRITE) != 0)) {
2762 					vm_map_entry_shadow(entry);
2763 				} else if (entry->ba.object == NULL &&
2764 					   !map->system_map) {
2765 					vm_map_entry_allocate_object(entry);
2766 				}
2767 			}
2768 			entry->wired_count++;
2769 			entry = vm_map_rb_tree_RB_NEXT(entry);
2770 		}
2771 
2772 		/*
2773 		 * Pass 2.
2774 		 */
2775 
2776 		/*
2777 		 * HACK HACK HACK HACK
2778 		 *
2779 		 * vm_fault_wire() temporarily unlocks the map to avoid
2780 		 * deadlocks.  The in-transition flag from vm_map_clip_range
2781 		 * call should protect us from changes while the map is
2782 		 * unlocked.  T
2783 		 *
2784 		 * NOTE: Previously this comment stated that clipping might
2785 		 *	 still occur while the entry is unlocked, but from
2786 		 *	 what I can tell it actually cannot.
2787 		 *
2788 		 *	 It is unclear whether the CLIP_CHECK_*() calls
2789 		 *	 are still needed but we keep them in anyway.
2790 		 *
2791 		 * HACK HACK HACK HACK
2792 		 */
2793 
2794 		entry = start_entry;
2795 		while (entry && entry->ba.start < end) {
2796 			/*
2797 			 * If vm_fault_wire fails for any page we need to undo
2798 			 * what has been done.  We decrement the wiring count
2799 			 * for those pages which have not yet been wired (now)
2800 			 * and unwire those that have (later).
2801 			 */
2802 			vm_offset_t save_start = entry->ba.start;
2803 			vm_offset_t save_end = entry->ba.end;
2804 
2805 			if (entry->wired_count == 1)
2806 				rv = vm_fault_wire(map, entry, FALSE, kmflags);
2807 			if (rv) {
2808 				CLIP_CHECK_BACK(entry, save_start);
2809 				for (;;) {
2810 					KASSERT(entry->wired_count == 1,
2811 					  ("wired_count changed unexpectedly"));
2812 					entry->wired_count = 0;
2813 					if (entry->ba.end == save_end)
2814 						break;
2815 					entry = vm_map_rb_tree_RB_NEXT(entry);
2816 					KASSERT(entry,
2817 					  ("bad entry clip during backout"));
2818 				}
2819 				end = save_start;
2820 				break;
2821 			}
2822 			CLIP_CHECK_FWD(entry, save_end);
2823 			entry = vm_map_rb_tree_RB_NEXT(entry);
2824 		}
2825 
2826 		/*
2827 		 * If a failure occured undo everything by falling through
2828 		 * to the unwiring code.  'end' has already been adjusted
2829 		 * appropriately.
2830 		 */
2831 		if (rv)
2832 			kmflags |= KM_PAGEABLE;
2833 
2834 		/*
2835 		 * start_entry is still IN_TRANSITION but may have been
2836 		 * clipped since vm_fault_wire() unlocks and relocks the
2837 		 * map.  No matter how clipped it has gotten there should
2838 		 * be a fragment that is on our start boundary.
2839 		 */
2840 		CLIP_CHECK_BACK(start_entry, start);
2841 	}
2842 
2843 	if (kmflags & KM_PAGEABLE) {
2844 		/*
2845 		 * This is the unwiring case.  We must first ensure that the
2846 		 * range to be unwired is really wired down.  We know there
2847 		 * are no holes.
2848 		 */
2849 		entry = start_entry;
2850 		while (entry && entry->ba.start < end) {
2851 			if (entry->wired_count == 0) {
2852 				rv = KERN_INVALID_ARGUMENT;
2853 				goto done;
2854 			}
2855 			entry = vm_map_rb_tree_RB_NEXT(entry);
2856 		}
2857 
2858 		/*
2859 		 * Now decrement the wiring count for each region. If a region
2860 		 * becomes completely unwired, unwire its physical pages and
2861 		 * mappings.
2862 		 */
2863 		entry = start_entry;
2864 		while (entry && entry->ba.start < end) {
2865 			entry->wired_count--;
2866 			if (entry->wired_count == 0)
2867 				vm_fault_unwire(map, entry);
2868 			entry = vm_map_rb_tree_RB_NEXT(entry);
2869 		}
2870 	}
2871 done:
2872 	vm_map_unclip_range(map, start_entry, start, real_end,
2873 			    &count, MAP_CLIP_NO_HOLES);
2874 	vm_map_unlock(map);
2875 failure:
2876 	if (kmflags & KM_KRESERVE)
2877 		vm_map_entry_krelease(count);
2878 	else
2879 		vm_map_entry_release(count);
2880 	return (rv);
2881 }
2882 
2883 /*
2884  * Mark a newly allocated address range as wired but do not fault in
2885  * the pages.  The caller is expected to load the pages into the object.
2886  *
2887  * The map must be locked on entry and will remain locked on return.
2888  * No other requirements.
2889  */
2890 void
2891 vm_map_set_wired_quick(vm_map_t map, vm_offset_t addr, vm_size_t size,
2892 		       int *countp)
2893 {
2894 	vm_map_entry_t scan;
2895 	vm_map_entry_t entry;
2896 
2897 	entry = vm_map_clip_range(map, addr, addr + size,
2898 				  countp, MAP_CLIP_NO_HOLES);
2899 	scan = entry;
2900 	while (scan && scan->ba.start < addr + size) {
2901 		KKASSERT(scan->wired_count == 0);
2902 		scan->wired_count = 1;
2903 		scan = vm_map_rb_tree_RB_NEXT(scan);
2904 	}
2905 	vm_map_unclip_range(map, entry, addr, addr + size,
2906 			    countp, MAP_CLIP_NO_HOLES);
2907 }
2908 
2909 /*
2910  * Push any dirty cached pages in the address range to their pager.
2911  * If syncio is TRUE, dirty pages are written synchronously.
2912  * If invalidate is TRUE, any cached pages are freed as well.
2913  *
2914  * This routine is called by sys_msync()
2915  *
2916  * Returns an error if any part of the specified range is not mapped.
2917  *
2918  * No requirements.
2919  */
2920 int
2921 vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end,
2922 	     boolean_t syncio, boolean_t invalidate)
2923 {
2924 	vm_map_entry_t current;
2925 	vm_map_entry_t next;
2926 	vm_map_entry_t entry;
2927 	vm_map_backing_t ba;
2928 	vm_size_t size;
2929 	vm_object_t object;
2930 	vm_ooffset_t offset;
2931 
2932 	vm_map_lock_read(map);
2933 	VM_MAP_RANGE_CHECK(map, start, end);
2934 	if (!vm_map_lookup_entry(map, start, &entry)) {
2935 		vm_map_unlock_read(map);
2936 		return (KERN_INVALID_ADDRESS);
2937 	}
2938 	lwkt_gettoken(&map->token);
2939 
2940 	/*
2941 	 * Make a first pass to check for holes.
2942 	 */
2943 	current = entry;
2944 	while (current && current->ba.start < end) {
2945 		if (current->maptype == VM_MAPTYPE_SUBMAP) {
2946 			lwkt_reltoken(&map->token);
2947 			vm_map_unlock_read(map);
2948 			return (KERN_INVALID_ARGUMENT);
2949 		}
2950 		next = vm_map_rb_tree_RB_NEXT(current);
2951 		if (end > current->ba.end &&
2952 		    (next == NULL ||
2953 		     current->ba.end != next->ba.start)) {
2954 			lwkt_reltoken(&map->token);
2955 			vm_map_unlock_read(map);
2956 			return (KERN_INVALID_ADDRESS);
2957 		}
2958 		current = next;
2959 	}
2960 
2961 	if (invalidate)
2962 		pmap_remove(vm_map_pmap(map), start, end);
2963 
2964 	/*
2965 	 * Make a second pass, cleaning/uncaching pages from the indicated
2966 	 * objects as we go.
2967 	 */
2968 	current = entry;
2969 	while (current && current->ba.start < end) {
2970 		offset = current->ba.offset + (start - current->ba.start);
2971 		size = (end <= current->ba.end ? end : current->ba.end) - start;
2972 
2973 		switch(current->maptype) {
2974 		case VM_MAPTYPE_SUBMAP:
2975 		{
2976 			vm_map_t smap;
2977 			vm_map_entry_t tentry;
2978 			vm_size_t tsize;
2979 
2980 			smap = current->ba.sub_map;
2981 			vm_map_lock_read(smap);
2982 			vm_map_lookup_entry(smap, offset, &tentry);
2983 			if (tentry == NULL) {
2984 				tsize = vm_map_max(smap) - offset;
2985 				ba = NULL;
2986 				offset = 0 + (offset - vm_map_min(smap));
2987 			} else {
2988 				tsize = tentry->ba.end - offset;
2989 				ba = &tentry->ba;
2990 				offset = tentry->ba.offset +
2991 					 (offset - tentry->ba.start);
2992 			}
2993 			vm_map_unlock_read(smap);
2994 			if (tsize < size)
2995 				size = tsize;
2996 			break;
2997 		}
2998 		case VM_MAPTYPE_NORMAL:
2999 			ba = &current->ba;
3000 			break;
3001 		default:
3002 			ba = NULL;
3003 			break;
3004 		}
3005 		if (ba) {
3006 			object = ba->object;
3007 			if (object)
3008 				vm_object_hold(object);
3009 		} else {
3010 			object = NULL;
3011 		}
3012 
3013 		/*
3014 		 * Note that there is absolutely no sense in writing out
3015 		 * anonymous objects, so we track down the vnode object
3016 		 * to write out.
3017 		 * We invalidate (remove) all pages from the address space
3018 		 * anyway, for semantic correctness.
3019 		 *
3020 		 * note: certain anonymous maps, such as MAP_NOSYNC maps,
3021 		 * may start out with a NULL object.
3022 		 *
3023 		 * XXX do we really want to stop at the first backing store
3024 		 * here if there are more? XXX
3025 		 */
3026 		if (ba) {
3027 			vm_object_t tobj;
3028 
3029 			tobj = object;
3030 			while (ba->backing_ba != NULL) {
3031 				offset -= ba->offset;
3032 				ba = ba->backing_ba;
3033 				offset += ba->offset;
3034 				tobj = ba->object;
3035 				if (tobj->size < OFF_TO_IDX(offset + size))
3036 					size = IDX_TO_OFF(tobj->size) - offset;
3037 				break; /* XXX this break is not correct */
3038 			}
3039 			if (object != tobj) {
3040 				if (object)
3041 					vm_object_drop(object);
3042 				object = tobj;
3043 				vm_object_hold(object);
3044 			}
3045 		}
3046 
3047 		if (object && (object->type == OBJT_VNODE) &&
3048 		    (current->protection & VM_PROT_WRITE) &&
3049 		    (object->flags & OBJ_NOMSYNC) == 0) {
3050 			/*
3051 			 * Flush pages if writing is allowed, invalidate them
3052 			 * if invalidation requested.  Pages undergoing I/O
3053 			 * will be ignored by vm_object_page_remove().
3054 			 *
3055 			 * We cannot lock the vnode and then wait for paging
3056 			 * to complete without deadlocking against vm_fault.
3057 			 * Instead we simply call vm_object_page_remove() and
3058 			 * allow it to block internally on a page-by-page
3059 			 * basis when it encounters pages undergoing async
3060 			 * I/O.
3061 			 */
3062 			int flags;
3063 
3064 			/* no chain wait needed for vnode objects */
3065 			vm_object_reference_locked(object);
3066 			vn_lock(object->handle, LK_EXCLUSIVE | LK_RETRY);
3067 			flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
3068 			flags |= invalidate ? OBJPC_INVAL : 0;
3069 
3070 			if (current->maptype == VM_MAPTYPE_NORMAL) {
3071 				vm_object_page_clean(object,
3072 				    OFF_TO_IDX(offset),
3073 				    OFF_TO_IDX(offset + size + PAGE_MASK),
3074 				    flags);
3075 			}
3076 			vn_unlock(((struct vnode *)object->handle));
3077 			vm_object_deallocate_locked(object);
3078 		}
3079 		if (object && invalidate &&
3080 		   ((object->type == OBJT_VNODE) ||
3081 		    (object->type == OBJT_DEVICE) ||
3082 		    (object->type == OBJT_MGTDEVICE))) {
3083 			int clean_only =
3084 				((object->type == OBJT_DEVICE) ||
3085 				(object->type == OBJT_MGTDEVICE)) ? FALSE : TRUE;
3086 			/* no chain wait needed for vnode/device objects */
3087 			vm_object_reference_locked(object);
3088 			if (current->maptype == VM_MAPTYPE_NORMAL) {
3089 				vm_object_page_remove(object,
3090 				    OFF_TO_IDX(offset),
3091 				    OFF_TO_IDX(offset + size + PAGE_MASK),
3092 				    clean_only);
3093 			}
3094 			vm_object_deallocate_locked(object);
3095 		}
3096 		start += size;
3097 		if (object)
3098 			vm_object_drop(object);
3099 		current = vm_map_rb_tree_RB_NEXT(current);
3100 	}
3101 
3102 	lwkt_reltoken(&map->token);
3103 	vm_map_unlock_read(map);
3104 
3105 	return (KERN_SUCCESS);
3106 }
3107 
3108 /*
3109  * Make the region specified by this entry pageable.
3110  *
3111  * The vm_map must be exclusively locked.
3112  */
3113 static void
3114 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
3115 {
3116 	entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3117 	entry->wired_count = 0;
3118 	vm_fault_unwire(map, entry);
3119 }
3120 
3121 /*
3122  * Deallocate the given entry from the target map.
3123  *
3124  * The vm_map must be exclusively locked.
3125  */
3126 static void
3127 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry, int *countp)
3128 {
3129 	vm_map_entry_unlink(map, entry);
3130 	map->size -= entry->ba.end - entry->ba.start;
3131 	vm_map_entry_dispose(map, entry, countp);
3132 }
3133 
3134 /*
3135  * Deallocates the given address range from the target map.
3136  *
3137  * The vm_map must be exclusively locked.
3138  */
3139 int
3140 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end, int *countp)
3141 {
3142 	vm_object_t object;
3143 	vm_map_entry_t entry;
3144 	vm_map_entry_t first_entry;
3145 	vm_offset_t hole_start;
3146 
3147 	ASSERT_VM_MAP_LOCKED(map);
3148 	lwkt_gettoken(&map->token);
3149 again:
3150 	/*
3151 	 * Find the start of the region, and clip it.  Set entry to point
3152 	 * at the first record containing the requested address or, if no
3153 	 * such record exists, the next record with a greater address.  The
3154 	 * loop will run from this point until a record beyond the termination
3155 	 * address is encountered.
3156 	 *
3157 	 * Adjust freehint[] for either the clip case or the extension case.
3158 	 *
3159 	 * GGG see other GGG comment.
3160 	 */
3161 	if (vm_map_lookup_entry(map, start, &first_entry)) {
3162 		entry = first_entry;
3163 		vm_map_clip_start(map, entry, start, countp);
3164 		hole_start = start;
3165 	} else {
3166 		if (first_entry) {
3167 			entry = vm_map_rb_tree_RB_NEXT(first_entry);
3168 			if (entry == NULL)
3169 				hole_start = first_entry->ba.start;
3170 			else
3171 				hole_start = first_entry->ba.end;
3172 		} else {
3173 			entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
3174 			if (entry == NULL)
3175 				hole_start = vm_map_min(map);
3176 			else
3177 				hole_start = vm_map_max(map);
3178 		}
3179 	}
3180 
3181 	/*
3182 	 * Step through all entries in this region
3183 	 */
3184 	while (entry && entry->ba.start < end) {
3185 		vm_map_entry_t next;
3186 		vm_offset_t s, e;
3187 		vm_pindex_t offidxstart, offidxend, count;
3188 
3189 		/*
3190 		 * If we hit an in-transition entry we have to sleep and
3191 		 * retry.  It's easier (and not really slower) to just retry
3192 		 * since this case occurs so rarely and the hint is already
3193 		 * pointing at the right place.  We have to reset the
3194 		 * start offset so as not to accidently delete an entry
3195 		 * another process just created in vacated space.
3196 		 */
3197 		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
3198 			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
3199 			start = entry->ba.start;
3200 			++mycpu->gd_cnt.v_intrans_coll;
3201 			++mycpu->gd_cnt.v_intrans_wait;
3202 			vm_map_transition_wait(map, 1);
3203 			goto again;
3204 		}
3205 		vm_map_clip_end(map, entry, end, countp);
3206 
3207 		s = entry->ba.start;
3208 		e = entry->ba.end;
3209 		next = vm_map_rb_tree_RB_NEXT(entry);
3210 
3211 		offidxstart = OFF_TO_IDX(entry->ba.offset);
3212 		count = OFF_TO_IDX(e - s);
3213 
3214 		switch(entry->maptype) {
3215 		case VM_MAPTYPE_NORMAL:
3216 		case VM_MAPTYPE_SUBMAP:
3217 			object = entry->ba.object;
3218 			break;
3219 		default:
3220 			object = NULL;
3221 			break;
3222 		}
3223 
3224 		/*
3225 		 * Unwire before removing addresses from the pmap; otherwise,
3226 		 * unwiring will put the entries back in the pmap.
3227 		 *
3228 		 * Generally speaking, doing a bulk pmap_remove() before
3229 		 * removing the pages from the VM object is better at
3230 		 * reducing unnecessary IPIs.  The pmap code is now optimized
3231 		 * to not blindly iterate the range when pt and pd pages
3232 		 * are missing.
3233 		 */
3234 		if (entry->wired_count != 0)
3235 			vm_map_entry_unwire(map, entry);
3236 
3237 		offidxend = offidxstart + count;
3238 
3239 		if (object == &kernel_object) {
3240 			pmap_remove(map->pmap, s, e);
3241 			vm_object_hold(object);
3242 			vm_object_page_remove(object, offidxstart,
3243 					      offidxend, FALSE);
3244 			vm_object_drop(object);
3245 		} else if (object && object->type != OBJT_DEFAULT &&
3246 			   object->type != OBJT_SWAP) {
3247 			/*
3248 			 * vnode object routines cannot be chain-locked,
3249 			 * but since we aren't removing pages from the
3250 			 * object here we can use a shared hold.
3251 			 */
3252 			vm_object_hold_shared(object);
3253 			pmap_remove(map->pmap, s, e);
3254 			vm_object_drop(object);
3255 		} else if (object) {
3256 			vm_object_hold(object);
3257 			pmap_remove(map->pmap, s, e);
3258 
3259 			if (object != NULL &&
3260 			    object->ref_count != 1 &&
3261 			    (object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) ==
3262 			     OBJ_ONEMAPPING &&
3263 			    (object->type == OBJT_DEFAULT ||
3264 			     object->type == OBJT_SWAP)) {
3265 				/*
3266 				 * When ONEMAPPING is set we can destroy the
3267 				 * pages underlying the entry's range.
3268 				 */
3269 				vm_object_page_remove(object, offidxstart,
3270 						      offidxend, FALSE);
3271 				if (object->type == OBJT_SWAP) {
3272 					swap_pager_freespace(object,
3273 							     offidxstart,
3274 							     count);
3275 				}
3276 				if (offidxend >= object->size &&
3277 				    offidxstart < object->size) {
3278 					object->size = offidxstart;
3279 				}
3280 			}
3281 			vm_object_drop(object);
3282 		} else if (entry->maptype == VM_MAPTYPE_UKSMAP) {
3283 			pmap_remove(map->pmap, s, e);
3284 		}
3285 
3286 		/*
3287 		 * Delete the entry (which may delete the object) only after
3288 		 * removing all pmap entries pointing to its pages.
3289 		 * (Otherwise, its page frames may be reallocated, and any
3290 		 * modify bits will be set in the wrong object!)
3291 		 */
3292 		vm_map_entry_delete(map, entry, countp);
3293 		entry = next;
3294 	}
3295 
3296 	/*
3297 	 * We either reached the end and use vm_map_max as the end
3298 	 * address, or we didn't and we use the next entry as the
3299 	 * end address.
3300 	 */
3301 	if (entry == NULL) {
3302 		vm_map_freehint_hole(map, hole_start,
3303 				     vm_map_max(map) - hole_start);
3304 	} else {
3305 		vm_map_freehint_hole(map, hole_start,
3306 				     entry->ba.start - hole_start);
3307 	}
3308 
3309 	lwkt_reltoken(&map->token);
3310 
3311 	return (KERN_SUCCESS);
3312 }
3313 
3314 /*
3315  * Remove the given address range from the target map.
3316  * This is the exported form of vm_map_delete.
3317  *
3318  * No requirements.
3319  */
3320 int
3321 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
3322 {
3323 	int result;
3324 	int count;
3325 
3326 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3327 	vm_map_lock(map);
3328 	VM_MAP_RANGE_CHECK(map, start, end);
3329 	result = vm_map_delete(map, start, end, &count);
3330 	vm_map_unlock(map);
3331 	vm_map_entry_release(count);
3332 
3333 	return (result);
3334 }
3335 
3336 /*
3337  * Assert that the target map allows the specified privilege on the
3338  * entire address region given.  The entire region must be allocated.
3339  *
3340  * The caller must specify whether the vm_map is already locked or not.
3341  */
3342 boolean_t
3343 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
3344 			vm_prot_t protection, boolean_t have_lock)
3345 {
3346 	vm_map_entry_t entry;
3347 	vm_map_entry_t tmp_entry;
3348 	boolean_t result;
3349 
3350 	if (have_lock == FALSE)
3351 		vm_map_lock_read(map);
3352 
3353 	if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
3354 		if (have_lock == FALSE)
3355 			vm_map_unlock_read(map);
3356 		return (FALSE);
3357 	}
3358 	entry = tmp_entry;
3359 
3360 	result = TRUE;
3361 	while (start < end) {
3362 		if (entry == NULL) {
3363 			result = FALSE;
3364 			break;
3365 		}
3366 
3367 		/*
3368 		 * No holes allowed!
3369 		 */
3370 
3371 		if (start < entry->ba.start) {
3372 			result = FALSE;
3373 			break;
3374 		}
3375 		/*
3376 		 * Check protection associated with entry.
3377 		 */
3378 
3379 		if ((entry->protection & protection) != protection) {
3380 			result = FALSE;
3381 			break;
3382 		}
3383 		/* go to next entry */
3384 		start = entry->ba.end;
3385 		entry = vm_map_rb_tree_RB_NEXT(entry);
3386 	}
3387 	if (have_lock == FALSE)
3388 		vm_map_unlock_read(map);
3389 	return (result);
3390 }
3391 
3392 /*
3393  * vm_map_backing structures are not shared across forks and must be
3394  * replicated.
3395  *
3396  * Generally speaking we must reallocate the backing_ba sequence and
3397  * also adjust it for any changes made to the base entry->ba.start and
3398  * entry->ba.end.  The first ba in the chain is of course &entry->ba,
3399  * so we only need to adjust subsequent ba's start, end, and offset.
3400  *
3401  * MAP_BACK_CLIPPED	- Called as part of a clipping replication.
3402  *			  Do not clear OBJ_ONEMAPPING.
3403  *
3404  * MAP_BACK_BASEOBJREFD - Called from vm_map_insert().  The base object
3405  *			  has already been referenced.
3406  */
3407 static
3408 void
3409 vm_map_backing_replicated(vm_map_t map, vm_map_entry_t entry, int flags)
3410 {
3411 	vm_map_backing_t ba;
3412 	vm_map_backing_t nba;
3413 	vm_object_t object;
3414 
3415 	ba = &entry->ba;
3416 	for (;;) {
3417 		ba->pmap = map->pmap;
3418 
3419 		if (ba->map_object) {
3420 			switch(entry->maptype) {
3421 			case VM_MAPTYPE_NORMAL:
3422 				object = ba->object;
3423 				if (ba != &entry->ba ||
3424 				    (flags & MAP_BACK_BASEOBJREFD) == 0) {
3425 					vm_object_reference_quick(object);
3426 				}
3427 				vm_map_backing_attach(entry, ba);
3428 				if ((flags & MAP_BACK_CLIPPED) == 0 &&
3429 				    object->ref_count > 1) {
3430 					vm_object_clear_flag(object,
3431 							     OBJ_ONEMAPPING);
3432 				}
3433 				break;
3434 			case VM_MAPTYPE_UKSMAP:
3435 				vm_map_backing_attach(entry, ba);
3436 				break;
3437 			default:
3438 				break;
3439 			}
3440 		}
3441 		if (ba->backing_ba == NULL)
3442 			break;
3443 
3444 		/*
3445 		 * NOTE: The aux_info field is retained.
3446 		 */
3447 		nba = kmalloc(sizeof(*nba), M_MAP_BACKING, M_INTWAIT);
3448 		*nba = *ba->backing_ba;
3449 		nba->offset += (ba->start - nba->start);  /* += (new - old) */
3450 		nba->start = ba->start;
3451 		nba->end = ba->end;
3452 		ba->backing_ba = nba;
3453 		ba = nba;
3454 		/* pmap is replaced at the top of the loop */
3455 	}
3456 }
3457 
3458 static
3459 void
3460 vm_map_backing_adjust_start(vm_map_entry_t entry, vm_ooffset_t start)
3461 {
3462 	vm_map_backing_t ba;
3463 
3464 	if (entry->maptype == VM_MAPTYPE_NORMAL) {
3465 		for (ba = &entry->ba; ba; ba = ba->backing_ba) {
3466 			if (ba->object) {
3467 				lockmgr(&ba->object->backing_lk, LK_EXCLUSIVE);
3468 				ba->offset += (start - ba->start);
3469 				ba->start = start;
3470 				lockmgr(&ba->object->backing_lk, LK_RELEASE);
3471 			} else {
3472 				ba->offset += (start - ba->start);
3473 				ba->start = start;
3474 			}
3475 		}
3476 	} else {
3477 		/* not an object and can't be shadowed */
3478 	}
3479 }
3480 
3481 static
3482 void
3483 vm_map_backing_adjust_end(vm_map_entry_t entry, vm_ooffset_t end)
3484 {
3485 	vm_map_backing_t ba;
3486 
3487 	if (entry->maptype == VM_MAPTYPE_NORMAL) {
3488 		for (ba = &entry->ba; ba; ba = ba->backing_ba) {
3489 			if (ba->object) {
3490 				lockmgr(&ba->object->backing_lk, LK_EXCLUSIVE);
3491 				ba->end = end;
3492 				lockmgr(&ba->object->backing_lk, LK_RELEASE);
3493 			} else {
3494 				ba->end = end;
3495 			}
3496 		}
3497 	} /* else not an object and/or can't be shadowed */
3498 }
3499 
3500 /*
3501  * Handles the dirty work of making src_entry and dst_entry copy-on-write
3502  * after src_entry has been cloned to dst_entry.  For normal entries only.
3503  *
3504  * The vm_maps must be exclusively locked.
3505  * The vm_map's token must be held.
3506  *
3507  * Because the maps are locked no faults can be in progress during the
3508  * operation.
3509  */
3510 static void
3511 vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map,
3512 		  vm_map_entry_t src_entry, vm_map_entry_t dst_entry)
3513 {
3514 	vm_object_t obj;
3515 
3516 	KKASSERT(dst_entry->maptype == VM_MAPTYPE_NORMAL);
3517 
3518 	if (src_entry->wired_count) {
3519 		/*
3520 		 * Of course, wired down pages can't be set copy-on-write.
3521 		 * Cause wired pages to be copied into the new map by
3522 		 * simulating faults (the new pages are pageable)
3523 		 *
3524 		 * Scrap ba.object (its ref-count has not yet been adjusted
3525 		 * so we can just NULL out the field).  Remove the backing
3526 		 * store.
3527 		 *
3528 		 * Then call vm_fault_copy_entry() to create a new object
3529 		 * in dst_entry and copy the wired pages from src to dst.
3530 		 *
3531 		 * The fault-copy code doesn't work with virtual page
3532 		 * tables.
3533 		 *
3534 		 * NOTE: obj is not actually an object for all MAPTYPEs,
3535 		 *	 just test against NULL.
3536 		 */
3537 		if (dst_entry->ba.map_object != NULL) {
3538 			vm_map_backing_detach(dst_entry, &dst_entry->ba);
3539 			dst_entry->ba.map_object = NULL;
3540 			vm_map_entry_dispose_ba(dst_entry,
3541 						dst_entry->ba.backing_ba);
3542 			dst_entry->ba.backing_ba = NULL;
3543 			dst_entry->ba.backing_count = 0;
3544 		}
3545 		vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry);
3546 	} else {
3547 		if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
3548 			/*
3549 			 * If the source entry is not already marked NEEDS_COPY
3550 			 * we need to write-protect the PTEs.
3551 			 */
3552 			pmap_protect(src_map->pmap,
3553 				     src_entry->ba.start,
3554 				     src_entry->ba.end,
3555 				     src_entry->protection & ~VM_PROT_WRITE);
3556 		}
3557 
3558 		/*
3559 		 * dst_entry.ba_object might be stale.  Update it (its
3560 		 * ref-count has not yet been updated so just overwrite
3561 		 * the field).
3562 		 *
3563 		 * If there is no object then we are golden.  Also, in
3564 		 * this situation if there are no backing_ba linkages then
3565 		 * we can set ba.offset to whatever we want.  For now we
3566 		 * set the offset for 0 for make debugging object sizes
3567 		 * easier.
3568 		 */
3569 		obj = src_entry->ba.object;
3570 
3571 		if (obj) {
3572 			src_entry->eflags |= (MAP_ENTRY_COW |
3573 					      MAP_ENTRY_NEEDS_COPY);
3574 			dst_entry->eflags |= (MAP_ENTRY_COW |
3575 					      MAP_ENTRY_NEEDS_COPY);
3576 			KKASSERT(dst_entry->ba.offset == src_entry->ba.offset);
3577 		} else {
3578 			dst_entry->ba.offset = 0;
3579 		}
3580 
3581 		/*
3582 		 * Normal, allow the backing_ba link depth to
3583 		 * increase.
3584 		 */
3585 		pmap_copy(dst_map->pmap, src_map->pmap,
3586 			  dst_entry->ba.start,
3587 			  dst_entry->ba.end - dst_entry->ba.start,
3588 			  src_entry->ba.start);
3589 	}
3590 }
3591 
3592 /*
3593  * Create a vmspace for a new process and its related vm_map based on an
3594  * existing vmspace.  The new map inherits information from the old map
3595  * according to inheritance settings.
3596  *
3597  * The source map must not be locked.
3598  * No requirements.
3599  */
3600 static void vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map,
3601 			  vm_map_entry_t old_entry, int *countp);
3602 static void vmspace_fork_uksmap_entry(struct proc *p2, struct lwp *lp2,
3603 			  vm_map_t old_map, vm_map_t new_map,
3604 			  vm_map_entry_t old_entry, int *countp);
3605 
3606 struct vmspace *
3607 vmspace_fork(struct vmspace *vm1, struct proc *p2, struct lwp *lp2)
3608 {
3609 	struct vmspace *vm2;
3610 	vm_map_t old_map = &vm1->vm_map;
3611 	vm_map_t new_map;
3612 	vm_map_entry_t old_entry;
3613 	int count;
3614 
3615 	lwkt_gettoken(&vm1->vm_map.token);
3616 	vm_map_lock(old_map);
3617 
3618 	vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map));
3619 	lwkt_gettoken(&vm2->vm_map.token);
3620 
3621 	/*
3622 	 * We must bump the timestamp to force any concurrent fault
3623 	 * to retry.
3624 	 */
3625 	bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy,
3626 	      (caddr_t)&vm1->vm_endcopy - (caddr_t)&vm1->vm_startcopy);
3627 	new_map = &vm2->vm_map;	/* XXX */
3628 	new_map->timestamp = 1;
3629 
3630 	vm_map_lock(new_map);
3631 
3632 	count = old_map->nentries;
3633 	count = vm_map_entry_reserve(count + MAP_RESERVE_COUNT);
3634 
3635 	RB_FOREACH(old_entry, vm_map_rb_tree, &old_map->rb_root) {
3636 		switch(old_entry->maptype) {
3637 		case VM_MAPTYPE_SUBMAP:
3638 			panic("vm_map_fork: encountered a submap");
3639 			break;
3640 		case VM_MAPTYPE_UKSMAP:
3641 			vmspace_fork_uksmap_entry(p2, lp2,
3642 						  old_map, new_map,
3643 						  old_entry, &count);
3644 			break;
3645 		case VM_MAPTYPE_NORMAL:
3646 			vmspace_fork_normal_entry(old_map, new_map,
3647 						  old_entry, &count);
3648 			break;
3649 		default:
3650 			/* nothing to do */
3651 			break;
3652 		}
3653 	}
3654 
3655 	new_map->size = old_map->size;
3656 	vm_map_unlock(new_map);
3657 	vm_map_unlock(old_map);
3658 	vm_map_entry_release(count);
3659 
3660 	lwkt_reltoken(&vm2->vm_map.token);
3661 	lwkt_reltoken(&vm1->vm_map.token);
3662 
3663 	return (vm2);
3664 }
3665 
3666 static
3667 void
3668 vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map,
3669 			  vm_map_entry_t old_entry, int *countp)
3670 {
3671 	vm_map_entry_t new_entry;
3672 	vm_map_backing_t ba;
3673 	vm_object_t object;
3674 
3675 	/*
3676 	 * If the backing_ba link list gets too long then fault it
3677 	 * all into the head object and dispose of the list.  We do
3678 	 * this in old_entry prior to cloning in order to benefit both
3679 	 * parent and child.
3680 	 *
3681 	 * We can test our fronting object's size against its
3682 	 * resident_page_count for a really cheap (but probably not perfect)
3683 	 * all-shadowed test, allowing us to disconnect the backing_ba
3684 	 * link list early.
3685 	 */
3686 	object = old_entry->ba.object;
3687 	if (old_entry->ba.backing_ba &&
3688 	    (old_entry->ba.backing_count >= vm_map_backing_limit ||
3689 	     (vm_map_backing_shadow_test && object &&
3690 	      object->size == object->resident_page_count))) {
3691 		/*
3692 		 * If there are too many backing_ba linkages we
3693 		 * collapse everything into the head
3694 		 *
3695 		 * This will also remove all the pte's.
3696 		 */
3697 		if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY)
3698 			vm_map_entry_shadow(old_entry);
3699 		if (object == NULL)
3700 			vm_map_entry_allocate_object(old_entry);
3701 		if (vm_fault_collapse(old_map, old_entry) == KERN_SUCCESS) {
3702 			ba = old_entry->ba.backing_ba;
3703 			old_entry->ba.backing_ba = NULL;
3704 			old_entry->ba.backing_count = 0;
3705 			vm_map_entry_dispose_ba(old_entry, ba);
3706 		}
3707 	}
3708 	object = NULL;	/* object variable is now invalid */
3709 
3710 	/*
3711 	 * Fork the entry
3712 	 */
3713 	switch (old_entry->inheritance) {
3714 	case VM_INHERIT_NONE:
3715 		break;
3716 	case VM_INHERIT_SHARE:
3717 		/*
3718 		 * Clone the entry as a shared entry.  This will look like
3719 		 * shared memory across the old and the new process.  We must
3720 		 * ensure that the object is allocated.
3721 		 */
3722 		if (old_entry->ba.object == NULL)
3723 			vm_map_entry_allocate_object(old_entry);
3724 
3725 		if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3726 			/*
3727 			 * Create the fronting vm_map_backing for
3728 			 * an entry which needs a copy, plus an extra
3729 			 * ref because we are going to duplicate it
3730 			 * in the fork.
3731 			 *
3732 			 * The call to vm_map_entry_shadow() will also clear
3733 			 * OBJ_ONEMAPPING.
3734 			 *
3735 			 * XXX no more collapse.  Still need extra ref
3736 			 * for the fork.
3737 			 */
3738 			vm_map_entry_shadow(old_entry);
3739 		} else if (old_entry->ba.object) {
3740 			object = old_entry->ba.object;
3741 		}
3742 
3743 		/*
3744 		 * Clone the entry.  We've already bumped the ref on
3745 		 * the vm_object for our new entry.
3746 		 */
3747 		new_entry = vm_map_entry_create(countp);
3748 		*new_entry = *old_entry;
3749 
3750 		new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3751 		new_entry->wired_count = 0;
3752 
3753 		/*
3754 		 * Replicate and index the vm_map_backing.  Don't share
3755 		 * the vm_map_backing across vm_map's (only across clips).
3756 		 *
3757 		 * Insert the entry into the new map -- we know we're
3758 		 * inserting at the end of the new map.
3759 		 */
3760 		vm_map_backing_replicated(new_map, new_entry, 0);
3761 		vm_map_entry_link(new_map, new_entry);
3762 
3763 		/*
3764 		 * Update the physical map
3765 		 */
3766 		pmap_copy(new_map->pmap, old_map->pmap,
3767 			  new_entry->ba.start,
3768 			  (old_entry->ba.end - old_entry->ba.start),
3769 			  old_entry->ba.start);
3770 		break;
3771 	case VM_INHERIT_COPY:
3772 		/*
3773 		 * Clone the entry and link the copy into the new map.
3774 		 *
3775 		 * Note that ref-counting adjustment for old_entry->ba.object
3776 		 * (if it isn't a special map that is) is handled by
3777 		 * vm_map_copy_entry().
3778 		 */
3779 		new_entry = vm_map_entry_create(countp);
3780 		*new_entry = *old_entry;
3781 
3782 		new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3783 		new_entry->wired_count = 0;
3784 
3785 		vm_map_backing_replicated(new_map, new_entry, 0);
3786 		vm_map_entry_link(new_map, new_entry);
3787 
3788 		/*
3789 		 * This does the actual dirty work of making both entries
3790 		 * copy-on-write, and will also handle the fronting object.
3791 		 */
3792 		vm_map_copy_entry(old_map, new_map, old_entry, new_entry);
3793 		break;
3794 	}
3795 }
3796 
3797 /*
3798  * When forking user-kernel shared maps, the map might change in the
3799  * child so do not try to copy the underlying pmap entries.
3800  */
3801 static
3802 void
3803 vmspace_fork_uksmap_entry(struct proc *p2, struct lwp *lp2,
3804 			  vm_map_t old_map, vm_map_t new_map,
3805 			  vm_map_entry_t old_entry, int *countp)
3806 {
3807 	vm_map_entry_t new_entry;
3808 
3809 	/*
3810 	 * Do not fork lpmap entries whos TIDs do not match lp2's tid.
3811 	 *
3812 	 * XXX if p2 is NULL and lp2 is non-NULL, we retain the lpmap entry
3813 	 * (this is for e.g. resident'ing vmspace's) but set the field
3814 	 * to NULL.  Upon restore it should be restored. XXX NOT IMPL YET
3815 	 */
3816 	if (old_entry->aux.dev) {
3817 		switch(minor(old_entry->aux.dev)) {
3818 		case 5:
3819 			break;
3820 		case 6:
3821 			break;
3822 		case 7:
3823 			if (lp2 == NULL)
3824 				return;
3825 			if (old_entry->ba.aux_info == NULL)
3826 				return;
3827 			if (((struct lwp *)old_entry->ba.aux_info)->lwp_tid !=
3828 			    lp2->lwp_tid)
3829 				return;
3830 			break;
3831 		}
3832 	}
3833 
3834 	new_entry = vm_map_entry_create(countp);
3835 	*new_entry = *old_entry;
3836 
3837 	new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3838 	new_entry->wired_count = 0;
3839 	KKASSERT(new_entry->ba.backing_ba == NULL);
3840 
3841 	if (new_entry->aux.dev) {
3842 		switch(minor(new_entry->aux.dev)) {
3843 		case 5:
3844 			/*
3845 			 * upmap
3846 			 */
3847 			new_entry->ba.aux_info = p2;
3848 			break;
3849 		case 6:
3850 			/*
3851 			 * kpmap
3852 			 */
3853 			new_entry->ba.aux_info = NULL;
3854 			break;
3855 		case 7:
3856 			/*
3857 			 * lpmap
3858 			 */
3859 			new_entry->ba.aux_info = lp2;
3860 			break;
3861 		}
3862 	} else {
3863 		new_entry->ba.aux_info = NULL;
3864 	}
3865 
3866 	vm_map_backing_replicated(new_map, new_entry, 0);
3867 
3868 	vm_map_entry_link(new_map, new_entry);
3869 }
3870 
3871 /*
3872  * Create an auto-grow stack entry
3873  *
3874  * No requirements.
3875  */
3876 int
3877 vm_map_stack (vm_map_t map, vm_offset_t *addrbos, vm_size_t max_ssize,
3878 	      int flags, vm_prot_t prot, vm_prot_t max, int cow)
3879 {
3880 	vm_map_entry_t	prev_entry;
3881 	vm_map_entry_t	next;
3882 	vm_size_t	init_ssize;
3883 	int		rv;
3884 	int		count;
3885 	vm_offset_t	tmpaddr;
3886 
3887 	cow |= MAP_IS_STACK;
3888 
3889 	if (max_ssize < sgrowsiz)
3890 		init_ssize = max_ssize;
3891 	else
3892 		init_ssize = sgrowsiz;
3893 
3894 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3895 	vm_map_lock(map);
3896 
3897 	/*
3898 	 * Find space for the mapping
3899 	 */
3900 	if ((flags & (MAP_FIXED | MAP_TRYFIXED)) == 0) {
3901 		if (vm_map_findspace(map, *addrbos, max_ssize, 1,
3902 				     flags, &tmpaddr)) {
3903 			vm_map_unlock(map);
3904 			vm_map_entry_release(count);
3905 			return (KERN_NO_SPACE);
3906 		}
3907 		*addrbos = tmpaddr;
3908 	}
3909 
3910 	/* If addr is already mapped, no go */
3911 	if (vm_map_lookup_entry(map, *addrbos, &prev_entry)) {
3912 		vm_map_unlock(map);
3913 		vm_map_entry_release(count);
3914 		return (KERN_NO_SPACE);
3915 	}
3916 
3917 #if 0
3918 	/* XXX already handled by kern_mmap() */
3919 	/* If we would blow our VMEM resource limit, no go */
3920 	if (map->size + init_ssize >
3921 	    curproc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
3922 		vm_map_unlock(map);
3923 		vm_map_entry_release(count);
3924 		return (KERN_NO_SPACE);
3925 	}
3926 #endif
3927 
3928 	/*
3929 	 * If we can't accomodate max_ssize in the current mapping,
3930 	 * no go.  However, we need to be aware that subsequent user
3931 	 * mappings might map into the space we have reserved for
3932 	 * stack, and currently this space is not protected.
3933 	 *
3934 	 * Hopefully we will at least detect this condition
3935 	 * when we try to grow the stack.
3936 	 */
3937 	if (prev_entry)
3938 		next = vm_map_rb_tree_RB_NEXT(prev_entry);
3939 	else
3940 		next = RB_MIN(vm_map_rb_tree, &map->rb_root);
3941 
3942 	if (next && next->ba.start < *addrbos + max_ssize) {
3943 		vm_map_unlock(map);
3944 		vm_map_entry_release(count);
3945 		return (KERN_NO_SPACE);
3946 	}
3947 
3948 	/*
3949 	 * We initially map a stack of only init_ssize.  We will
3950 	 * grow as needed later.  Since this is to be a grow
3951 	 * down stack, we map at the top of the range.
3952 	 *
3953 	 * Note: we would normally expect prot and max to be
3954 	 * VM_PROT_ALL, and cow to be 0.  Possibly we should
3955 	 * eliminate these as input parameters, and just
3956 	 * pass these values here in the insert call.
3957 	 */
3958 	rv = vm_map_insert(map, &count,
3959 			   NULL, NULL,
3960 			   0, NULL,
3961 			   *addrbos + max_ssize - init_ssize,
3962 	                   *addrbos + max_ssize,
3963 			   VM_MAPTYPE_NORMAL,
3964 			   VM_SUBSYS_STACK, prot, max, cow);
3965 
3966 	/* Now set the avail_ssize amount */
3967 	if (rv == KERN_SUCCESS) {
3968 		if (prev_entry)
3969 			next = vm_map_rb_tree_RB_NEXT(prev_entry);
3970 		else
3971 			next = RB_MIN(vm_map_rb_tree, &map->rb_root);
3972 		if (prev_entry != NULL) {
3973 			vm_map_clip_end(map,
3974 					prev_entry,
3975 					*addrbos + max_ssize - init_ssize,
3976 					&count);
3977 		}
3978 		if (next->ba.end   != *addrbos + max_ssize ||
3979 		    next->ba.start != *addrbos + max_ssize - init_ssize){
3980 			panic ("Bad entry start/end for new stack entry");
3981 		} else {
3982 			next->aux.avail_ssize = max_ssize - init_ssize;
3983 		}
3984 	}
3985 
3986 	vm_map_unlock(map);
3987 	vm_map_entry_release(count);
3988 	return (rv);
3989 }
3990 
3991 /*
3992  * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if the
3993  * desired address is already mapped, or if we successfully grow
3994  * the stack.  Also returns KERN_SUCCESS if addr is outside the
3995  * stack range (this is strange, but preserves compatibility with
3996  * the grow function in vm_machdep.c).
3997  *
3998  * No requirements.
3999  */
4000 int
4001 vm_map_growstack (vm_map_t map, vm_offset_t addr)
4002 {
4003 	vm_map_entry_t prev_entry;
4004 	vm_map_entry_t stack_entry;
4005 	vm_map_entry_t next;
4006 	struct vmspace *vm;
4007 	struct lwp *lp;
4008 	struct proc *p;
4009 	vm_offset_t    end;
4010 	int grow_amount;
4011 	int rv = KERN_SUCCESS;
4012 	int is_procstack;
4013 	int use_read_lock = 1;
4014 	int count;
4015 
4016 	/*
4017 	 * Find the vm
4018 	 */
4019 	lp = curthread->td_lwp;
4020 	p = curthread->td_proc;
4021 	KKASSERT(lp != NULL);
4022 	vm = lp->lwp_vmspace;
4023 
4024 	/*
4025 	 * Growstack is only allowed on the current process.  We disallow
4026 	 * other use cases, e.g. trying to access memory via procfs that
4027 	 * the stack hasn't grown into.
4028 	 */
4029 	if (map != &vm->vm_map) {
4030 		return KERN_FAILURE;
4031 	}
4032 
4033 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
4034 Retry:
4035 	if (use_read_lock)
4036 		vm_map_lock_read(map);
4037 	else
4038 		vm_map_lock(map);
4039 
4040 	/*
4041 	 * If addr is already in the entry range, no need to grow.
4042 	 * prev_entry returns NULL if addr is at the head.
4043 	 */
4044 	if (vm_map_lookup_entry(map, addr, &prev_entry))
4045 		goto done;
4046 	if (prev_entry)
4047 		stack_entry = vm_map_rb_tree_RB_NEXT(prev_entry);
4048 	else
4049 		stack_entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
4050 
4051 	if (stack_entry == NULL)
4052 		goto done;
4053 	if (prev_entry == NULL)
4054 		end = stack_entry->ba.start - stack_entry->aux.avail_ssize;
4055 	else
4056 		end = prev_entry->ba.end;
4057 
4058 	/*
4059 	 * This next test mimics the old grow function in vm_machdep.c.
4060 	 * It really doesn't quite make sense, but we do it anyway
4061 	 * for compatibility.
4062 	 *
4063 	 * If not growable stack, return success.  This signals the
4064 	 * caller to proceed as he would normally with normal vm.
4065 	 */
4066 	if (stack_entry->aux.avail_ssize < 1 ||
4067 	    addr >= stack_entry->ba.start ||
4068 	    addr <  stack_entry->ba.start - stack_entry->aux.avail_ssize) {
4069 		goto done;
4070 	}
4071 
4072 	/* Find the minimum grow amount */
4073 	grow_amount = roundup (stack_entry->ba.start - addr, PAGE_SIZE);
4074 	if (grow_amount > stack_entry->aux.avail_ssize) {
4075 		rv = KERN_NO_SPACE;
4076 		goto done;
4077 	}
4078 
4079 	/*
4080 	 * If there is no longer enough space between the entries
4081 	 * nogo, and adjust the available space.  Note: this
4082 	 * should only happen if the user has mapped into the
4083 	 * stack area after the stack was created, and is
4084 	 * probably an error.
4085 	 *
4086 	 * This also effectively destroys any guard page the user
4087 	 * might have intended by limiting the stack size.
4088 	 */
4089 	if (grow_amount > stack_entry->ba.start - end) {
4090 		if (use_read_lock && vm_map_lock_upgrade(map)) {
4091 			/* lost lock */
4092 			use_read_lock = 0;
4093 			goto Retry;
4094 		}
4095 		use_read_lock = 0;
4096 		stack_entry->aux.avail_ssize = stack_entry->ba.start - end;
4097 		rv = KERN_NO_SPACE;
4098 		goto done;
4099 	}
4100 
4101 	is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr;
4102 
4103 	/* If this is the main process stack, see if we're over the
4104 	 * stack limit.
4105 	 */
4106 	if (is_procstack && (vm->vm_ssize + grow_amount >
4107 			     p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
4108 		rv = KERN_NO_SPACE;
4109 		goto done;
4110 	}
4111 
4112 	/* Round up the grow amount modulo SGROWSIZ */
4113 	grow_amount = roundup (grow_amount, sgrowsiz);
4114 	if (grow_amount > stack_entry->aux.avail_ssize) {
4115 		grow_amount = stack_entry->aux.avail_ssize;
4116 	}
4117 	if (is_procstack && (vm->vm_ssize + grow_amount >
4118 	                     p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
4119 		grow_amount = p->p_rlimit[RLIMIT_STACK].rlim_cur - vm->vm_ssize;
4120 	}
4121 
4122 	/* If we would blow our VMEM resource limit, no go */
4123 	if (map->size + grow_amount > p->p_rlimit[RLIMIT_VMEM].rlim_cur) {
4124 		rv = KERN_NO_SPACE;
4125 		goto done;
4126 	}
4127 
4128 	if (use_read_lock && vm_map_lock_upgrade(map)) {
4129 		/* lost lock */
4130 		use_read_lock = 0;
4131 		goto Retry;
4132 	}
4133 	use_read_lock = 0;
4134 
4135 	/* Get the preliminary new entry start value */
4136 	addr = stack_entry->ba.start - grow_amount;
4137 
4138 	/* If this puts us into the previous entry, cut back our growth
4139 	 * to the available space.  Also, see the note above.
4140 	 */
4141 	if (addr < end) {
4142 		stack_entry->aux.avail_ssize = stack_entry->ba.start - end;
4143 		addr = end;
4144 	}
4145 
4146 	rv = vm_map_insert(map, &count,
4147 			   NULL, NULL,
4148 			   0, NULL,
4149 			   addr, stack_entry->ba.start,
4150 			   VM_MAPTYPE_NORMAL,
4151 			   VM_SUBSYS_STACK, VM_PROT_ALL, VM_PROT_ALL, 0);
4152 
4153 	/* Adjust the available stack space by the amount we grew. */
4154 	if (rv == KERN_SUCCESS) {
4155 		if (prev_entry) {
4156 			vm_map_clip_end(map, prev_entry, addr, &count);
4157 			next = vm_map_rb_tree_RB_NEXT(prev_entry);
4158 		} else {
4159 			next = RB_MIN(vm_map_rb_tree, &map->rb_root);
4160 		}
4161 		if (next->ba.end != stack_entry->ba.start  ||
4162 		    next->ba.start != addr) {
4163 			panic ("Bad stack grow start/end in new stack entry");
4164 		} else {
4165 			next->aux.avail_ssize =
4166 				stack_entry->aux.avail_ssize -
4167 				(next->ba.end - next->ba.start);
4168 			if (is_procstack) {
4169 				vm->vm_ssize += next->ba.end -
4170 						next->ba.start;
4171 			}
4172 		}
4173 
4174 		if (map->flags & MAP_WIREFUTURE)
4175 			vm_map_unwire(map, next->ba.start, next->ba.end, FALSE);
4176 	}
4177 
4178 done:
4179 	if (use_read_lock)
4180 		vm_map_unlock_read(map);
4181 	else
4182 		vm_map_unlock(map);
4183 	vm_map_entry_release(count);
4184 	return (rv);
4185 }
4186 
4187 /*
4188  * Unshare the specified VM space for exec.  If other processes are
4189  * mapped to it, then create a new one.  The new vmspace is null.
4190  *
4191  * No requirements.
4192  */
4193 void
4194 vmspace_exec(struct proc *p, struct vmspace *vmcopy)
4195 {
4196 	struct vmspace *oldvmspace = p->p_vmspace;
4197 	struct vmspace *newvmspace;
4198 	vm_map_t map = &p->p_vmspace->vm_map;
4199 
4200 	/*
4201 	 * If we are execing a resident vmspace we fork it, otherwise
4202 	 * we create a new vmspace.  Note that exitingcnt is not
4203 	 * copied to the new vmspace.
4204 	 */
4205 	lwkt_gettoken(&oldvmspace->vm_map.token);
4206 	if (vmcopy)  {
4207 		newvmspace = vmspace_fork(vmcopy, NULL, NULL);
4208 		lwkt_gettoken(&newvmspace->vm_map.token);
4209 	} else {
4210 		newvmspace = vmspace_alloc(vm_map_min(map), vm_map_max(map));
4211 		lwkt_gettoken(&newvmspace->vm_map.token);
4212 		bcopy(&oldvmspace->vm_startcopy, &newvmspace->vm_startcopy,
4213 		      (caddr_t)&oldvmspace->vm_endcopy -
4214 		       (caddr_t)&oldvmspace->vm_startcopy);
4215 	}
4216 
4217 	/*
4218 	 * Finish initializing the vmspace before assigning it
4219 	 * to the process.  The vmspace will become the current vmspace
4220 	 * if p == curproc.
4221 	 */
4222 	pmap_pinit2(vmspace_pmap(newvmspace));
4223 	pmap_replacevm(p, newvmspace, 0);
4224 	lwkt_reltoken(&newvmspace->vm_map.token);
4225 	lwkt_reltoken(&oldvmspace->vm_map.token);
4226 	vmspace_rel(oldvmspace);
4227 }
4228 
4229 /*
4230  * Unshare the specified VM space for forcing COW.  This
4231  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
4232  */
4233 void
4234 vmspace_unshare(struct proc *p)
4235 {
4236 	struct vmspace *oldvmspace = p->p_vmspace;
4237 	struct vmspace *newvmspace;
4238 
4239 	lwkt_gettoken(&oldvmspace->vm_map.token);
4240 	if (vmspace_getrefs(oldvmspace) == 1) {
4241 		lwkt_reltoken(&oldvmspace->vm_map.token);
4242 		return;
4243 	}
4244 	newvmspace = vmspace_fork(oldvmspace, NULL, NULL);
4245 	lwkt_gettoken(&newvmspace->vm_map.token);
4246 	pmap_pinit2(vmspace_pmap(newvmspace));
4247 	pmap_replacevm(p, newvmspace, 0);
4248 	lwkt_reltoken(&newvmspace->vm_map.token);
4249 	lwkt_reltoken(&oldvmspace->vm_map.token);
4250 	vmspace_rel(oldvmspace);
4251 }
4252 
4253 /*
4254  * vm_map_hint: return the beginning of the best area suitable for
4255  * creating a new mapping with "prot" protection.
4256  *
4257  * No requirements.
4258  */
4259 vm_offset_t
4260 vm_map_hint(struct proc *p, vm_offset_t addr, vm_prot_t prot)
4261 {
4262 	struct vmspace *vms = p->p_vmspace;
4263 	struct rlimit limit;
4264 	rlim_t dsiz;
4265 
4266 	/*
4267 	 * Acquire datasize limit for mmap() operation,
4268 	 * calculate nearest power of 2.
4269 	 */
4270 	if (kern_getrlimit(RLIMIT_DATA, &limit))
4271 		limit.rlim_cur = maxdsiz;
4272 	dsiz = limit.rlim_cur;
4273 
4274 	if (!randomize_mmap || addr != 0) {
4275 		/*
4276 		 * Set a reasonable start point for the hint if it was
4277 		 * not specified or if it falls within the heap space.
4278 		 * Hinted mmap()s do not allocate out of the heap space.
4279 		 */
4280 		if (addr == 0 ||
4281 		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
4282 		     addr < round_page((vm_offset_t)vms->vm_daddr + dsiz))) {
4283 			addr = round_page((vm_offset_t)vms->vm_daddr + dsiz);
4284 		}
4285 
4286 		return addr;
4287 	}
4288 
4289 	/*
4290 	 * randomize_mmap && addr == 0.  For now randomize the
4291 	 * address within a dsiz range beyond the data limit.
4292 	 */
4293 	addr = (vm_offset_t)vms->vm_daddr + dsiz;
4294 	if (dsiz)
4295 		addr += (karc4random64() & 0x7FFFFFFFFFFFFFFFLU) % dsiz;
4296 	return (round_page(addr));
4297 }
4298 
4299 /*
4300  * Finds the VM object, offset, and protection for a given virtual address
4301  * in the specified map, assuming a page fault of the type specified.
4302  *
4303  * Leaves the map in question locked for read; return values are guaranteed
4304  * until a vm_map_lookup_done call is performed.  Note that the map argument
4305  * is in/out; the returned map must be used in the call to vm_map_lookup_done.
4306  *
4307  * A handle (out_entry) is returned for use in vm_map_lookup_done, to make
4308  * that fast.
4309  *
4310  * If a lookup is requested with "write protection" specified, the map may
4311  * be changed to perform virtual copying operations, although the data
4312  * referenced will remain the same.
4313  *
4314  * No requirements.
4315  */
4316 int
4317 vm_map_lookup(vm_map_t *var_map,		/* IN/OUT */
4318 	      vm_offset_t vaddr,
4319 	      vm_prot_t fault_typea,
4320 	      vm_map_entry_t *out_entry,	/* OUT */
4321 	      struct vm_map_backing **bap,	/* OUT */
4322 	      vm_pindex_t *pindex,		/* OUT */
4323 	      vm_pindex_t *pcount,		/* OUT */
4324 	      vm_prot_t *out_prot,		/* OUT */
4325 	      int *wflags)			/* OUT */
4326 {
4327 	vm_map_entry_t entry;
4328 	vm_map_t map = *var_map;
4329 	vm_prot_t prot;
4330 	vm_prot_t fault_type = fault_typea;
4331 	int use_read_lock = 1;
4332 	int rv = KERN_SUCCESS;
4333 	int count;
4334 	thread_t td = curthread;
4335 
4336 	/*
4337 	 * vm_map_entry_reserve() implements an important mitigation
4338 	 * against mmap() span running the kernel out of vm_map_entry
4339 	 * structures, but it can also cause an infinite call recursion.
4340 	 * Use td_nest_count to prevent an infinite recursion (allows
4341 	 * the vm_map code to dig into the pcpu vm_map_entry reserve).
4342 	 */
4343 	count = 0;
4344 	if (td->td_nest_count == 0) {
4345 		++td->td_nest_count;
4346 		count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
4347 		--td->td_nest_count;
4348 	}
4349 RetryLookup:
4350 	if (use_read_lock)
4351 		vm_map_lock_read(map);
4352 	else
4353 		vm_map_lock(map);
4354 
4355 	/*
4356 	 * Always do a full lookup.  The hint doesn't get us much anymore
4357 	 * now that the map is RB'd.
4358 	 */
4359 	cpu_ccfence();
4360 	*out_entry = NULL;
4361 	*bap = NULL;
4362 
4363 	{
4364 		vm_map_entry_t tmp_entry;
4365 
4366 		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
4367 			rv = KERN_INVALID_ADDRESS;
4368 			goto done;
4369 		}
4370 		entry = tmp_entry;
4371 		*out_entry = entry;
4372 	}
4373 
4374 	/*
4375 	 * Handle submaps.
4376 	 */
4377 	if (entry->maptype == VM_MAPTYPE_SUBMAP) {
4378 		vm_map_t old_map = map;
4379 
4380 		*var_map = map = entry->ba.sub_map;
4381 		if (use_read_lock)
4382 			vm_map_unlock_read(old_map);
4383 		else
4384 			vm_map_unlock(old_map);
4385 		use_read_lock = 1;
4386 		goto RetryLookup;
4387 	}
4388 
4389 	/*
4390 	 * Check whether this task is allowed to have this page.
4391 	 * Note the special case for MAP_ENTRY_COW pages with an override.
4392 	 * This is to implement a forced COW for debuggers.
4393 	 */
4394 	if (fault_type & VM_PROT_OVERRIDE_WRITE)
4395 		prot = entry->max_protection;
4396 	else
4397 		prot = entry->protection;
4398 
4399 	fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
4400 	if ((fault_type & prot) != fault_type) {
4401 		rv = KERN_PROTECTION_FAILURE;
4402 		goto done;
4403 	}
4404 
4405 	if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
4406 	    (entry->eflags & MAP_ENTRY_COW) &&
4407 	    (fault_type & VM_PROT_WRITE) &&
4408 	    (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) {
4409 		rv = KERN_PROTECTION_FAILURE;
4410 		goto done;
4411 	}
4412 
4413 	/*
4414 	 * If this page is not pageable, we have to get it for all possible
4415 	 * accesses.
4416 	 */
4417 	*wflags = 0;
4418 	if (entry->wired_count) {
4419 		*wflags |= FW_WIRED;
4420 		prot = fault_type = entry->protection;
4421 	}
4422 
4423 	if (curthread->td_lwp && curthread->td_lwp->lwp_vmspace &&
4424 	    pmap_emulate_ad_bits(&curthread->td_lwp->lwp_vmspace->vm_pmap)) {
4425 		if ((prot & VM_PROT_WRITE) == 0)
4426 			fault_type |= VM_PROT_WRITE;
4427 	}
4428 
4429 	/*
4430 	 * Only NORMAL maps are object-based.  UKSMAPs are not.
4431 	 */
4432 	if (entry->maptype != VM_MAPTYPE_NORMAL) {
4433 		*bap = NULL;
4434 		goto skip;
4435 	}
4436 
4437 	/*
4438 	 * If the entry was copy-on-write, we either ...
4439 	 */
4440 	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4441 		/*
4442 		 * If we want to write the page, we may as well handle that
4443 		 * now since we've got the map locked.
4444 		 *
4445 		 * If we don't need to write the page, we just demote the
4446 		 * permissions allowed.
4447 		 */
4448 		if (fault_type & VM_PROT_WRITE) {
4449 			/*
4450 			 * Not allowed if TDF_NOFAULT is set as the shadowing
4451 			 * operation can deadlock against the faulting
4452 			 * function due to the copy-on-write.
4453 			 */
4454 			if (curthread->td_flags & TDF_NOFAULT) {
4455 				rv = KERN_FAILURE_NOFAULT;
4456 				goto done;
4457 			}
4458 
4459 			/*
4460 			 * Make a new vm_map_backing + object, and place it
4461 			 * in the object chain.  Note that no new references
4462 			 * have appeared -- one just moved from the map to
4463 			 * the new object.
4464 			 */
4465 			if (use_read_lock && vm_map_lock_upgrade(map)) {
4466 				/* lost lock */
4467 				use_read_lock = 0;
4468 				goto RetryLookup;
4469 			}
4470 			use_read_lock = 0;
4471 			vm_map_entry_shadow(entry);
4472 			*wflags |= FW_DIDCOW;
4473 		} else {
4474 			/*
4475 			 * We're attempting to read a copy-on-write page --
4476 			 * don't allow writes.
4477 			 */
4478 			prot &= ~VM_PROT_WRITE;
4479 		}
4480 	}
4481 
4482 	/*
4483 	 * Create an object if necessary.  This code also handles
4484 	 * partitioning large entries to improve vm_fault performance.
4485 	 */
4486 	if (entry->ba.object == NULL && !map->system_map) {
4487 		if (use_read_lock && vm_map_lock_upgrade(map))  {
4488 			/* lost lock */
4489 			use_read_lock = 0;
4490 			goto RetryLookup;
4491 		}
4492 		use_read_lock = 0;
4493 
4494 		/*
4495 		 * Partition large entries, giving each its own VM object,
4496 		 * to improve concurrent fault performance.  This is only
4497 		 * applicable to userspace.
4498 		 */
4499 		if (map != &kernel_map &&
4500 		    entry->maptype == VM_MAPTYPE_NORMAL &&
4501 		    ((entry->ba.start ^ entry->ba.end) &
4502 		     ~MAP_ENTRY_PARTITION_MASK) &&
4503 		    vm_map_partition_enable) {
4504 			if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
4505 				entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
4506 				++mycpu->gd_cnt.v_intrans_coll;
4507 				++mycpu->gd_cnt.v_intrans_wait;
4508 				vm_map_transition_wait(map, 0);
4509 				goto RetryLookup;
4510 			}
4511 			vm_map_entry_partition(map, entry, vaddr, &count);
4512 		}
4513 		vm_map_entry_allocate_object(entry);
4514 	}
4515 
4516 	/*
4517 	 * Return the object/offset from this entry.  If the entry was
4518 	 * copy-on-write or empty, it has been fixed up.
4519 	 */
4520 	*bap = &entry->ba;
4521 
4522 skip:
4523 	*pindex = OFF_TO_IDX((vaddr - entry->ba.start) + entry->ba.offset);
4524 	*pcount = OFF_TO_IDX(entry->ba.end - trunc_page(vaddr));
4525 
4526 	/*
4527 	 * Return whether this is the only map sharing this data.  On
4528 	 * success we return with a read lock held on the map.  On failure
4529 	 * we return with the map unlocked.
4530 	 */
4531 	*out_prot = prot;
4532 done:
4533 	if (rv == KERN_SUCCESS) {
4534 		if (use_read_lock == 0)
4535 			vm_map_lock_downgrade(map);
4536 	} else if (use_read_lock) {
4537 		vm_map_unlock_read(map);
4538 	} else {
4539 		vm_map_unlock(map);
4540 	}
4541 	if (count > 0)
4542 		vm_map_entry_release(count);
4543 
4544 	return (rv);
4545 }
4546 
4547 /*
4548  * Releases locks acquired by a vm_map_lookup()
4549  * (according to the handle returned by that lookup).
4550  *
4551  * No other requirements.
4552  */
4553 void
4554 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry, int count)
4555 {
4556 	/*
4557 	 * Unlock the main-level map
4558 	 */
4559 	vm_map_unlock_read(map);
4560 	if (count)
4561 		vm_map_entry_release(count);
4562 }
4563 
4564 static void
4565 vm_map_entry_partition(vm_map_t map, vm_map_entry_t entry,
4566 		       vm_offset_t vaddr, int *countp)
4567 {
4568 	vaddr &= ~MAP_ENTRY_PARTITION_MASK;
4569 	vm_map_clip_start(map, entry, vaddr, countp);
4570 	vaddr += MAP_ENTRY_PARTITION_SIZE;
4571 	vm_map_clip_end(map, entry, vaddr, countp);
4572 }
4573 
4574 /*
4575  * Quick hack, needs some help to make it more SMP friendly.
4576  */
4577 void
4578 vm_map_interlock(vm_map_t map, struct vm_map_ilock *ilock,
4579 		 vm_offset_t ran_beg, vm_offset_t ran_end)
4580 {
4581 	struct vm_map_ilock *scan;
4582 
4583 	ilock->ran_beg = ran_beg;
4584 	ilock->ran_end = ran_end;
4585 	ilock->flags = 0;
4586 
4587 	spin_lock(&map->ilock_spin);
4588 restart:
4589 	for (scan = map->ilock_base; scan; scan = scan->next) {
4590 		if (ran_end > scan->ran_beg && ran_beg < scan->ran_end) {
4591 			scan->flags |= ILOCK_WAITING;
4592 			ssleep(scan, &map->ilock_spin, 0, "ilock", 0);
4593 			goto restart;
4594 		}
4595 	}
4596 	ilock->next = map->ilock_base;
4597 	map->ilock_base = ilock;
4598 	spin_unlock(&map->ilock_spin);
4599 }
4600 
4601 void
4602 vm_map_deinterlock(vm_map_t map, struct  vm_map_ilock *ilock)
4603 {
4604 	struct vm_map_ilock *scan;
4605 	struct vm_map_ilock **scanp;
4606 
4607 	spin_lock(&map->ilock_spin);
4608 	scanp = &map->ilock_base;
4609 	while ((scan = *scanp) != NULL) {
4610 		if (scan == ilock) {
4611 			*scanp = ilock->next;
4612 			spin_unlock(&map->ilock_spin);
4613 			if (ilock->flags & ILOCK_WAITING)
4614 				wakeup(ilock);
4615 			return;
4616 		}
4617 		scanp = &scan->next;
4618 	}
4619 	spin_unlock(&map->ilock_spin);
4620 	panic("vm_map_deinterlock: missing ilock!");
4621 }
4622 
4623 #include "opt_ddb.h"
4624 #ifdef DDB
4625 #include <ddb/ddb.h>
4626 
4627 /*
4628  * Debugging only
4629  */
4630 DB_SHOW_COMMAND(map, vm_map_print)
4631 {
4632 	static int nlines;
4633 	/* XXX convert args. */
4634 	vm_map_t map = (vm_map_t)addr;
4635 	boolean_t full = have_addr;
4636 
4637 	vm_map_entry_t entry;
4638 
4639 	db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
4640 	    (void *)map,
4641 	    (void *)map->pmap, map->nentries, map->timestamp);
4642 	nlines++;
4643 
4644 	if (!full && db_indent)
4645 		return;
4646 
4647 	db_indent += 2;
4648 	RB_FOREACH(entry, vm_map_rb_tree, &map->rb_root) {
4649 		db_iprintf("map entry %p: start=%p, end=%p\n",
4650 		    (void *)entry,
4651 		    (void *)entry->ba.start, (void *)entry->ba.end);
4652 		nlines++;
4653 		{
4654 			static char *inheritance_name[4] =
4655 			{"share", "copy", "none", "donate_copy"};
4656 
4657 			db_iprintf(" prot=%x/%x/%s",
4658 			    entry->protection,
4659 			    entry->max_protection,
4660 			    inheritance_name[(int)(unsigned char)
4661 						entry->inheritance]);
4662 			if (entry->wired_count != 0)
4663 				db_printf(", wired");
4664 		}
4665 		switch(entry->maptype) {
4666 		case VM_MAPTYPE_SUBMAP:
4667 			/* XXX no %qd in kernel.  Truncate entry->ba.offset. */
4668 			db_printf(", share=%p, offset=0x%lx\n",
4669 			    (void *)entry->ba.sub_map,
4670 			    (long)entry->ba.offset);
4671 			nlines++;
4672 
4673 			db_indent += 2;
4674 			vm_map_print((db_expr_t)(intptr_t)entry->ba.sub_map,
4675 				     full, 0, NULL);
4676 			db_indent -= 2;
4677 			break;
4678 		case VM_MAPTYPE_NORMAL:
4679 			/* XXX no %qd in kernel.  Truncate entry->ba.offset. */
4680 			db_printf(", object=%p, offset=0x%lx",
4681 			    (void *)entry->ba.object,
4682 			    (long)entry->ba.offset);
4683 			if (entry->eflags & MAP_ENTRY_COW)
4684 				db_printf(", copy (%s)",
4685 				    ((entry->eflags & MAP_ENTRY_NEEDS_COPY) ?
4686 				     "needed" : "done"));
4687 			db_printf("\n");
4688 			nlines++;
4689 
4690 			if (entry->ba.object) {
4691 				db_indent += 2;
4692 				vm_object_print((db_expr_t)(intptr_t)
4693 						entry->ba.object,
4694 						full, 0, NULL);
4695 				nlines += 4;
4696 				db_indent -= 2;
4697 			}
4698 			break;
4699 		case VM_MAPTYPE_UKSMAP:
4700 			db_printf(", uksmap=%p, offset=0x%lx",
4701 			    (void *)entry->ba.uksmap,
4702 			    (long)entry->ba.offset);
4703 			if (entry->eflags & MAP_ENTRY_COW)
4704 				db_printf(", copy (%s)",
4705 				    (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
4706 			db_printf("\n");
4707 			nlines++;
4708 			break;
4709 		default:
4710 			break;
4711 		}
4712 	}
4713 	db_indent -= 2;
4714 	if (db_indent == 0)
4715 		nlines = 0;
4716 }
4717 
4718 /*
4719  * Debugging only
4720  */
4721 DB_SHOW_COMMAND(procvm, procvm)
4722 {
4723 	struct proc *p;
4724 
4725 	if (have_addr) {
4726 		p = (struct proc *) addr;
4727 	} else {
4728 		p = curproc;
4729 	}
4730 
4731 	db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
4732 	    (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
4733 	    (void *)vmspace_pmap(p->p_vmspace));
4734 
4735 	vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
4736 }
4737 
4738 #endif /* DDB */
4739