xref: /dragonfly/sys/vm/vm_map.c (revision 8f2ce533)
1 /*
2  * Copyright (c) 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * Copyright (c) 2003-2022 The DragonFly Project.  All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * The Mach Operating System project at Carnegie-Mellon University.
8  *
9  * This code is derived from software contributed to The DragonFly Project
10  * by Matthew Dillon <dillon@backplane.com>
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	from: @(#)vm_map.c	8.3 (Berkeley) 1/12/94
37  *
38  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
39  * All rights reserved.
40  *
41  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
42  *
43  * Permission to use, copy, modify and distribute this software and
44  * its documentation is hereby granted, provided that both the copyright
45  * notice and this permission notice appear in all copies of the
46  * software, derivative works or modified versions, and any portions
47  * thereof, and that both notices appear in supporting documentation.
48  *
49  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
50  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
51  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
52  *
53  * Carnegie Mellon requests users of this software to return to
54  *
55  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
56  *  School of Computer Science
57  *  Carnegie Mellon University
58  *  Pittsburgh PA 15213-3890
59  *
60  * any improvements or extensions that they make and grant Carnegie the
61  * rights to redistribute these changes.
62  */
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/kernel.h>
66 #include <sys/proc.h>
67 #include <sys/serialize.h>
68 #include <sys/lock.h>
69 #include <sys/vmmeter.h>
70 #include <sys/mman.h>
71 #include <sys/vnode.h>
72 #include <sys/resourcevar.h>
73 #include <sys/shm.h>
74 #include <sys/tree.h>
75 #include <sys/malloc.h>
76 #include <sys/objcache.h>
77 #include <sys/kern_syscall.h>
78 
79 #include <vm/vm.h>
80 #include <vm/vm_param.h>
81 #include <vm/pmap.h>
82 #include <vm/vm_map.h>
83 #include <vm/vm_page.h>
84 #include <vm/vm_object.h>
85 #include <vm/vm_pager.h>
86 #include <vm/vm_kern.h>
87 #include <vm/vm_extern.h>
88 #include <vm/swap_pager.h>
89 #include <vm/vm_zone.h>
90 
91 #include <sys/random.h>
92 #include <sys/sysctl.h>
93 #include <sys/spinlock.h>
94 
95 #include <sys/thread2.h>
96 #include <sys/spinlock2.h>
97 
98 /*
99  * Virtual memory maps provide for the mapping, protection, and sharing
100  * of virtual memory objects.  In addition, this module provides for an
101  * efficient virtual copy of memory from one map to another.
102  *
103  * Synchronization is required prior to most operations.
104  *
105  * Maps consist of an ordered doubly-linked list of simple entries.
106  * A hint and an RB tree are used to speed up lookups.
107  *
108  * Callers looking to modify maps specify start/end addresses which cause
109  * the related map entry to be clipped if necessary, and then later
110  * recombined if the pieces remain compatible.
111  *
112  * Virtual copy operations are performed by copying VM object references
113  * from one map to another, and then marking both regions as copy-on-write.
114  */
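/*
 * Illustrative sketch (not part of the original file): with the map held,
 * the entry tree can be walked with the RB macros, the same way
 * vmspace_swap_count() below iterates all entries:
 *
 *	vm_map_entry_t cur;
 *
 *	RB_FOREACH(cur, vm_map_rb_tree, &map->rb_root) {
 *		... inspect cur->ba.start, cur->ba.end, cur->ba.object ...
 *	}
 */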
115 static boolean_t vmspace_ctor(void *obj, void *privdata, int ocflags);
116 static void vmspace_dtor(void *obj, void *privdata);
117 static void vmspace_terminate(struct vmspace *vm, int final);
118 
119 MALLOC_DEFINE(M_VMSPACE, "vmspace", "vmspace objcache backingstore");
120 MALLOC_DEFINE(M_MAP_BACKING, "map_backing", "vm_map_backing to entry");
121 static struct objcache *vmspace_cache;
122 
123 /*
124  * per-cpu page table cross mappings are initialized in early boot
125  * and might require a considerable number of vm_map_entry structures.
126  */
127 #define MAPENTRYBSP_CACHE	(MAXCPU+1)
128 #define MAPENTRYAP_CACHE	8
129 
130 /*
131  * Partitioning threaded programs with large anonymous memory areas can
132  * improve concurrent fault performance.
133  */
134 #define MAP_ENTRY_PARTITION_SIZE	((vm_offset_t)(32 * 1024 * 1024))
135 #define MAP_ENTRY_PARTITION_MASK	(MAP_ENTRY_PARTITION_SIZE - 1)
136 
137 #define VM_MAP_ENTRY_WITHIN_PARTITION(entry)	\
138 	((((entry)->ba.start ^ (entry)->ba.end) & ~MAP_ENTRY_PARTITION_MASK) == 0)
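/*
 * Worked example (illustrative arithmetic only): with the 32MB partition
 * size the mask is 0x01FFFFFF, so an entry with ba.start = 0x01000000 and
 * ba.end = 0x01FFF000 XORs to 0x00FFF000 and lies within one partition,
 * while ba.start = 0x01F00000 and ba.end = 0x02100000 XOR to 0x03E00000
 * and cross a partition boundary (the macro evaluates false).
 */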
139 
140 static struct vm_zone mapentzone_store;
141 __read_mostly static vm_zone_t mapentzone;
142 
143 static struct vm_map_entry map_entry_init[MAX_MAPENT];
144 static struct vm_map_entry cpu_map_entry_init_bsp[MAPENTRYBSP_CACHE];
145 static struct vm_map_entry cpu_map_entry_init_ap[MAXCPU][MAPENTRYAP_CACHE];
146 
147 __read_mostly static int randomize_mmap;
148 SYSCTL_INT(_vm, OID_AUTO, randomize_mmap, CTLFLAG_RW, &randomize_mmap, 0,
149     "Randomize mmap offsets");
150 __read_mostly static int vm_map_relock_enable = 1;
151 SYSCTL_INT(_vm, OID_AUTO, map_relock_enable, CTLFLAG_RW,
152 	   &vm_map_relock_enable, 0, "insert pop pgtable optimization");
153 __read_mostly static int vm_map_partition_enable = 1;
154 SYSCTL_INT(_vm, OID_AUTO, map_partition_enable, CTLFLAG_RW,
155 	   &vm_map_partition_enable, 0, "Break up larger vm_map_entry's");
156 __read_mostly static int vm_map_backing_limit = 5;
157 SYSCTL_INT(_vm, OID_AUTO, map_backing_limit, CTLFLAG_RW,
158 	   &vm_map_backing_limit, 0, "ba.backing_ba link depth");
159 __read_mostly static int vm_map_backing_shadow_test = 1;
160 SYSCTL_INT(_vm, OID_AUTO, map_backing_shadow_test, CTLFLAG_RW,
161 	   &vm_map_backing_shadow_test, 0, "ba.object shadow test");
162 
163 static void vmspace_drop_notoken(struct vmspace *vm);
164 static void vm_map_entry_shadow(vm_map_entry_t entry);
165 static vm_map_entry_t vm_map_entry_create(int *);
166 static void vm_map_entry_dispose (vm_map_t map, vm_map_entry_t entry, int *);
167 static void vm_map_entry_dispose_ba (vm_map_entry_t entry, vm_map_backing_t ba);
168 static void vm_map_backing_replicated(vm_map_t map,
169 		vm_map_entry_t entry, int flags);
170 static void vm_map_backing_adjust_start(vm_map_entry_t entry,
171 		vm_ooffset_t start);
172 static void vm_map_backing_adjust_end(vm_map_entry_t entry,
173 		vm_ooffset_t end);
174 static void vm_map_backing_attach (vm_map_entry_t entry, vm_map_backing_t ba);
175 static void vm_map_backing_detach (vm_map_entry_t entry, vm_map_backing_t ba);
176 static void _vm_map_clip_end (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
177 static void _vm_map_clip_start (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
178 static void vm_map_entry_delete (vm_map_t, vm_map_entry_t, int *);
179 static void vm_map_entry_unwire (vm_map_t, vm_map_entry_t);
180 static void vm_map_copy_entry (vm_map_t, vm_map_t, vm_map_entry_t,
181 		vm_map_entry_t);
182 static void vm_map_unclip_range (vm_map_t map, vm_map_entry_t start_entry,
183 		vm_offset_t start, vm_offset_t end, int *countp, int flags);
184 static void vm_map_entry_partition(vm_map_t map, vm_map_entry_t entry,
185 		vm_offset_t vaddr, int *countp);
186 
187 #define MAP_BACK_CLIPPED	0x0001
188 #define MAP_BACK_BASEOBJREFD	0x0002
189 
190 /*
191  * Initialize the vm_map module.  Must be called before any other vm_map
192  * routines.
193  *
194  * Map and entry structures are allocated from the general purpose
195  * memory pool with some exceptions:
196  *
197  *	- The kernel map is allocated statically.
198  *	- Initial kernel map entries are allocated out of a static pool.
199  *	- We must set ZONE_SPECIAL here or the early boot code can get
200  *	  stuck if there are >63 cores.
201  *
202  *	These restrictions are necessary since malloc() uses the
203  *	maps and requires map entries.
204  *
205  * Called from the low level boot code only.
206  */
207 void
208 vm_map_startup(void)
209 {
210 	mapentzone = &mapentzone_store;
211 	zbootinit(mapentzone, "MAP ENTRY", sizeof (struct vm_map_entry),
212 		  map_entry_init, MAX_MAPENT);
213 	mapentzone_store.zflags |= ZONE_SPECIAL;
214 }
215 
216 /*
217  * Called prior to any vmspace allocations.
218  *
219  * Called from the low level boot code only.
220  */
221 void
222 vm_init2(void)
223 {
224 	vmspace_cache = objcache_create_mbacked(M_VMSPACE,
225 						sizeof(struct vmspace),
226 						0, ncpus * 4,
227 						vmspace_ctor, vmspace_dtor,
228 						NULL);
229 	zinitna(mapentzone, NULL, 0, 0, ZONE_USE_RESERVE | ZONE_SPECIAL);
230 	pmap_init2();
231 	vm_object_init2();
232 }
233 
234 /*
235  * objcache support.  We leave the pmap root cached as long as possible
236  * for performance reasons.
237  */
238 static
239 boolean_t
240 vmspace_ctor(void *obj, void *privdata, int ocflags)
241 {
242 	struct vmspace *vm = obj;
243 
244 	bzero(vm, sizeof(*vm));
245 	vm->vm_refcnt = VM_REF_DELETED;
246 
247 	return 1;
248 }
249 
250 static
251 void
252 vmspace_dtor(void *obj, void *privdata)
253 {
254 	struct vmspace *vm = obj;
255 
256 	KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
257 	pmap_puninit(vmspace_pmap(vm));
258 }
259 
260 /*
261  * Red black tree functions
262  *
263  * The caller must hold the related map lock.
264  */
265 static int rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b);
266 RB_GENERATE(vm_map_rb_tree, vm_map_entry, rb_entry, rb_vm_map_compare);
267 
268 /* a->ba.start is address, and the only field which must be initialized */
269 static int
270 rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b)
271 {
272 	if (a->ba.start < b->ba.start)
273 		return(-1);
274 	else if (a->ba.start > b->ba.start)
275 		return(1);
276 	return(0);
277 }
278 
279 /*
280  * Initialize vmspace ref/hold counts for vmspace0.  There is a holdcnt for
281  * every refcnt.
282  */
283 void
284 vmspace_initrefs(struct vmspace *vm)
285 {
286 	vm->vm_refcnt = 1;
287 	vm->vm_holdcnt = 1;
288 }
289 
290 /*
291  * Allocate a vmspace structure, including a vm_map and pmap.
292  * Initialize numerous fields.  While the initial allocation is zeroed,
293  * subsequent reuse from the objcache leaves elements of the structure
294  * intact (particularly the pmap), so portions must be zeroed.
295  *
296  * Returns a referenced vmspace.
297  *
298  * No requirements.
299  */
300 struct vmspace *
301 vmspace_alloc(vm_offset_t min, vm_offset_t max)
302 {
303 	struct vmspace *vm;
304 
305 	vm = objcache_get(vmspace_cache, M_WAITOK);
306 
307 	bzero(&vm->vm_startcopy,
308 	      (char *)&vm->vm_endcopy - (char *)&vm->vm_startcopy);
309 	vm_map_init(&vm->vm_map, min, max, NULL);	/* initializes token */
310 
311 	/*
312 	 * NOTE: hold acquires the token for safety.
313 	 *
314 	 * On return vmspace is referenced (refs=1, hold=1).  That is,
315 	 * each refcnt also has a holdcnt.  There can be additional holds
316 	 * (holdcnt) above and beyond the refcnt.  Finalization is handled in
317 	 * two stages, one on refs 1->0, and the second on hold 1->0.
318 	 */
319 	KKASSERT(vm->vm_holdcnt == 0);
320 	KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
321 	vmspace_initrefs(vm);
322 	vmspace_hold(vm);
323 	pmap_pinit(vmspace_pmap(vm));		/* (some fields reused) */
324 	vm->vm_map.pmap = vmspace_pmap(vm);	/* XXX */
325 	vm->vm_shm = NULL;
326 	vm->vm_flags = 0;
327 	cpu_vmspace_alloc(vm);
328 	vmspace_drop(vm);
329 
330 	return (vm);
331 }
332 
333 /*
334  * NOTE: Can return 0 if the vmspace is exiting.
335  */
336 int
337 vmspace_getrefs(struct vmspace *vm)
338 {
339 	int32_t n;
340 
341 	n = vm->vm_refcnt;
342 	cpu_ccfence();
343 	if (n & VM_REF_DELETED)
344 		n = -1;
345 	return n;
346 }
347 
348 void
349 vmspace_hold(struct vmspace *vm)
350 {
351 	atomic_add_int(&vm->vm_holdcnt, 1);
352 	lwkt_gettoken(&vm->vm_map.token);
353 }
354 
355 /*
356  * Drop with final termination interlock.
357  */
358 void
359 vmspace_drop(struct vmspace *vm)
360 {
361 	lwkt_reltoken(&vm->vm_map.token);
362 	vmspace_drop_notoken(vm);
363 }
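/*
 * Usage sketch (illustrative, mirroring vmspace_swap_count() and
 * vmspace_anonymous_count() below): foreign accessors pair hold/drop
 * around their use of the vmspace rather than taking a ref:
 *
 *	vmspace_hold(vm);
 *	... examine vm->vm_map while holding the map token ...
 *	vmspace_drop(vm);
 */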
364 
365 static void
366 vmspace_drop_notoken(struct vmspace *vm)
367 {
368 	if (atomic_fetchadd_int(&vm->vm_holdcnt, -1) == 1) {
369 		if (vm->vm_refcnt & VM_REF_DELETED)
370 			vmspace_terminate(vm, 1);
371 	}
372 }
373 
374 /*
375  * A vmspace object must not be in a terminated state to be able to obtain
376  * additional refs on it.
377  *
378  * These are official references to the vmspace; the count is used to check
379  * for vmspace sharing.  Foreign accessors should use 'hold' and not 'ref'.
380  *
381  * XXX we need to combine hold & ref together into one 64-bit field to allow
382  * holds to prevent stage-1 termination.
383  */
384 void
385 vmspace_ref(struct vmspace *vm)
386 {
387 	uint32_t n;
388 
389 	atomic_add_int(&vm->vm_holdcnt, 1);
390 	n = atomic_fetchadd_int(&vm->vm_refcnt, 1);
391 	KKASSERT((n & VM_REF_DELETED) == 0);
392 }
393 
394 /*
395  * Release a ref on the vmspace.  On the 1->0 transition we do stage-1
396  * termination of the vmspace.  Then, on the final drop of the hold we
397  * will do stage-2 final termination.
398  */
399 void
400 vmspace_rel(struct vmspace *vm)
401 {
402 	uint32_t n;
403 
404 	/*
405 	 * Drop refs.  Each ref also has a hold which is also dropped.
406 	 *
407 	 * When refs hits 0, compete to get the VM_REF_DELETED flag (the hold
408 	 * prevents finalization) to start termination processing.
409 	 * Finalization occurs when the last hold count drops to 0.
410 	 */
411 	n = atomic_fetchadd_int(&vm->vm_refcnt, -1) - 1;
412 	while (n == 0) {
413 		if (atomic_cmpset_int(&vm->vm_refcnt, 0, VM_REF_DELETED)) {
414 			vmspace_terminate(vm, 0);
415 			break;
416 		}
417 		n = vm->vm_refcnt;
418 		cpu_ccfence();
419 	}
420 	vmspace_drop_notoken(vm);
421 }
422 
423 /*
424  * This is called during exit indicating that the vmspace is no
425  * longer in use by an exiting process, but the process has not yet
426  * been reaped.
427  *
428  * We drop refs, allowing for stage-1 termination, but maintain a holdcnt
429  * to prevent stage-2 until the process is reaped.  Note the order of
430  * operations: we must hold first.
431  *
432  * No requirements.
433  */
434 void
435 vmspace_relexit(struct vmspace *vm)
436 {
437 	atomic_add_int(&vm->vm_holdcnt, 1);
438 	vmspace_rel(vm);
439 }
440 
441 /*
442  * Called during reap to disconnect the remainder of the vmspace from
443  * the process.  On the hold drop the vmspace termination is finalized.
444  *
445  * No requirements.
446  */
447 void
448 vmspace_exitfree(struct proc *p)
449 {
450 	struct vmspace *vm;
451 
452 	vm = p->p_vmspace;
453 	p->p_vmspace = NULL;
454 	vmspace_drop_notoken(vm);
455 }
456 
457 /*
458  * Called in two cases:
459  *
460  * (1) When the last refcnt is dropped and the vmspace becomes inactive,
461  *     called with final == 0.  refcnt will have VM_REF_DELETED set at
462  *     this point, and holdcnt will still be non-zero.
463  *
464  * (2) When holdcnt becomes 0, called with final == 1.  There should no
465  *     longer be anyone with access to the vmspace.
466  *
467  * VMSPACE_EXIT1 flags the primary deactivation
468  * VMSPACE_EXIT2 flags the last reap
469  */
470 static void
471 vmspace_terminate(struct vmspace *vm, int final)
472 {
473 	int count;
474 
475 	lwkt_gettoken(&vm->vm_map.token);
476 	if (final == 0) {
477 		KKASSERT((vm->vm_flags & VMSPACE_EXIT1) == 0);
478 		vm->vm_flags |= VMSPACE_EXIT1;
479 
480 		/*
481 		 * Get rid of most of the resources.  Leave the kernel pmap
482 		 * intact.
483 		 *
484 		 * If the pmap does not contain wired pages we can bulk-delete
485 		 * the pmap as a performance optimization before removing the
486 		 * related mappings.
487 		 *
488 		 * If the pmap contains wired pages we cannot do this
489 		 * pre-optimization because currently vm_fault_unwire()
490 		 * expects the pmap pages to exist and will not decrement
491 		 * p->wire_count if they do not.
492 		 */
493 		shmexit(vm);
494 		if (vmspace_pmap(vm)->pm_stats.wired_count) {
495 			vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
496 				      VM_MAX_USER_ADDRESS);
497 			pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
498 					  VM_MAX_USER_ADDRESS);
499 		} else {
500 			pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
501 					  VM_MAX_USER_ADDRESS);
502 			vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
503 				      VM_MAX_USER_ADDRESS);
504 		}
505 		lwkt_reltoken(&vm->vm_map.token);
506 	} else {
507 		KKASSERT((vm->vm_flags & VMSPACE_EXIT1) != 0);
508 		KKASSERT((vm->vm_flags & VMSPACE_EXIT2) == 0);
509 
510 		/*
511 		 * Get rid of remaining basic resources.
512 		 */
513 		vm->vm_flags |= VMSPACE_EXIT2;
514 		shmexit(vm);
515 
516 		count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
517 		vm_map_lock(&vm->vm_map);
518 		cpu_vmspace_free(vm);
519 
520 		/*
521 		 * Lock the map, to wait out all other references to it.
522 		 * Delete all of the mappings and pages they hold, then call
523 		 * the pmap module to reclaim anything left.
524 		 */
525 		vm_map_delete(&vm->vm_map,
526 			      vm_map_min(&vm->vm_map),
527 			      vm_map_max(&vm->vm_map),
528 			      &count);
529 		vm_map_unlock(&vm->vm_map);
530 		vm_map_entry_release(count);
531 
532 		pmap_release(vmspace_pmap(vm));
533 		lwkt_reltoken(&vm->vm_map.token);
534 		objcache_put(vmspace_cache, vm);
535 	}
536 }
537 
538 /*
539  * Swap usage is determined by taking the proportional swap used by
540  * VM objects backing the VM map.  To make up for fractional losses,
541  * if the VM object has any swap use at all, the associated map entries
542  * count for at least 1 swap page.
543  *
544  * No requirements.
545  */
546 vm_offset_t
547 vmspace_swap_count(struct vmspace *vm)
548 {
549 	vm_map_t map = &vm->vm_map;
550 	vm_map_entry_t cur;
551 	vm_object_t object;
552 	vm_offset_t count = 0;
553 	vm_offset_t n;
554 
555 	vmspace_hold(vm);
556 
557 	RB_FOREACH(cur, vm_map_rb_tree, &map->rb_root) {
558 		switch(cur->maptype) {
559 		case VM_MAPTYPE_NORMAL:
560 			if ((object = cur->ba.object) == NULL)
561 				break;
562 			if (object->swblock_count) {
563 				n = (cur->ba.end - cur->ba.start) / PAGE_SIZE;
564 				count += object->swblock_count *
565 				    SWAP_META_PAGES * n / object->size + 1;
566 			}
567 			break;
568 		default:
569 			break;
570 		}
571 	}
572 	vmspace_drop(vm);
573 
574 	return(count);
575 }
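/*
 * Worked example of the charge above (illustrative arithmetic only): for an
 * object of size 1000 pages whose swblock_count * SWAP_META_PAGES works out
 * to 64 swapped pages, an entry mapping 250 pages (n = 250) is charged
 * 64 * 250 / 1000 + 1 = 17 swap pages.
 */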
576 
577 /*
578  * Calculate the approximate number of anonymous pages in use by
579  * this vmspace.  To make up for fractional losses, we count each
580  * VM object as having at least 1 anonymous page.
581  *
582  * No requirements.
583  */
584 vm_offset_t
585 vmspace_anonymous_count(struct vmspace *vm)
586 {
587 	vm_map_t map = &vm->vm_map;
588 	vm_map_entry_t cur;
589 	vm_object_t object;
590 	vm_offset_t count = 0;
591 
592 	vmspace_hold(vm);
593 	RB_FOREACH(cur, vm_map_rb_tree, &map->rb_root) {
594 		switch(cur->maptype) {
595 		case VM_MAPTYPE_NORMAL:
596 			if ((object = cur->ba.object) == NULL)
597 				break;
598 			if (object->type != OBJT_DEFAULT &&
599 			    object->type != OBJT_SWAP) {
600 				break;
601 			}
602 			count += object->resident_page_count;
603 			break;
604 		default:
605 			break;
606 		}
607 	}
608 	vmspace_drop(vm);
609 
610 	return(count);
611 }
612 
613 /*
614  * Initialize an existing vm_map structure such as that in the vmspace
615  * structure.  The pmap is initialized elsewhere.
616  *
617  * No requirements.
618  */
619 void
620 vm_map_init(struct vm_map *map, vm_offset_t min_addr, vm_offset_t max_addr,
621 	    pmap_t pmap)
622 {
623 	RB_INIT(&map->rb_root);
624 	spin_init(&map->ilock_spin, "ilock");
625 	map->ilock_base = NULL;
626 	map->nentries = 0;
627 	map->size = 0;
628 	map->system_map = 0;
629 	vm_map_min(map) = min_addr;
630 	vm_map_max(map) = max_addr;
631 	map->pmap = pmap;
632 	map->timestamp = 0;
633 	map->flags = 0;
634 	bzero(&map->freehint, sizeof(map->freehint));
635 	lwkt_token_init(&map->token, "vm_map");
636 	lockinit(&map->lock, "vm_maplk", (hz + 9) / 10, 0);
637 }
638 
639 /*
640  * Find the first possible free address for the specified request length.
641  * Returns 0 if we don't have one cached.
642  */
643 static
644 vm_offset_t
645 vm_map_freehint_find(vm_map_t map, vm_size_t length, vm_size_t align)
646 {
647 	vm_map_freehint_t *scan;
648 
649 	scan = &map->freehint[0];
650 	while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
651 		if (scan->length == length && scan->align == align)
652 			return(scan->start);
653 		++scan;
654 	}
655 	return 0;
656 }
657 
658 /*
659  * Unconditionally set the freehint.  Called by vm_map_findspace() after
660  * it finds an address.  This will help us iterate optimally on the next
661  * similar findspace.
662  */
663 static
664 void
665 vm_map_freehint_update(vm_map_t map, vm_offset_t start,
666 		       vm_size_t length, vm_size_t align)
667 {
668 	vm_map_freehint_t *scan;
669 
670 	scan = &map->freehint[0];
671 	while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
672 		if (scan->length == length && scan->align == align) {
673 			scan->start = start;
674 			return;
675 		}
676 		++scan;
677 	}
678 	scan = &map->freehint[map->freehint_newindex & VM_MAP_FFMASK];
679 	scan->start = start;
680 	scan->align = align;
681 	scan->length = length;
682 	++map->freehint_newindex;
683 }
684 
685 /*
686  * Update any existing freehints (for any alignment), for the hole we just
687  * added.
688  */
689 static
690 void
691 vm_map_freehint_hole(vm_map_t map, vm_offset_t start, vm_size_t length)
692 {
693 	vm_map_freehint_t *scan;
694 
695 	scan = &map->freehint[0];
696 	while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
697 		if (scan->length <= length && scan->start > start)
698 			scan->start = start;
699 		++scan;
700 	}
701 }
702 
703 /*
704  * This function handles MAP_ENTRY_NEEDS_COPY by inserting a fronting
705  * object in the entry for COW faults.
706  *
707  * The entire chain including entry->ba (prior to inserting the fronting
708  * object) essentially becomes set in stone... elements of it can be paged
709  * in or out, but cannot be further modified.
710  *
711  * NOTE: If we do not optimize the backing chain then a unique copy is not
712  *	 needed.  Note, however, that because portions of the chain are
713  *	 shared across pmaps we cannot make any changes to the vm_map_backing
714  *	 elements themselves.
715  *
716  * If the map segment is governed by a virtual page table then it is
717  * possible to address offsets beyond the mapped area.  Just allocate
718  * a maximally sized object for this case.
719  *
720  * If addref is non-zero an additional reference is added to the returned
721  * entry.  This mechanic exists because the additional reference might have
722  * to be added atomically and not after return to prevent a premature
723  * collapse.  XXX currently there is no collapse code.
724  *
725  * The vm_map must be exclusively locked.
726  * No other requirements.
727  */
728 static
729 void
730 vm_map_entry_shadow(vm_map_entry_t entry)
731 {
732 	vm_map_backing_t ba;
733 	vm_size_t length;
734 	vm_object_t source;
735 	vm_object_t result;
736 
737 	/*
738 	 * Number of bytes we have to shadow
739 	 */
740 	length = atop(entry->ba.end - entry->ba.start);
741 
742 	/*
743 	 * Don't create the new object if the old object isn't shared.
744 	 * This case occurs quite often when programs fork/exec/wait.
745 	 *
746 	 * Caller ensures source exists (all backing_ba's must have objects),
747 	 * typically indirectly by virtue of the NEEDS_COPY flag being set.
748 	 * We have a ref on source by virtue of the entry and do not need
749 	 * to lock it to do this test.
750 	 */
751 	source = entry->ba.object;
752 	KKASSERT(source);
753 
754 	if (source->type != OBJT_VNODE) {
755 		if (source->ref_count == 1 &&
756 		    source->handle == NULL &&
757 		    (source->type == OBJT_DEFAULT ||
758 		     source->type == OBJT_SWAP)) {
759 			goto done;
760 		}
761 	}
762 	ba = kmalloc(sizeof(*ba), M_MAP_BACKING, M_INTWAIT); /* copied later */
763 	vm_object_hold_shared(source);
764 
765 	/*
766 	 * Once it becomes part of a backing_ba chain it can wind up anywhere,
767 	 * drop the ONEMAPPING flag now.
768 	 */
769 	vm_object_clear_flag(source, OBJ_ONEMAPPING);
770 
771 	/*
772 	 * Allocate a new object with the given length.  The new object
773 	 * is returned referenced but we may have to add another one.
774 	 * If we are adding a second reference we must clear OBJ_ONEMAPPING.
775 	 * (typically because the caller is about to clone a vm_map_entry).
776 	 *
777 	 * The source object currently has an extra reference to prevent
778 	 * collapses into it while we mess with its shadow list, which
779 	 * we will remove later in this routine.
780 	 *
781 	 * The target object may require a second reference if asked for one
782 	 * by the caller.
783 	 */
784 	result = vm_object_allocate_hold(OBJT_DEFAULT, length);
785 	if (result == NULL)
786 		panic("vm_object_shadow: no object for shadowing");
787 
788 	/*
789 	 * The new object shadows the source object.
790 	 *
791 	 * Try to optimize the result object's page color when shadowing
792 	 * in order to maintain page coloring consistency in the combined
793 	 * shadowed object.
794 	 *
795 	 * The source object is moved to ba, retaining its existing ref-count.
796 	 * No additional ref is needed.
797 	 *
798 	 * SHADOWING IS NOT APPLICABLE TO OBJT_VNODE OBJECTS
799 	 */
800 	vm_map_backing_detach(entry, &entry->ba);
801 	*ba = entry->ba;		/* previous ba */
802 	entry->ba.object = result;	/* new ba (at head of entry) */
803 	entry->ba.backing_ba = ba;
804 	entry->ba.backing_count = ba->backing_count + 1;
805 	entry->ba.offset = 0;
806 
807 	/* cpu localization twist */
808 	result->pg_color = vm_quickcolor();
809 
810 	vm_map_backing_attach(entry, &entry->ba);
811 	vm_map_backing_attach(entry, ba);
812 
813 	/*
814 	 * Adjust the return storage.  Drop the ref on source before
815 	 * returning.
816 	 */
817 	vm_object_drop(result);
818 	vm_object_drop(source);
819 done:
820 	entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
821 }
822 
823 /*
824  * Allocate an object for a vm_map_entry.
825  *
826  * Object allocation for anonymous mappings is deferred as long as possible.
827  * This function is called when we can defer no longer, generally when a map
828  * entry might be split or forked or take a page fault.
829  *
830  * If the map segment is governed by a virtual page table then it is
831  * possible to address offsets beyond the mapped area.  Just allocate
832  * a maximally sized object for this case.
833  *
834  * The vm_map must be exclusively locked.
835  * No other requirements.
836  */
837 void
838 vm_map_entry_allocate_object(vm_map_entry_t entry)
839 {
840 	vm_object_t obj;
841 
842 	/*
843 	 * ba.offset is NOT cumulatively added in the backing_ba scan like
844 	 * it was in the old object chain, so we can assign whatever offset
845 	 * we like to the new object.
846 	 *
847 	 * For now assign a value of 0 to make debugging object sizes
848 	 * easier.
849 	 */
850 	entry->ba.offset = 0;
851 
852 	obj = vm_object_allocate(OBJT_DEFAULT,
853 				 atop(entry->ba.end - entry->ba.start) +
854 				 entry->ba.offset);
855 	entry->ba.object = obj;
856 	vm_map_backing_attach(entry, &entry->ba);
857 }
858 
859 /*
860  * Set an initial negative count so the first attempt to reserve
861  * space preloads a bunch of vm_map_entry's for this cpu.  Also
862  * pre-allocate 2 vm_map_entries which will be needed by zalloc() to
863  * map a new page for vm_map_entry structures.  SMP systems are
864  * particularly sensitive.
865  *
866  * This routine is called in early boot so we cannot just call
867  * vm_map_entry_reserve().
868  *
869  * Called from the low level boot code only (for each cpu)
870  *
871  * WARNING! Take care not to have too-big a static/BSS structure here
872  *	    as MAXCPU can be 256+, otherwise the loader's 64MB heap
873  *	    can get blown out by the kernel plus the initrd image.
874  */
875 void
876 vm_map_entry_reserve_cpu_init(globaldata_t gd)
877 {
878 	vm_map_entry_t entry;
879 	int count;
880 	int i;
881 
882 	atomic_add_int(&gd->gd_vme_avail, -MAP_RESERVE_COUNT * 2);
883 	if (gd->gd_cpuid == 0) {
884 		entry = &cpu_map_entry_init_bsp[0];
885 		count = MAPENTRYBSP_CACHE;
886 	} else {
887 		entry = &cpu_map_entry_init_ap[gd->gd_cpuid][0];
888 		count = MAPENTRYAP_CACHE;
889 	}
890 	for (i = 0; i < count; ++i, ++entry) {
891 		MAPENT_FREELIST(entry) = gd->gd_vme_base;
892 		gd->gd_vme_base = entry;
893 	}
894 }
895 
896 /*
897  * Reserves vm_map_entry structures so that code can later manipulate
898  * map_entry structures within a locked map without blocking while trying
899  * to allocate a new vm_map_entry.
900  *
901  * No requirements.
902  *
903  * WARNING!  We must not decrement gd_vme_avail until after we have
904  *	     ensured that sufficient entries exist, otherwise we can
905  *	     get into an endless call recursion in the zalloc code
906  *	     itself.
907  */
908 int
909 vm_map_entry_reserve(int count)
910 {
911 	struct globaldata *gd = mycpu;
912 	vm_map_entry_t entry;
913 
914 	/*
915 	 * Make sure we have enough structures in gd_vme_base to handle
916 	 * the reservation request.
917 	 *
918 	 * Use a critical section to protect against VM faults.  It might
919 	 * not be needed, but we have to be careful here.
920 	 */
921 	if (gd->gd_vme_avail < count) {
922 		crit_enter();
923 		while (gd->gd_vme_avail < count) {
924 			entry = zalloc(mapentzone);
925 			MAPENT_FREELIST(entry) = gd->gd_vme_base;
926 			gd->gd_vme_base = entry;
927 			atomic_add_int(&gd->gd_vme_avail, 1);
928 		}
929 		crit_exit();
930 	}
931 	atomic_add_int(&gd->gd_vme_avail, -count);
932 
933 	return(count);
934 }
935 
936 /*
937  * Releases previously reserved vm_map_entry structures that were not
938  * used.  If we have too much junk in our per-cpu cache, clean some of
939  * it out.
940  *
941  * No requirements.
942  */
943 void
944 vm_map_entry_release(int count)
945 {
946 	struct globaldata *gd = mycpu;
947 	vm_map_entry_t entry;
948 	vm_map_entry_t efree;
949 
950 	count = atomic_fetchadd_int(&gd->gd_vme_avail, count) + count;
951 	if (gd->gd_vme_avail > MAP_RESERVE_SLOP) {
952 		efree = NULL;
953 		crit_enter();
954 		while (gd->gd_vme_avail > MAP_RESERVE_HYST) {
955 			entry = gd->gd_vme_base;
956 			KKASSERT(entry != NULL);
957 			gd->gd_vme_base = MAPENT_FREELIST(entry);
958 			atomic_add_int(&gd->gd_vme_avail, -1);
959 			MAPENT_FREELIST(entry) = efree;
960 			efree = entry;
961 		}
962 		crit_exit();
963 		while ((entry = efree) != NULL) {
964 			efree = MAPENT_FREELIST(efree);
965 			zfree(mapentzone, entry);
966 		}
967 	}
968 }
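/*
 * Usage sketch (illustrative, following the pattern used by
 * vmspace_terminate() and vm_map_find() in this file):
 *
 *	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
 *	vm_map_lock(map);
 *	... clip, insert, or delete entries without blocking ...
 *	vm_map_unlock(map);
 *	vm_map_entry_release(count);
 */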
969 
970 /*
971  * Reserve map entry structures for use in kernel_map itself.  These
972  * entries have *ALREADY* been reserved on a per-cpu basis when the map
973  * was inited.  This function is used by zalloc() to avoid a recursion
974  * when zalloc() itself needs to allocate additional kernel memory.
975  *
976  * This function works like the normal reserve but does not load the
977  * vm_map_entry cache (because that would result in an infinite
978  * recursion).  Note that gd_vme_avail may go negative.  This is expected.
979  *
980  * Any caller of this function must be sure to renormalize after
981  * potentially eating entries to ensure that the reserve supply
982  * remains intact.
983  *
984  * No requirements.
985  */
986 int
987 vm_map_entry_kreserve(int count)
988 {
989 	struct globaldata *gd = mycpu;
990 
991 	atomic_add_int(&gd->gd_vme_avail, -count);
992 	KASSERT(gd->gd_vme_base != NULL,
993 		("no reserved entries left, gd_vme_avail = %d",
994 		gd->gd_vme_avail));
995 	return(count);
996 }
997 
998 /*
999  * Release previously reserved map entries for kernel_map.  We do not
1000  * attempt to clean up like the normal release function as this would
1001  * cause an unnecessary (but probably not fatal) deep procedure call.
1002  *
1003  * No requirements.
1004  */
1005 void
1006 vm_map_entry_krelease(int count)
1007 {
1008 	struct globaldata *gd = mycpu;
1009 
1010 	atomic_add_int(&gd->gd_vme_avail, count);
1011 }
1012 
1013 /*
1014  * Allocates a VM map entry for insertion.  No entry fields are filled in.
1015  *
1016  * The entries should have previously been reserved.  The reservation count
1017  * is tracked in (*countp).
1018  *
1019  * No requirements.
1020  */
1021 static vm_map_entry_t
1022 vm_map_entry_create(int *countp)
1023 {
1024 	struct globaldata *gd = mycpu;
1025 	vm_map_entry_t entry;
1026 
1027 	KKASSERT(*countp > 0);
1028 	--*countp;
1029 	crit_enter();
1030 	entry = gd->gd_vme_base;
1031 	KASSERT(entry != NULL, ("gd_vme_base NULL! count %d", *countp));
1032 	gd->gd_vme_base = MAPENT_FREELIST(entry);
1033 	crit_exit();
1034 
1035 	return(entry);
1036 }
1037 
1038 /*
1039  * Attach and detach backing store elements
1040  */
1041 static void
1042 vm_map_backing_attach(vm_map_entry_t entry, vm_map_backing_t ba)
1043 {
1044 	vm_object_t obj;
1045 
1046 	switch(entry->maptype) {
1047 	case VM_MAPTYPE_NORMAL:
1048 		obj = ba->object;
1049 		lockmgr(&obj->backing_lk, LK_EXCLUSIVE);
1050 		TAILQ_INSERT_TAIL(&obj->backing_list, ba, entry);
1051 		lockmgr(&obj->backing_lk, LK_RELEASE);
1052 		break;
1053 	case VM_MAPTYPE_UKSMAP:
1054 		ba->uksmap(ba, UKSMAPOP_ADD, entry->aux.dev, NULL);
1055 		break;
1056 	}
1057 }
1058 
1059 static void
1060 vm_map_backing_detach(vm_map_entry_t entry, vm_map_backing_t ba)
1061 {
1062 	vm_object_t obj;
1063 
1064 	switch(entry->maptype) {
1065 	case VM_MAPTYPE_NORMAL:
1066 		obj = ba->object;
1067 		lockmgr(&obj->backing_lk, LK_EXCLUSIVE);
1068 		TAILQ_REMOVE(&obj->backing_list, ba, entry);
1069 		lockmgr(&obj->backing_lk, LK_RELEASE);
1070 		break;
1071 	case VM_MAPTYPE_UKSMAP:
1072 		ba->uksmap(ba, UKSMAPOP_REM, entry->aux.dev, NULL);
1073 		break;
1074 	}
1075 }
1076 
1077 /*
1078  * Dispose of the dynamically allocated backing_ba chain associated
1079  * with a vm_map_entry.
1080  *
1081  * We decrement the (possibly shared) element and kfree() on the
1082  * 1->0 transition.  We only iterate to the next backing_ba when
1083  * the previous one went through a 1->0 transition.
1084  *
1085  * These can only be normal vm_object based backings.
1086  */
1087 static void
1088 vm_map_entry_dispose_ba(vm_map_entry_t entry, vm_map_backing_t ba)
1089 {
1090 	vm_map_backing_t next;
1091 
1092 	while (ba) {
1093 		if (ba->map_object) {
1094 			vm_map_backing_detach(entry, ba);
1095 			vm_object_deallocate(ba->object);
1096 		}
1097 		next = ba->backing_ba;
1098 		kfree(ba, M_MAP_BACKING);
1099 		ba = next;
1100 	}
1101 }
1102 
1103 /*
1104  * Dispose of a vm_map_entry that is no longer being referenced.
1105  *
1106  * No requirements.
1107  */
1108 static void
1109 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry, int *countp)
1110 {
1111 	struct globaldata *gd = mycpu;
1112 
1113 	/*
1114 	 * Dispose of the base object and the backing link.
1115 	 */
1116 	switch(entry->maptype) {
1117 	case VM_MAPTYPE_NORMAL:
1118 		if (entry->ba.map_object) {
1119 			vm_map_backing_detach(entry, &entry->ba);
1120 			vm_object_deallocate(entry->ba.object);
1121 		}
1122 		break;
1123 	case VM_MAPTYPE_SUBMAP:
1124 		break;
1125 	case VM_MAPTYPE_UKSMAP:
1126 		vm_map_backing_detach(entry, &entry->ba);
1127 		break;
1128 	default:
1129 		break;
1130 	}
1131 	vm_map_entry_dispose_ba(entry, entry->ba.backing_ba);
1132 
1133 	/*
1134 	 * Cleanup for safety.
1135 	 */
1136 	entry->ba.backing_ba = NULL;
1137 	entry->ba.object = NULL;
1138 	entry->ba.offset = 0;
1139 
1140 	++*countp;
1141 	crit_enter();
1142 	MAPENT_FREELIST(entry) = gd->gd_vme_base;
1143 	gd->gd_vme_base = entry;
1144 	crit_exit();
1145 }
1146 
1147 
1148 /*
1149  * Insert/remove entries from maps.
1150  *
1151  * The related map must be exclusively locked.
1152  * The caller must hold map->token
1153  * No other requirements.
1154  */
1155 static __inline void
1156 vm_map_entry_link(vm_map_t map, vm_map_entry_t entry)
1157 {
1158 	ASSERT_VM_MAP_LOCKED(map);
1159 
1160 	map->nentries++;
1161 	if (vm_map_rb_tree_RB_INSERT(&map->rb_root, entry))
1162 		panic("vm_map_entry_link: dup addr map %p ent %p", map, entry);
1163 }
1164 
1165 static __inline void
1166 vm_map_entry_unlink(vm_map_t map,
1167 		    vm_map_entry_t entry)
1168 {
1169 	ASSERT_VM_MAP_LOCKED(map);
1170 
1171 	if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1172 		panic("vm_map_entry_unlink: attempt to mess with "
1173 		      "locked entry! %p", entry);
1174 	}
1175 	vm_map_rb_tree_RB_REMOVE(&map->rb_root, entry);
1176 	map->nentries--;
1177 }
1178 
1179 /*
1180  * Finds the map entry containing (or immediately preceding) the specified
1181  * address in the given map.  The entry is returned in (*entry).
1182  *
1183  * The boolean result indicates whether the address is actually contained
1184  * in the map.
1185  *
1186  * The related map must be locked.
1187  * No other requirements.
1188  */
1189 boolean_t
1190 vm_map_lookup_entry(vm_map_t map, vm_offset_t address, vm_map_entry_t *entry)
1191 {
1192 	vm_map_entry_t tmp;
1193 	vm_map_entry_t last;
1194 
1195 	ASSERT_VM_MAP_LOCKED(map);
1196 
1197 	/*
1198 	 * Locate the record from the top of the tree.  'last' tracks the
1199 	 * closest prior record and is returned if no match is found, which
1200 	 * in binary tree terms means tracking the most recent right-branch
1201 	 * taken.  If there is no prior record, *entry is set to NULL.
1202 	 */
1203 	last = NULL;
1204 	tmp = RB_ROOT(&map->rb_root);
1205 
1206 	while (tmp) {
1207 		if (address >= tmp->ba.start) {
1208 			if (address < tmp->ba.end) {
1209 				*entry = tmp;
1210 				return(TRUE);
1211 			}
1212 			last = tmp;
1213 			tmp = RB_RIGHT(tmp, rb_entry);
1214 		} else {
1215 			tmp = RB_LEFT(tmp, rb_entry);
1216 		}
1217 	}
1218 	*entry = last;
1219 	return (FALSE);
1220 }
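/*
 * Usage sketch (illustrative, as in vm_map_insert() below): with the map
 * locked, the boolean result distinguishes a hit from a predecessor:
 *
 *	if (vm_map_lookup_entry(map, start, &entry)) {
 *		... start lies inside *entry ...
 *	} else {
 *		... entry is the closest prior entry, or NULL ...
 *	}
 */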
1221 
1222 /*
1223  * Inserts the given whole VM object into the target map at the specified
1224  * address range.  The object's size should match that of the address range.
1225  *
1226  * The map must be exclusively locked.
1227  * The object must be held.
1228  * The caller must have reserved sufficient vm_map_entry structures.
1229  *
1230  * If object is non-NULL, ref count must be bumped by caller prior to
1231  * making call to account for the new entry.  XXX API is a bit messy.
1232  */
1233 int
1234 vm_map_insert(vm_map_t map, int *countp,
1235 	      void *map_object, void *map_aux,
1236 	      vm_ooffset_t offset, void *aux_info,
1237 	      vm_offset_t start, vm_offset_t end,
1238 	      vm_maptype_t maptype, vm_subsys_t id,
1239 	      vm_prot_t prot, vm_prot_t max, int cow)
1240 {
1241 	vm_map_entry_t new_entry;
1242 	vm_map_entry_t prev_entry;
1243 	vm_map_entry_t next;
1244 	vm_map_entry_t temp_entry;
1245 	vm_eflags_t protoeflags;
1246 	vm_object_t object;
1247 	int must_drop = 0;
1248 
1249 	if (maptype == VM_MAPTYPE_UKSMAP)
1250 		object = NULL;
1251 	else
1252 		object = map_object;
1253 
1254 	ASSERT_VM_MAP_LOCKED(map);
1255 	if (object)
1256 		ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1257 
1258 	/*
1259 	 * Check that the start and end points are not bogus.
1260 	 */
1261 	if ((start < vm_map_min(map)) || (end > vm_map_max(map)) ||
1262 	    (start >= end)) {
1263 		return (KERN_INVALID_ADDRESS);
1264 	}
1265 
1266 	/*
1267 	 * Find the entry prior to the proposed starting address; if it's part
1268 	 * of an existing entry, this range is bogus.
1269 	 */
1270 	if (vm_map_lookup_entry(map, start, &temp_entry))
1271 		return (KERN_NO_SPACE);
1272 	prev_entry = temp_entry;
1273 
1274 	/*
1275 	 * Assert that the next entry doesn't overlap the end point.
1276 	 */
1277 	if (prev_entry)
1278 		next = vm_map_rb_tree_RB_NEXT(prev_entry);
1279 	else
1280 		next = RB_MIN(vm_map_rb_tree, &map->rb_root);
1281 	if (next && next->ba.start < end)
1282 		return (KERN_NO_SPACE);
1283 
1284 	protoeflags = 0;
1285 
1286 	if (cow & MAP_COPY_ON_WRITE)
1287 		protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY;
1288 
1289 	if (cow & MAP_NOFAULT) {
1290 		protoeflags |= MAP_ENTRY_NOFAULT;
1291 
1292 		KASSERT(object == NULL,
1293 			("vm_map_insert: paradoxical MAP_NOFAULT request"));
1294 	}
1295 	if (cow & MAP_DISABLE_SYNCER)
1296 		protoeflags |= MAP_ENTRY_NOSYNC;
1297 	if (cow & MAP_DISABLE_COREDUMP)
1298 		protoeflags |= MAP_ENTRY_NOCOREDUMP;
1299 	if (cow & MAP_IS_STACK)
1300 		protoeflags |= MAP_ENTRY_STACK;
1301 	if (cow & MAP_IS_KSTACK)
1302 		protoeflags |= MAP_ENTRY_KSTACK;
1303 
1304 	lwkt_gettoken(&map->token);
1305 
1306 	if (object) {
1307 		;
1308 	} else if (prev_entry &&
1309 		 (prev_entry->eflags == protoeflags) &&
1310 		 (prev_entry->ba.end == start) &&
1311 		 (prev_entry->wired_count == 0) &&
1312 		 (prev_entry->id == id) &&
1313 		 prev_entry->maptype == maptype &&
1314 		 maptype == VM_MAPTYPE_NORMAL &&
1315 		 prev_entry->ba.backing_ba == NULL &&	/* not backed */
1316 		 ((prev_entry->ba.object == NULL) ||
1317 		  vm_object_coalesce(prev_entry->ba.object,
1318 				     OFF_TO_IDX(prev_entry->ba.offset),
1319 				     (vm_size_t)(prev_entry->ba.end - prev_entry->ba.start),
1320 				     (vm_size_t)(end - prev_entry->ba.end)))) {
1321 		/*
1322 		 * We were able to extend the object.  Determine if we
1323 		 * can extend the previous map entry to include the
1324 		 * new range as well.
1325 		 */
1326 		if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) &&
1327 		    (prev_entry->protection == prot) &&
1328 		    (prev_entry->max_protection == max)) {
1329 			map->size += (end - prev_entry->ba.end);
1330 			vm_map_backing_adjust_end(prev_entry, end);
1331 			vm_map_simplify_entry(map, prev_entry, countp);
1332 			lwkt_reltoken(&map->token);
1333 			return (KERN_SUCCESS);
1334 		}
1335 
1336 		/*
1337 		 * If we can extend the object but cannot extend the
1338 		 * map entry, we have to create a new map entry.  We
1339 		 * must bump the ref count on the extended object to
1340 		 * account for it.  object may be NULL.
1341 		 */
1342 		object = prev_entry->ba.object;
1343 		offset = prev_entry->ba.offset +
1344 			(prev_entry->ba.end - prev_entry->ba.start);
1345 		if (object) {
1346 			vm_object_hold(object);
1347 			vm_object_lock_swap(); /* map->token order */
1348 			vm_object_reference_locked(object);
1349 			map_object = object;
1350 			must_drop = 1;
1351 		}
1352 	}
1353 
1354 	/*
1355 	 * NOTE: if conditionals fail, object can be NULL here.  This occurs
1356 	 * in things like the buffer map where we manage kva but do not manage
1357 	 * backing objects.
1358 	 */
1359 
1360 	/*
1361 	 * Create a new entry
1362 	 */
1363 	new_entry = vm_map_entry_create(countp);
1364 	new_entry->ba.pmap = map->pmap;
1365 	new_entry->ba.start = start;
1366 	new_entry->ba.end = end;
1367 	new_entry->id = id;
1368 
1369 	new_entry->maptype = maptype;
1370 	new_entry->eflags = protoeflags;
1371 	new_entry->aux.master_pde = 0;		/* in case size is different */
1372 	new_entry->aux.map_aux = map_aux;
1373 	new_entry->ba.map_object = map_object;
1374 	new_entry->ba.backing_ba = NULL;
1375 	new_entry->ba.backing_count = 0;
1376 	new_entry->ba.offset = offset;
1377 	new_entry->ba.aux_info = aux_info;
1378 	new_entry->ba.flags = 0;
1379 	new_entry->ba.pmap = map->pmap;
1380 
1381 	new_entry->inheritance = VM_INHERIT_DEFAULT;
1382 	new_entry->protection = prot;
1383 	new_entry->max_protection = max;
1384 	new_entry->wired_count = 0;
1385 
1386 	/*
1387 	 * Insert the new entry into the list
1388 	 */
1389 	vm_map_backing_replicated(map, new_entry, MAP_BACK_BASEOBJREFD);
1390 	vm_map_entry_link(map, new_entry);
1391 	map->size += new_entry->ba.end - new_entry->ba.start;
1392 
1393 	/*
1394 	 * Don't worry about updating freehint[] when inserting, allow
1395 	 * addresses to be lower than the actual first free spot.
1396 	 */
1397 #if 0
1398 	/*
1399 	 * Temporarily removed to avoid MAP_STACK panic, due to
1400 	 * MAP_STACK being a huge hack.  Will be added back in
1401 	 * when MAP_STACK (and the user stack mapping) is fixed.
1402 	 */
1403 	/*
1404 	 * It may be possible to simplify the entry
1405 	 */
1406 	vm_map_simplify_entry(map, new_entry, countp);
1407 #endif
1408 
1409 	/*
1410 	 * Try to pre-populate the page table.  Mappings governed by virtual
1411 	 * page tables cannot be prepopulated without a lot of work, so
1412 	 * don't try.
1413 	 */
1414 	if ((cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) &&
1415 	    maptype != VM_MAPTYPE_UKSMAP) {
1416 		int dorelock = 0;
1417 		if (vm_map_relock_enable && (cow & MAP_PREFAULT_RELOCK)) {
1418 			dorelock = 1;
1419 			vm_object_lock_swap();
1420 			vm_object_drop(object);
1421 		}
1422 		pmap_object_init_pt(map->pmap, new_entry,
1423 				    new_entry->ba.start,
1424 				    new_entry->ba.end - new_entry->ba.start,
1425 				    cow & MAP_PREFAULT_PARTIAL);
1426 		if (dorelock) {
1427 			vm_object_hold(object);
1428 			vm_object_lock_swap();
1429 		}
1430 	}
1431 	lwkt_reltoken(&map->token);
1432 	if (must_drop)
1433 		vm_object_drop(object);
1434 
1435 	return (KERN_SUCCESS);
1436 }
1437 
1438 /*
1439  * Find sufficient space for `length' bytes in the given map, starting at
1440  * `start'.  Returns 0 on success, 1 on no space.
1441  *
1442  * This function will return an arbitrarily aligned pointer.  If no
1443  * particular alignment is required you should pass align as 1.  Note that
1444  * the map may return PAGE_SIZE aligned pointers if all the lengths used in
1445  * the map are a multiple of PAGE_SIZE, even if you pass a smaller align
1446  * argument.
1447  *
1448  * 'align' should be a power of 2 but is not required to be.
1449  *
1450  * The map must be exclusively locked.
1451  * No other requirements.
1452  */
1453 int
1454 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length,
1455 		 vm_size_t align, int flags, vm_offset_t *addr)
1456 {
1457 	vm_map_entry_t entry;
1458 	vm_map_entry_t tmp;
1459 	vm_offset_t hole_start;
1460 	vm_offset_t end;
1461 	vm_offset_t align_mask;
1462 
1463 	if (start < vm_map_min(map))
1464 		start = vm_map_min(map);
1465 	if (start > vm_map_max(map))
1466 		return (1);
1467 
1468 	/*
1469 	 * If the alignment is not a power of 2 we will have to use
1470 	 * a mod/division, set align_mask to a special value.
1471 	 */
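	/*
	 * e.g. (illustrative arithmetic) align = 16: (16 | 15) + 1 == 32
	 * == (16 << 1), so the fast path below masks with align_mask = 15.
	 * align = 24: (24 | 23) + 1 == 32 != 48, so align_mask is set to
	 * (vm_offset_t)-1 and the roundup() path is used instead.
	 */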
1472 	if ((align | (align - 1)) + 1 != (align << 1))
1473 		align_mask = (vm_offset_t)-1;
1474 	else
1475 		align_mask = align - 1;
1476 
1477 	/*
1478 	 * Use freehint to adjust the start point, hopefully reducing
1479 	 * the iteration to O(1).
1480 	 */
1481 	hole_start = vm_map_freehint_find(map, length, align);
1482 	if (start < hole_start)
1483 		start = hole_start;
1484 	if (vm_map_lookup_entry(map, start, &tmp))
1485 		start = tmp->ba.end;
1486 	entry = tmp;	/* may be NULL */
1487 
1488 	/*
1489 	 * Look through the rest of the map, trying to fit a new region in the
1490 	 * gap between existing regions, or after the very last region.
1491 	 */
1492 	for (;;) {
1493 		/*
1494 		 * Adjust the proposed start by the requested alignment,
1495 		 * be sure that we didn't wrap the address.
1496 		 */
1497 		if (align_mask == (vm_offset_t)-1)
1498 			end = roundup(start, align);
1499 		else
1500 			end = (start + align_mask) & ~align_mask;
1501 		if (end < start)
1502 			return (1);
1503 		start = end;
1504 
1505 		/*
1506 		 * Find the end of the proposed new region.  Be sure we didn't
1507 		 * go beyond the end of the map, or wrap around the address.
1508 		 * Then check to see if this is the last entry or if the
1509 		 * proposed end fits in the gap between this and the next
1510 		 * entry.
1511 		 */
1512 		end = start + length;
1513 		if (end > vm_map_max(map) || end < start)
1514 			return (1);
1515 
1516 		/*
1517 		 * Locate the next entry; we can stop if this is the
1518 		 * last entry (we know we are in-bounds so that would
1519 		 * be a success).
1520 		 */
1521 		if (entry)
1522 			entry = vm_map_rb_tree_RB_NEXT(entry);
1523 		else
1524 			entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
1525 		if (entry == NULL)
1526 			break;
1527 
1528 		/*
1529 		 * Determine if the proposed area would overlap the
1530 		 * next entry.
1531 		 *
1532 		 * When matching against a STACK entry, only allow the
1533 		 * memory map to intrude on the ungrown portion of the
1534 		 * STACK entry when MAP_TRYFIXED is set.
1535 		 */
1536 		if (entry->ba.start >= end) {
1537 			if ((entry->eflags & MAP_ENTRY_STACK) == 0)
1538 				break;
1539 			if (flags & MAP_TRYFIXED)
1540 				break;
1541 			if (entry->ba.start - entry->aux.avail_ssize >= end)
1542 				break;
1543 		}
1544 		start = entry->ba.end;
1545 	}
1546 
1547 	/*
1548 	 * Update the freehint
1549 	 */
1550 	vm_map_freehint_update(map, start, length, align);
1551 
1552 	/*
1553 	 * Grow the kernel_map if necessary.  pmap_growkernel() will panic
1554 	 * if it fails.  The kernel_map is locked and nothing can steal
1555 	 * our address space if pmap_growkernel() blocks.
1556 	 *
1557 	 * NOTE: This may be unconditionally called for kldload areas on
1558 	 *	 x86_64 because these do not bump kernel_vm_end (which would
1559 	 *	 fill 128G worth of page tables!).  Therefore we must not
1560 	 *	 retry.
1561 	 */
1562 	if (map == kernel_map) {
1563 		vm_offset_t kstop;
1564 
1565 		kstop = round_page(start + length);
1566 		if (kstop > kernel_vm_end)
1567 			pmap_growkernel(start, kstop);
1568 	}
1569 	*addr = start;
1570 	return (0);
1571 }
1572 
1573 /*
1574  * vm_map_find finds an unallocated region in the target address map with
1575  * the given length and allocates it.  The search is defined to be first-fit
1576  * from the specified address; the region found is returned in the same
1577  * parameter.
1578  *
1579  * If object is non-NULL, ref count must be bumped by caller
1580  * prior to making call to account for the new entry.
1581  *
1582  * No requirements.  This function will lock the map temporarily.
1583  */
1584 int
1585 vm_map_find(vm_map_t map, void *map_object, void *map_aux,
1586 	    vm_ooffset_t offset, vm_offset_t *addr,
1587 	    vm_size_t length, vm_size_t align, boolean_t fitit,
1588 	    vm_maptype_t maptype, vm_subsys_t id,
1589 	    vm_prot_t prot, vm_prot_t max, int cow)
1590 {
1591 	vm_offset_t start;
1592 	vm_object_t object;
1593 	void *aux_info;
1594 	int result;
1595 	int count;
1596 
1597 	/*
1598 	 * Certain UKSMAPs may need aux_info.
1599 	 *
1600 	 * (map_object is the callback function, aux_info is the process
1601 	 *  or thread, if necessary).
1602 	 */
1603 	aux_info = NULL;
1604 	if (maptype == VM_MAPTYPE_UKSMAP) {
1605 		KKASSERT(map_aux != NULL && map_object != NULL);
1606 
1607 		switch(minor(((struct cdev *)map_aux))) {
1608 		case 5:
1609 			/*
1610 			 * /dev/upmap
1611 			 */
1612 			aux_info = curproc;
1613 			break;
1614 		case 6:
1615 			/*
1616 			 * /dev/kpmap
1617 			 */
1618 			break;
1619 		case 7:
1620 			/*
1621 			 * /dev/lpmap
1622 			 */
1623 			aux_info = curthread->td_lwp;
1624 			break;
1625 		}
1626 		object = NULL;
1627 	} else {
1628 		object = map_object;
1629 	}
1630 
1631 	start = *addr;
1632 
1633 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1634 	vm_map_lock(map);
1635 	if (object)
1636 		vm_object_hold_shared(object);
1637 	if (fitit) {
1638 		if (vm_map_findspace(map, start, length, align, 0, addr)) {
1639 			if (object)
1640 				vm_object_drop(object);
1641 			vm_map_unlock(map);
1642 			vm_map_entry_release(count);
1643 			return (KERN_NO_SPACE);
1644 		}
1645 		start = *addr;
1646 	}
1647 	result = vm_map_insert(map, &count,
1648 			       map_object, map_aux,
1649 			       offset, aux_info,
1650 			       start, start + length,
1651 			       maptype, id, prot, max, cow);
1652 	if (object)
1653 		vm_object_drop(object);
1654 	vm_map_unlock(map);
1655 	vm_map_entry_release(count);
1656 
1657 	return (result);
1658 }
1659 
1660 /*
1661  * Simplify the given map entry by merging with either neighbor.  This
1662  * routine also has the ability to merge with both neighbors.
1663  *
1664  * This routine guarantees that the passed entry remains valid (though
1665  * possibly extended).  When merging, this routine may delete one or
1666  * both neighbors.  No action is taken on entries which have their
1667  * in-transition flag set.
1668  *
1669  * The map must be exclusively locked.
1670  */
1671 void
1672 vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry, int *countp)
1673 {
1674 	vm_map_entry_t next, prev;
1675 	vm_size_t prevsize, esize;
1676 
1677 	if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1678 		++mycpu->gd_cnt.v_intrans_coll;
1679 		return;
1680 	}
1681 
1682 	if (entry->maptype == VM_MAPTYPE_SUBMAP)
1683 		return;
1684 	if (entry->maptype == VM_MAPTYPE_UKSMAP)
1685 		return;
1686 
1687 	prev = vm_map_rb_tree_RB_PREV(entry);
1688 	if (prev) {
1689 		prevsize = prev->ba.end - prev->ba.start;
1690 		if ( (prev->ba.end == entry->ba.start) &&
1691 		     (prev->maptype == entry->maptype) &&
1692 		     (prev->ba.object == entry->ba.object) &&
1693 		     (prev->ba.backing_ba == entry->ba.backing_ba) &&
1694 		     (!prev->ba.object ||
1695 			(prev->ba.offset + prevsize == entry->ba.offset)) &&
1696 		     (prev->eflags == entry->eflags) &&
1697 		     (prev->protection == entry->protection) &&
1698 		     (prev->max_protection == entry->max_protection) &&
1699 		     (prev->inheritance == entry->inheritance) &&
1700 		     (prev->id == entry->id) &&
1701 		     (prev->wired_count == entry->wired_count)) {
1702 			/*
1703 			 * NOTE: order important.  Unlink before gumming up
1704 			 *	 the RBTREE w/adjust, adjust before disposal
1705 			 *	 of prior entry, to avoid pmap snafus.
1706 			 */
1707 			vm_map_entry_unlink(map, prev);
1708 			vm_map_backing_adjust_start(entry, prev->ba.start);
1709 			if (entry->ba.object == NULL)
1710 				entry->ba.offset = 0;
1711 			vm_map_entry_dispose(map, prev, countp);
1712 		}
1713 	}
1714 
1715 	next = vm_map_rb_tree_RB_NEXT(entry);
1716 	if (next) {
1717 		esize = entry->ba.end - entry->ba.start;
1718 		if ((entry->ba.end == next->ba.start) &&
1719 		    (next->maptype == entry->maptype) &&
1720 		    (next->ba.object == entry->ba.object) &&
1721 		     (next->ba.backing_ba == entry->ba.backing_ba) &&
1722 		     (!entry->ba.object ||
1723 			(entry->ba.offset + esize == next->ba.offset)) &&
1724 		    (next->eflags == entry->eflags) &&
1725 		    (next->protection == entry->protection) &&
1726 		    (next->max_protection == entry->max_protection) &&
1727 		    (next->inheritance == entry->inheritance) &&
1728 		    (next->id == entry->id) &&
1729 		    (next->wired_count == entry->wired_count)) {
1730 			/*
1731 			 * NOTE: order important.  Unlink before gumming up
1732 			 *	 the RBTREE w/adjust, adjust before disposal
1733 			 *	 of prior entry, to avoid pmap snafus.
1734 			 */
1735 			vm_map_entry_unlink(map, next);
1736 			vm_map_backing_adjust_end(entry, next->ba.end);
1737 			vm_map_entry_dispose(map, next, countp);
1738 	        }
1739 	}
1740 }
1741 
1742 /*
1743  * Asserts that the given entry begins at or after the specified address.
1744  * If necessary, it splits the entry into two.
1745  */
1746 #define vm_map_clip_start(map, entry, startaddr, countp)		\
1747 {									\
1748 	if (startaddr > entry->ba.start)				\
1749 		_vm_map_clip_start(map, entry, startaddr, countp);	\
1750 }
1751 
1752 /*
1753  * This routine is called only when it is known that the entry must be split.
1754  *
1755  * The map must be exclusively locked.
1756  */
1757 static void
1758 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start,
1759 		   int *countp)
1760 {
1761 	vm_map_entry_t new_entry;
1762 
1763 	/*
1764 	 * Split off the front portion -- note that we must insert the new
1765 	 * entry BEFORE this one, so that this entry has the specified
1766 	 * starting address.
1767 	 */
1768 
1769 	vm_map_simplify_entry(map, entry, countp);
1770 
1771 	/*
1772 	 * If there is no object backing this entry, we might as well create
1773 	 * one now.  If we defer it, an object can get created after the map
1774 	 * is clipped, and individual objects will be created for the split-up
1775 	 * map.  This is a bit of a hack, but is also about the best place to
1776 	 * put this improvement.
1777 	 */
1778 	if (entry->ba.object == NULL && !map->system_map &&
1779 	    VM_MAP_ENTRY_WITHIN_PARTITION(entry)) {
1780 		vm_map_entry_allocate_object(entry);
1781 	}
1782 
1783 	/*
1784 	 * NOTE: The replicated function will adjust start, end, and offset
1785 	 *	 for the remainder of the backing_ba linkages.  We must fixup
1786 	 *	 the embedded ba.
1787 	 */
1788 	new_entry = vm_map_entry_create(countp);
1789 	*new_entry = *entry;
1790 	new_entry->ba.end = start;
1791 
1792 	/*
1793 	 * Ordering is important, make sure the new entry is replicated
1794 	 * before we cut the existing entry.
1795 	 */
1796 	vm_map_backing_replicated(map, new_entry, MAP_BACK_CLIPPED);
1797 	vm_map_backing_adjust_start(entry, start);
1798 	vm_map_entry_link(map, new_entry);
1799 }
1800 
1801 /*
1802  * Asserts that the given entry ends at or before the specified address.
1803  * If necessary, it splits the entry into two.
1804  *
1805  * The map must be exclusively locked.
1806  */
1807 #define vm_map_clip_end(map, entry, endaddr, countp)		\
1808 {								\
1809 	if (endaddr < entry->ba.end)				\
1810 		_vm_map_clip_end(map, entry, endaddr, countp);	\
1811 }
1812 
1813 /*
1814  * This routine is called only when it is known that the entry must be split.
1815  *
1816  * The map must be exclusively locked.
1817  */
1818 static void
1819 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end,
1820 		 int *countp)
1821 {
1822 	vm_map_entry_t new_entry;
1823 
1824 	/*
1825 	 * If there is no object backing this entry, we might as well create
1826 	 * one now.  If we defer it, an object can get created after the map
1827 	 * is clipped, and individual objects will be created for the split-up
1828 	 * map.  This is a bit of a hack, but is also about the best place to
1829 	 * put this improvement.
1830 	 */
1831 
1832 	if (entry->ba.object == NULL && !map->system_map &&
1833 	    VM_MAP_ENTRY_WITHIN_PARTITION(entry)) {
1834 		vm_map_entry_allocate_object(entry);
1835 	}
1836 
1837 	/*
1838 	 * Create a new entry and insert it AFTER the specified entry
1839 	 *
1840 	 * NOTE: The replicated function will adjust start, end, and offset
1841 	 *	 for the remainder of the backing_ba linkages.  We must fixup
1842 	 *	 the embedded ba.
1843 	 */
1844 	new_entry = vm_map_entry_create(countp);
1845 	*new_entry = *entry;
1846 	new_entry->ba.start = end;
1847 	new_entry->ba.offset += (new_entry->ba.start - entry->ba.start);
1848 
1849 	/*
1850 	 * Ordering is important, make sure the new entry is replicated
1851 	 * before we cut the existing entry.
1852 	 */
1853 	vm_map_backing_replicated(map, new_entry, MAP_BACK_CLIPPED);
1854 	vm_map_backing_adjust_end(entry, end);
1855 	vm_map_entry_link(map, new_entry);
1856 }
1857 
1858 /*
1859  * Asserts that the starting and ending region addresses fall within the
1860  * valid range for the map.
1861  */
1862 #define	VM_MAP_RANGE_CHECK(map, start, end)	\
1863 {						\
1864 	if (start < vm_map_min(map))		\
1865 		start = vm_map_min(map);	\
1866 	if (end > vm_map_max(map))		\
1867 		end = vm_map_max(map);		\
1868 	if (start > end)			\
1869 		start = end;			\
1870 }
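
/*
 * Illustrative example (editorial addition, not part of the original
 * source): for a map whose valid range is [0x1000, 0x9000),
 * VM_MAP_RANGE_CHECK() clamps a request as follows.  The addresses are
 * hypothetical.
 *
 *	vm_offset_t start = 0x0800;
 *	vm_offset_t end   = 0xa000;
 *
 *	VM_MAP_RANGE_CHECK(map, start, end);
 *	// start == 0x1000, end == 0x9000
 *
 * A request entirely outside the map degenerates to start == end, which
 * the callers below treat as an empty range.
 */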
1871 
1872 /*
1873  * Used to block when an in-transition collision occurs.  The map
1874  * is unlocked for the sleep and relocked before the return.
1875  */
1876 void
1877 vm_map_transition_wait(vm_map_t map, int relock)
1878 {
1879 	tsleep_interlock(map, 0);
1880 	vm_map_unlock(map);
1881 	tsleep(map, PINTERLOCKED, "vment", 0);
1882 	if (relock)
1883 		vm_map_lock(map);
1884 }
1885 
1886 /*
1887  * When we do blocking operations with the map lock held it is
1888  * possible that a clip might have occurred on our in-transit entry,
1889  * requiring an adjustment to the entry in our loop.  These macros
1890  * help the pageable and clip_range code deal with the case.  The
1891  * conditional costs virtually nothing if no clipping has occurred.
1892  */
1893 
1894 #define CLIP_CHECK_BACK(entry, save_start)			\
1895     do {							\
1896 	    while (entry->ba.start != save_start) {		\
1897 		    entry = vm_map_rb_tree_RB_PREV(entry);	\
1898 		    KASSERT(entry, ("bad entry clip")); 	\
1899 	    }							\
1900     } while(0)
1901 
1902 #define CLIP_CHECK_FWD(entry, save_end)				\
1903     do {							\
1904 	    while (entry->ba.end != save_end) {			\
1905 		    entry = vm_map_rb_tree_RB_NEXT(entry);	\
1906 		    KASSERT(entry, ("bad entry clip")); 	\
1907 	    }							\
1908     } while(0)
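
/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): the typical save/block/re-check pattern used by the callers
 * below.  some_blocking_call() stands in for any operation that may
 * temporarily unlock the map, e.g. vm_fault_wire() or
 * vm_map_transition_wait().
 *
 *	vm_offset_t save_start = entry->ba.start;
 *	vm_offset_t save_end = entry->ba.end;
 *
 *	rv = some_blocking_call(map, entry);
 *
 *	// If the entry was clipped while the map was unlocked, walk
 *	// back to the fragment that still begins at our saved start,
 *	CLIP_CHECK_BACK(entry, save_start);
 *	// or forward to the fragment that still ends at our saved end.
 *	CLIP_CHECK_FWD(entry, save_end);
 *
 * A caller normally uses one direction or the other depending on which
 * boundary it needs to continue from.
 */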
1909 
1910 
1911 /*
1912  * Clip the specified range and return the base entry.  The
1913  * range may cover several entries starting at the returned base
1914  * and the first and last entry in the covering sequence will be
1915  * properly clipped to the requested start and end address.
1916  *
1917  * If no holes are allowed you should pass the MAP_CLIP_NO_HOLES
1918  * flag.
1919  *
1920  * The MAP_ENTRY_IN_TRANSITION flag will be set for the entries
1921  * covered by the requested range.
1922  *
1923  * The map must be exclusively locked on entry and will remain locked
1924  * on return. If no range exists or the range contains holes and you
1925  * specified that no holes were allowed, NULL will be returned.  This
1926  * routine may temporarily unlock the map in order to avoid a deadlock when
1927  * sleeping.
1928  */
1929 static
1930 vm_map_entry_t
1931 vm_map_clip_range(vm_map_t map, vm_offset_t start, vm_offset_t end,
1932 		  int *countp, int flags)
1933 {
1934 	vm_map_entry_t start_entry;
1935 	vm_map_entry_t entry;
1936 	vm_map_entry_t next;
1937 
1938 	/*
1939 	 * Locate the entry and effect initial clipping.  The in-transition
1940 	 * case does not occur very often so do not try to optimize it.
1941 	 */
1942 again:
1943 	if (vm_map_lookup_entry(map, start, &start_entry) == FALSE)
1944 		return (NULL);
1945 	entry = start_entry;
1946 	if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1947 		entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1948 		++mycpu->gd_cnt.v_intrans_coll;
1949 		++mycpu->gd_cnt.v_intrans_wait;
1950 		vm_map_transition_wait(map, 1);
1951 		/*
1952 		 * entry and/or start_entry may have been clipped while
1953 		 * we slept, or may have gone away entirely.  We have
1954 		 * to restart from the lookup.
1955 		 */
1956 		goto again;
1957 	}
1958 
1959 	/*
1960 	 * Since we hold an exclusive map lock we do not have to restart
1961 	 * after clipping, even though clipping may block in zalloc.
1962 	 */
1963 	vm_map_clip_start(map, entry, start, countp);
1964 	vm_map_clip_end(map, entry, end, countp);
1965 	entry->eflags |= MAP_ENTRY_IN_TRANSITION;
1966 
1967 	/*
1968 	 * Scan entries covered by the range.  When working on the next
1969 	 * entry a restart need only re-loop on the current entry which
1970 	 * we have already locked, since 'next' may have changed.  Also,
1971 	 * even though entry is safe, it may have been clipped so we
1972 	 * have to iterate forwards through the clip after sleeping.
1973 	 */
1974 	for (;;) {
1975 		next = vm_map_rb_tree_RB_NEXT(entry);
1976 		if (next == NULL || next->ba.start >= end)
1977 			break;
1978 		if (flags & MAP_CLIP_NO_HOLES) {
1979 			if (next->ba.start > entry->ba.end) {
1980 				vm_map_unclip_range(map, start_entry,
1981 					start, entry->ba.end, countp, flags);
1982 				return(NULL);
1983 			}
1984 		}
1985 
1986 		if (next->eflags & MAP_ENTRY_IN_TRANSITION) {
1987 			vm_offset_t save_end = entry->ba.end;
1988 			next->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1989 			++mycpu->gd_cnt.v_intrans_coll;
1990 			++mycpu->gd_cnt.v_intrans_wait;
1991 			vm_map_transition_wait(map, 1);
1992 
1993 			/*
1994 			 * clips might have occurred while we blocked.
1995 			 */
1996 			CLIP_CHECK_FWD(entry, save_end);
1997 			CLIP_CHECK_BACK(start_entry, start);
1998 			continue;
1999 		}
2000 
2001 		/*
2002 		 * No restart necessary even though clip_end may block, we
2003 		 * are holding the map lock.
2004 		 */
2005 		vm_map_clip_end(map, next, end, countp);
2006 		next->eflags |= MAP_ENTRY_IN_TRANSITION;
2007 		entry = next;
2008 	}
2009 	if (flags & MAP_CLIP_NO_HOLES) {
2010 		if (entry->ba.end != end) {
2011 			vm_map_unclip_range(map, start_entry,
2012 				start, entry->ba.end, countp, flags);
2013 			return(NULL);
2014 		}
2015 	}
2016 	return(start_entry);
2017 }
2018 
2019 /*
2020  * Undo the effect of vm_map_clip_range().  You should pass the same
2021  * flags and the same range that you passed to vm_map_clip_range().
2022  * This code will clear the in-transition flag on the entries and
2023  * wake up anyone waiting.  This code will also simplify the sequence
2024  * and attempt to merge it with entries before and after the sequence.
2025  *
2026  * The map must be locked on entry and will remain locked on return.
2027  *
2028  * Note that you should also pass the start_entry returned by
2029  * vm_map_clip_range().  However, if you block between the two calls
2030  * with the map unlocked please be aware that the start_entry may
2031  * have been clipped and you may need to scan it backwards to find
2032  * the entry corresponding with the original start address.  You are
2033  * responsible for this, vm_map_unclip_range() expects the correct
2034  * start_entry to be passed to it and will KASSERT otherwise.
2035  */
2036 static
2037 void
2038 vm_map_unclip_range(vm_map_t map, vm_map_entry_t start_entry,
2039 		    vm_offset_t start, vm_offset_t end,
2040 		    int *countp, int flags)
2041 {
2042 	vm_map_entry_t entry;
2043 
2044 	entry = start_entry;
2045 
2046 	KASSERT(entry->ba.start == start, ("unclip_range: illegal base entry"));
2047 	while (entry && entry->ba.start < end) {
2048 		KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
2049 			("in-transition flag not set during unclip on: %p",
2050 			entry));
2051 		KASSERT(entry->ba.end <= end,
2052 			("unclip_range: tail wasn't clipped"));
2053 		entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
2054 		if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
2055 			entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
2056 			wakeup(map);
2057 		}
2058 		entry = vm_map_rb_tree_RB_NEXT(entry);
2059 	}
2060 
2061 	/*
2062 	 * Simplification does not block so there is no restart case.
2063 	 */
2064 	entry = start_entry;
2065 	while (entry && entry->ba.start < end) {
2066 		vm_map_simplify_entry(map, entry, countp);
2067 		entry = vm_map_rb_tree_RB_NEXT(entry);
2068 	}
2069 }
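
/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): the clip/unclip bracket used by the wiring functions below.
 * The per-entry work is elided.
 *
 *	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
 *	vm_map_lock(map);
 *	start_entry = vm_map_clip_range(map, start, end, &count,
 *					MAP_CLIP_NO_HOLES);
 *	if (start_entry) {
 *		for (entry = start_entry;
 *		     entry && entry->ba.start < end;
 *		     entry = vm_map_rb_tree_RB_NEXT(entry)) {
 *			// operate on the in-transition entry
 *		}
 *		vm_map_unclip_range(map, start_entry, start, end,
 *				    &count, MAP_CLIP_NO_HOLES);
 *	}
 *	vm_map_unlock(map);
 *	vm_map_entry_release(count);
 */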
2070 
2071 /*
2072  * Mark the given range as handled by a subordinate map.
2073  *
2074  * This range must have been created with vm_map_find(), and no other
2075  * operations may have been performed on this range prior to calling
2076  * vm_map_submap().
2077  *
2078  * Submappings cannot be removed.
2079  *
2080  * No requirements.
2081  */
2082 int
2083 vm_map_submap(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap)
2084 {
2085 	vm_map_entry_t entry;
2086 	int result = KERN_INVALID_ARGUMENT;
2087 	int count;
2088 
2089 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2090 	vm_map_lock(map);
2091 
2092 	VM_MAP_RANGE_CHECK(map, start, end);
2093 
2094 	if (vm_map_lookup_entry(map, start, &entry)) {
2095 		vm_map_clip_start(map, entry, start, &count);
2096 	} else if (entry) {
2097 		entry = vm_map_rb_tree_RB_NEXT(entry);
2098 	} else {
2099 		entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2100 	}
2101 
2102 	vm_map_clip_end(map, entry, end, &count);
2103 
2104 	if ((entry->ba.start == start) && (entry->ba.end == end) &&
2105 	    ((entry->eflags & MAP_ENTRY_COW) == 0) &&
2106 	    (entry->ba.object == NULL)) {
2107 		entry->ba.sub_map = submap;
2108 		entry->maptype = VM_MAPTYPE_SUBMAP;
2109 		result = KERN_SUCCESS;
2110 	}
2111 	vm_map_unlock(map);
2112 	vm_map_entry_release(count);
2113 
2114 	return (result);
2115 }
2116 
2117 /*
2118  * Sets the protection of the specified address region in the target map.
2119  * If "set_max" is specified, the maximum protection is to be set;
2120  * otherwise, only the current protection is affected.
2121  *
2122  * The protection is not applicable to submaps, but is applicable to normal
2123  * maps and maps governed by virtual page tables.  For example, when operating
2124  * on a virtual page table our protection basically controls how COW occurs
2125  * on the backing object, whereas the virtual page table itself is an
2126  * abstraction exposed to userland.
2127  *
2128  * No requirements.
2129  */
2130 int
2131 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
2132 	       vm_prot_t new_prot, boolean_t set_max)
2133 {
2134 	vm_map_entry_t current;
2135 	vm_map_entry_t entry;
2136 	int count;
2137 
2138 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2139 	vm_map_lock(map);
2140 
2141 	VM_MAP_RANGE_CHECK(map, start, end);
2142 
2143 	if (vm_map_lookup_entry(map, start, &entry)) {
2144 		vm_map_clip_start(map, entry, start, &count);
2145 	} else if (entry) {
2146 		entry = vm_map_rb_tree_RB_NEXT(entry);
2147 	} else {
2148 		entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2149 	}
2150 
2151 	/*
2152 	 * Make a first pass to check for protection violations.
2153 	 */
2154 	current = entry;
2155 	while (current && current->ba.start < end) {
2156 		if (current->maptype == VM_MAPTYPE_SUBMAP) {
2157 			vm_map_unlock(map);
2158 			vm_map_entry_release(count);
2159 			return (KERN_INVALID_ARGUMENT);
2160 		}
2161 		if ((new_prot & current->max_protection) != new_prot) {
2162 			vm_map_unlock(map);
2163 			vm_map_entry_release(count);
2164 			return (KERN_PROTECTION_FAILURE);
2165 		}
2166 
2167 		/*
2168 		 * When making a SHARED+RW file mmap writable, update
2169 		 * v_lastwrite_ts.
2170 		 */
2171 		if (new_prot & PROT_WRITE &&
2172 		    (current->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
2173 		    current->maptype == VM_MAPTYPE_NORMAL &&
2174 		    current->ba.object &&
2175 		    current->ba.object->type == OBJT_VNODE) {
2176 			struct vnode *vp;
2177 
2178 			vp = current->ba.object->handle;
2179 			if (vp && vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT) == 0) {
2180 				vfs_timestamp(&vp->v_lastwrite_ts);
2181 				vsetflags(vp, VLASTWRITETS);
2182 				vn_unlock(vp);
2183 			}
2184 		}
2185 		current = vm_map_rb_tree_RB_NEXT(current);
2186 	}
2187 
2188 	/*
2189 	 * Go back and fix up protections. [Note that clipping is not
2190 	 * necessary the second time.]
2191 	 */
2192 	current = entry;
2193 
2194 	while (current && current->ba.start < end) {
2195 		vm_prot_t old_prot;
2196 
2197 		vm_map_clip_end(map, current, end, &count);
2198 
2199 		old_prot = current->protection;
2200 		if (set_max) {
2201 			current->max_protection = new_prot;
2202 			current->protection = new_prot & old_prot;
2203 		} else {
2204 			current->protection = new_prot;
2205 		}
2206 
2207 		/*
2208 		 * Update physical map if necessary. Worry about copy-on-write
2209 		 * here -- CHECK THIS XXX
2210 		 */
2211 		if (current->protection != old_prot) {
2212 #define MASK(entry)	(((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
2213 							VM_PROT_ALL)
2214 
2215 			pmap_protect(map->pmap, current->ba.start,
2216 			    current->ba.end,
2217 			    current->protection & MASK(current));
2218 #undef	MASK
2219 		}
2220 
2221 		vm_map_simplify_entry(map, current, &count);
2222 
2223 		current = vm_map_rb_tree_RB_NEXT(current);
2224 	}
2225 	vm_map_unlock(map);
2226 	vm_map_entry_release(count);
2227 	return (KERN_SUCCESS);
2228 }
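
/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): roughly how an mprotect(2)-style caller is expected to drive
 * vm_map_protect().  The error mapping shown here is simplified; the
 * real syscall path lives elsewhere (vm_mmap.c).
 *
 *	vm_offset_t addr = trunc_page(uaddr);
 *	vm_offset_t eaddr = round_page(uaddr + len);
 *
 *	switch (vm_map_protect(&p->p_vmspace->vm_map, addr, eaddr,
 *			       prot, FALSE)) {
 *	case KERN_SUCCESS:
 *		return (0);
 *	case KERN_PROTECTION_FAILURE:
 *		return (EACCES);
 *	default:
 *		return (EINVAL);
 *	}
 */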
2229 
2230 /*
2231  * This routine traverses a process's map handling the madvise
2232  * system call.  Advisories are classified as either those affecting
2233  * the vm_map_entry structure, or those affecting the underlying
2234  * objects.
2235  *
2236  * The <value> argument is used for extended madvise calls.
2237  *
2238  * No requirements.
2239  */
2240 int
2241 vm_map_madvise(vm_map_t map, vm_offset_t start, vm_offset_t end,
2242 	       int behav, off_t value)
2243 {
2244 	vm_map_entry_t current, entry;
2245 	int modify_map = 0;
2246 	int error = 0;
2247 	int count;
2248 
2249 	/*
2250 	 * Some madvise calls directly modify the vm_map_entry, in which case
2251 	 * we need to use an exclusive lock on the map and we need to perform
2252 	 * various clipping operations.  Otherwise we only need a read-lock
2253 	 * on the map.
2254 	 */
2255 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2256 
2257 	switch(behav) {
2258 	case MADV_NORMAL:
2259 	case MADV_SEQUENTIAL:
2260 	case MADV_RANDOM:
2261 	case MADV_NOSYNC:
2262 	case MADV_AUTOSYNC:
2263 	case MADV_NOCORE:
2264 	case MADV_CORE:
2265 	case MADV_SETMAP:
2266 		modify_map = 1;
2267 		vm_map_lock(map);
2268 		break;
2269 	case MADV_INVAL:
2270 	case MADV_WILLNEED:
2271 	case MADV_DONTNEED:
2272 	case MADV_FREE:
2273 		vm_map_lock_read(map);
2274 		break;
2275 	default:
2276 		vm_map_entry_release(count);
2277 		return (EINVAL);
2278 	}
2279 
2280 	/*
2281 	 * Locate starting entry and clip if necessary.
2282 	 */
2283 
2284 	VM_MAP_RANGE_CHECK(map, start, end);
2285 
2286 	if (vm_map_lookup_entry(map, start, &entry)) {
2287 		if (modify_map)
2288 			vm_map_clip_start(map, entry, start, &count);
2289 	} else if (entry) {
2290 		entry = vm_map_rb_tree_RB_NEXT(entry);
2291 	} else {
2292 		entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2293 	}
2294 
2295 	if (modify_map) {
2296 		/*
2297 		 * madvise behaviors that are implemented in the vm_map_entry.
2298 		 *
2299 		 * We clip the vm_map_entry so that behavioral changes are
2300 		 * limited to the specified address range.
2301 		 */
2302 		for (current = entry;
2303 		     current && current->ba.start < end;
2304 		     current = vm_map_rb_tree_RB_NEXT(current)) {
2305 			/*
2306 			 * Ignore submaps
2307 			 */
2308 			if (current->maptype == VM_MAPTYPE_SUBMAP)
2309 				continue;
2310 
2311 			vm_map_clip_end(map, current, end, &count);
2312 
2313 			switch (behav) {
2314 			case MADV_NORMAL:
2315 				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
2316 				break;
2317 			case MADV_SEQUENTIAL:
2318 				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
2319 				break;
2320 			case MADV_RANDOM:
2321 				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
2322 				break;
2323 			case MADV_NOSYNC:
2324 				current->eflags |= MAP_ENTRY_NOSYNC;
2325 				break;
2326 			case MADV_AUTOSYNC:
2327 				current->eflags &= ~MAP_ENTRY_NOSYNC;
2328 				break;
2329 			case MADV_NOCORE:
2330 				current->eflags |= MAP_ENTRY_NOCOREDUMP;
2331 				break;
2332 			case MADV_CORE:
2333 				current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
2334 				break;
2335 			case MADV_SETMAP:
2336 				/*
2337 				 * Set the page directory page for a map
2338 				 * governed by a virtual page table.
2339 				 *
2340 				 * Software virtual page table support has
2341 				 * been removed, this MADV is no longer
2342 				 * supported.
2343 				 */
2344 				error = EINVAL;
2345 				break;
2346 			case MADV_INVAL:
2347 				/*
2348 				 * Invalidate the related pmap entries, used
2349 				 * to flush portions of the real kernel's
2350 				 * pmap when the caller has removed or
2351 				 * modified existing mappings in a virtual
2352 				 * page table.
2353 				 *
2354 				 * (exclusive locked map version does not
2355 				 * need the range interlock).
2356 				 */
2357 				pmap_remove(map->pmap,
2358 					    current->ba.start, current->ba.end);
2359 				break;
2360 			default:
2361 				error = EINVAL;
2362 				break;
2363 			}
2364 			vm_map_simplify_entry(map, current, &count);
2365 		}
2366 		vm_map_unlock(map);
2367 	} else {
2368 		vm_pindex_t pindex;
2369 		vm_pindex_t delta;
2370 
2371 		/*
2372 		 * madvise behaviors that are implemented in the underlying
2373 		 * vm_object.
2374 		 *
2375 		 * Since we don't clip the vm_map_entry, we have to clip
2376 		 * the vm_object pindex and count.
2377 		 *
2378 		 * NOTE!  These functions are only supported on normal maps.
2379 		 *
2380 		 * NOTE!  These functions only apply to the top-most object.
2381 		 *	  It is not applicable to backing objects.
2382 		 */
2383 		for (current = entry;
2384 		     current && current->ba.start < end;
2385 		     current = vm_map_rb_tree_RB_NEXT(current)) {
2386 			vm_offset_t useStart;
2387 
2388 			if (current->maptype != VM_MAPTYPE_NORMAL)
2389 				continue;
2390 
2391 			pindex = OFF_TO_IDX(current->ba.offset);
2392 			delta = atop(current->ba.end - current->ba.start);
2393 			useStart = current->ba.start;
2394 
2395 			if (current->ba.start < start) {
2396 				pindex += atop(start - current->ba.start);
2397 				delta -= atop(start - current->ba.start);
2398 				useStart = start;
2399 			}
2400 			if (current->ba.end > end)
2401 				delta -= atop(current->ba.end - end);
2402 
2403 			if ((vm_spindex_t)delta <= 0)
2404 				continue;
2405 
2406 			if (behav == MADV_INVAL) {
2407 				/*
2408 				 * Invalidate the related pmap entries, used
2409 				 * to flush portions of the real kernel's
2410 				 * pmap when the caller has removed or
2411 				 * modified existing mappings in a virtual
2412 				 * page table.
2413 				 *
2414 				 * (shared locked map version needs the
2415 				 * interlock, see vm_fault()).
2416 				 */
2417 				struct vm_map_ilock ilock;
2418 
2419 				KASSERT(useStart >= VM_MIN_USER_ADDRESS &&
2420 					    useStart + ptoa(delta) <=
2421 					    VM_MAX_USER_ADDRESS,
2422 					 ("Bad range %016jx-%016jx (%016jx)",
2423 					 useStart, useStart + ptoa(delta),
2424 					 delta));
2425 				vm_map_interlock(map, &ilock,
2426 						 useStart,
2427 						 useStart + ptoa(delta));
2428 				pmap_remove(map->pmap,
2429 					    useStart,
2430 					    useStart + ptoa(delta));
2431 				vm_map_deinterlock(map, &ilock);
2432 			} else {
2433 				vm_object_madvise(current->ba.object,
2434 						  pindex, delta, behav);
2435 			}
2436 
2437 			/*
2438 			 * Try to pre-populate the page table.
2439 			 */
2440 			if (behav == MADV_WILLNEED) {
2441 				pmap_object_init_pt(
2442 				    map->pmap, current,
2443 				    useStart,
2444 				    (delta << PAGE_SHIFT),
2445 				    MAP_PREFAULT_MADVISE
2446 				);
2447 			}
2448 		}
2449 		vm_map_unlock_read(map);
2450 	}
2451 	vm_map_entry_release(count);
2452 	return(error);
2453 }
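
/*
 * Worked example (editorial addition, not part of the original source)
 * for the object-directed madvise clipping above, assuming 4KB pages:
 * an entry covering [0x10000, 0x20000) with ba.offset = 0x4000, advised
 * over [0x14000, 0x1c000), computes
 *
 *	pindex = OFF_TO_IDX(0x4000)		=  4
 *	delta  = atop(0x20000 - 0x10000)	= 16
 *	pindex += atop(0x14000 - 0x10000)	=  8	(start clip)
 *	delta  -= atop(0x14000 - 0x10000)	= 12	(start clip)
 *	delta  -= atop(0x20000 - 0x1c000)	=  8	(end clip)
 *
 * so the advice is applied to pages 8..15 of the top-level object and
 * useStart becomes 0x14000.
 */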
2454 
2455 
2456 /*
2457  * Sets the inheritance of the specified address range in the target map.
2458  * Inheritance affects how the map will be shared with child maps at the
2459  * time of vm_map_fork.
2460  */
2461 int
2462 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
2463 	       vm_inherit_t new_inheritance)
2464 {
2465 	vm_map_entry_t entry;
2466 	vm_map_entry_t temp_entry;
2467 	int count;
2468 
2469 	switch (new_inheritance) {
2470 	case VM_INHERIT_NONE:
2471 	case VM_INHERIT_COPY:
2472 	case VM_INHERIT_SHARE:
2473 		break;
2474 	default:
2475 		return (KERN_INVALID_ARGUMENT);
2476 	}
2477 
2478 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2479 	vm_map_lock(map);
2480 
2481 	VM_MAP_RANGE_CHECK(map, start, end);
2482 
2483 	if (vm_map_lookup_entry(map, start, &temp_entry)) {
2484 		entry = temp_entry;
2485 		vm_map_clip_start(map, entry, start, &count);
2486 	} else if (temp_entry) {
2487 		entry = vm_map_rb_tree_RB_NEXT(temp_entry);
2488 	} else {
2489 		entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2490 	}
2491 
2492 	while (entry && entry->ba.start < end) {
2493 		vm_map_clip_end(map, entry, end, &count);
2494 
2495 		entry->inheritance = new_inheritance;
2496 
2497 		vm_map_simplify_entry(map, entry, &count);
2498 
2499 		entry = vm_map_rb_tree_RB_NEXT(entry);
2500 	}
2501 	vm_map_unlock(map);
2502 	vm_map_entry_release(count);
2503 	return (KERN_SUCCESS);
2504 }
2505 
2506 /*
2507  * Wiring/Unwiring of memory for user-related operation.
2508  *
2509  * Implement the semantics of mlock
2510  */
2511 int
2512 vm_map_user_wiring(vm_map_t map, vm_offset_t start, vm_offset_t real_end,
2513 		   boolean_t new_pageable)
2514 {
2515 	vm_map_entry_t entry;
2516 	vm_map_entry_t start_entry;
2517 	vm_offset_t end;
2518 	int rv = KERN_SUCCESS;
2519 	int count;
2520 
2521 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2522 	vm_map_lock(map);
2523 	VM_MAP_RANGE_CHECK(map, start, real_end);
2524 	end = real_end;
2525 
2526 	start_entry = vm_map_clip_range(map, start, end, &count,
2527 					MAP_CLIP_NO_HOLES);
2528 	if (start_entry == NULL) {
2529 		vm_map_unlock(map);
2530 		vm_map_entry_release(count);
2531 		return (KERN_INVALID_ADDRESS);
2532 	}
2533 
2534 	if (new_pageable == 0) {
2535 		entry = start_entry;
2536 		while (entry && entry->ba.start < end) {
2537 			vm_offset_t save_start;
2538 			vm_offset_t save_end;
2539 
2540 			/*
2541 			 * Already user wired or hard wired (trivial cases)
2542 			 */
2543 			if (entry->eflags & MAP_ENTRY_USER_WIRED) {
2544 				entry = vm_map_rb_tree_RB_NEXT(entry);
2545 				continue;
2546 			}
2547 			if (entry->wired_count != 0) {
2548 				entry->wired_count++;
2549 				entry->eflags |= MAP_ENTRY_USER_WIRED;
2550 				entry = vm_map_rb_tree_RB_NEXT(entry);
2551 				continue;
2552 			}
2553 
2554 			/*
2555 			 * A new wiring requires instantiation of appropriate
2556 			 * management structures and the faulting in of the
2557 			 * page.
2558 			 */
2559 			if (entry->maptype == VM_MAPTYPE_NORMAL) {
2560 				int copyflag = entry->eflags &
2561 					       MAP_ENTRY_NEEDS_COPY;
2562 				if (copyflag && ((entry->protection &
2563 						  VM_PROT_WRITE) != 0)) {
2564 					vm_map_entry_shadow(entry);
2565 				} else if (entry->ba.object == NULL &&
2566 					   !map->system_map) {
2567 					vm_map_entry_allocate_object(entry);
2568 				}
2569 			}
2570 			entry->wired_count++;
2571 			entry->eflags |= MAP_ENTRY_USER_WIRED;
2572 
2573 			/*
2574 			 * Now fault in the area.  Note that vm_fault_wire()
2575 			 * may release the map lock temporarily, it will be
2576 			 * relocked on return.  The in-transition
2577 			 * flag protects the entries.
2578 			 */
2579 			save_start = entry->ba.start;
2580 			save_end = entry->ba.end;
2581 			rv = vm_fault_wire(map, entry, TRUE, 0);
2582 			if (rv) {
2583 				CLIP_CHECK_BACK(entry, save_start);
2584 				for (;;) {
2585 					KASSERT(entry->wired_count == 1, ("bad wired_count on entry"));
2586 					entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2587 					entry->wired_count = 0;
2588 					if (entry->ba.end == save_end)
2589 						break;
2590 					entry = vm_map_rb_tree_RB_NEXT(entry);
2591 					KASSERT(entry,
2592 					     ("bad entry clip during backout"));
2593 				}
2594 				end = save_start;	/* unwire the rest */
2595 				break;
2596 			}
2597 			/*
2598 			 * note that even though the entry might have been
2599 			 * clipped, the USER_WIRED flag we set prevents
2600 			 * duplication so we do not have to do a
2601 			 * clip check.
2602 			 */
2603 			entry = vm_map_rb_tree_RB_NEXT(entry);
2604 		}
2605 
2606 		/*
2607 		 * If we failed fall through to the unwiring section to
2608 		 * unwire what we had wired so far.  'end' has already
2609 		 * been adjusted.
2610 		 */
2611 		if (rv)
2612 			new_pageable = 1;
2613 
2614 		/*
2615 		 * start_entry might have been clipped if we unlocked the
2616 		 * map and blocked.  No matter how clipped it has gotten
2617 		 * there should be a fragment that is on our start boundary.
2618 		 */
2619 		CLIP_CHECK_BACK(start_entry, start);
2620 	}
2621 
2622 	/*
2623 	 * Deal with the unwiring case.
2624 	 */
2625 	if (new_pageable) {
2626 		/*
2627 		 * This is the unwiring case.  We must first ensure that the
2628 		 * range to be unwired is really wired down.  We know there
2629 		 * are no holes.
2630 		 */
2631 		entry = start_entry;
2632 		while (entry && entry->ba.start < end) {
2633 			if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
2634 				rv = KERN_INVALID_ARGUMENT;
2635 				goto done;
2636 			}
2637 			KASSERT(entry->wired_count != 0,
2638 				("wired count was 0 with USER_WIRED set! %p",
2639 				 entry));
2640 			entry = vm_map_rb_tree_RB_NEXT(entry);
2641 		}
2642 
2643 		/*
2644 		 * Now decrement the wiring count for each region. If a region
2645 		 * becomes completely unwired, unwire its physical pages and
2646 		 * mappings.
2647 		 */
2648 		/*
2649 		 * The map entries are processed in a second loop which
2650 		 * decrements the wiring count on each entry.
2651 		 *
2652 		 * NOTE: 'entry' must be reset to start_entry here.  An
2653 		 *	 earlier version of this code reused the 'entry'
2654 		 *	 left over from the validation loop above, so this
2655 		 *	 loop was never entered, the pages backing the
2656 		 *	 entries were never unwired, and wired pages leaked.
2657 		 */
2658 		entry = start_entry;
2659 		while (entry && entry->ba.start < end) {
2660 			KASSERT(entry->eflags & MAP_ENTRY_USER_WIRED,
2661 				("expected USER_WIRED on entry %p", entry));
2662 			entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2663 			entry->wired_count--;
2664 			if (entry->wired_count == 0)
2665 				vm_fault_unwire(map, entry);
2666 			entry = vm_map_rb_tree_RB_NEXT(entry);
2667 		}
2668 	}
2669 done:
2670 	vm_map_unclip_range(map, start_entry, start, real_end, &count,
2671 			    MAP_CLIP_NO_HOLES);
2672 	vm_map_unlock(map);
2673 	vm_map_entry_release(count);
2674 
2675 	return (rv);
2676 }
2677 
2678 /*
2679  * Wiring/Unwiring of memory for kernel-related operation.
2680  *
2681  * Sets the pageability of the specified address range in the target map.
2682  * Regions specified as not pageable require locked-down physical
2683  * memory and physical page maps.
2684  *
2685  * The map must not be locked, but a reference must remain to the map
2686  * throughout the call.
2687  *
2688  * This function may be called via the zalloc path and must properly
2689  * reserve map entries for kernel_map.
2690  *
2691  * No requirements.
2692  */
2693 int
2694 vm_map_kernel_wiring(vm_map_t map, vm_offset_t start,
2695 		     vm_offset_t real_end, int kmflags)
2696 {
2697 	vm_map_entry_t entry;
2698 	vm_map_entry_t start_entry;
2699 	vm_offset_t end;
2700 	int rv = KERN_SUCCESS;
2701 	int count;
2702 
2703 	if (kmflags & KM_KRESERVE)
2704 		count = vm_map_entry_kreserve(MAP_RESERVE_COUNT);
2705 	else
2706 		count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2707 	vm_map_lock(map);
2708 	VM_MAP_RANGE_CHECK(map, start, real_end);
2709 	end = real_end;
2710 
2711 	start_entry = vm_map_clip_range(map, start, end, &count,
2712 					MAP_CLIP_NO_HOLES);
2713 	if (start_entry == NULL) {
2714 		vm_map_unlock(map);
2715 		rv = KERN_INVALID_ADDRESS;
2716 		goto failure;
2717 	}
2718 	if ((kmflags & KM_PAGEABLE) == 0) {
2719 		/*
2720 		 * Wiring.
2721 		 *
2722 		 * 1.  Holding the write lock, we create any shadow or zero-fill
2723 		 * objects that need to be created. Then we clip each map
2724 		 * entry to the region to be wired and increment its wiring
2725 		 * count.  We create objects before clipping the map entries
2726 		 * to avoid object proliferation.
2727 		 *
2728 		 * 2.  We downgrade to a read lock, and call vm_fault_wire to
2729 		 * fault in the pages for any newly wired area (wired_count is
2730 		 * 1).
2731 		 *
2732 		 * Downgrading to a read lock for vm_fault_wire avoids a
2733 		 * possible deadlock with another process that may have faulted
2734 		 * on one of the pages to be wired (it would mark the page busy,
2735 		 * blocking us, then in turn block on the map lock that we
2736 		 * hold).  Because of problems in the recursive lock package,
2737 		 * we cannot upgrade to a write lock in vm_map_lookup.  Thus,
2738 		 * any actions that require the write lock must be done
2739 		 * beforehand.  Because we keep the read lock on the map, the
2740 		 * copy-on-write status of the entries we modify here cannot
2741 		 * change.
2742 		 */
2743 		entry = start_entry;
2744 		while (entry && entry->ba.start < end) {
2745 			/*
2746 			 * Trivial case if the entry is already wired
2747 			 */
2748 			if (entry->wired_count) {
2749 				entry->wired_count++;
2750 				entry = vm_map_rb_tree_RB_NEXT(entry);
2751 				continue;
2752 			}
2753 
2754 			/*
2755 			 * The entry is being newly wired, we have to setup
2756 			 * appropriate management structures.  A shadow
2757 			 * object is required for a copy-on-write region,
2758 			 * or a normal object for a zero-fill region.  We
2759 			 * do not have to do this for entries that point to sub
2760 			 * maps because we won't hold the lock on the sub map.
2761 			 */
2762 			if (entry->maptype == VM_MAPTYPE_NORMAL) {
2763 				int copyflag = entry->eflags &
2764 					       MAP_ENTRY_NEEDS_COPY;
2765 				if (copyflag && ((entry->protection &
2766 						  VM_PROT_WRITE) != 0)) {
2767 					vm_map_entry_shadow(entry);
2768 				} else if (entry->ba.object == NULL &&
2769 					   !map->system_map) {
2770 					vm_map_entry_allocate_object(entry);
2771 				}
2772 			}
2773 			entry->wired_count++;
2774 			entry = vm_map_rb_tree_RB_NEXT(entry);
2775 		}
2776 
2777 		/*
2778 		 * Pass 2.
2779 		 */
2780 
2781 		/*
2782 		 * HACK HACK HACK HACK
2783 		 *
2784 		 * vm_fault_wire() temporarily unlocks the map to avoid
2785 		 * deadlocks.  The in-transition flag set by the
2786 		 * vm_map_clip_range() call should protect us from changes
2787 		 * while the map is unlocked.
2788 		 *
2789 		 * NOTE: Previously this comment stated that clipping might
2790 		 *	 still occur while the entry is unlocked, but from
2791 		 *	 what I can tell it actually cannot.
2792 		 *
2793 		 *	 It is unclear whether the CLIP_CHECK_*() calls
2794 		 *	 are still needed but we keep them in anyway.
2795 		 *
2796 		 * HACK HACK HACK HACK
2797 		 */
2798 
2799 		entry = start_entry;
2800 		while (entry && entry->ba.start < end) {
2801 			/*
2802 			 * If vm_fault_wire fails for any page we need to undo
2803 			 * what has been done.  We decrement the wiring count
2804 			 * for those pages which have not yet been wired (now)
2805 			 * and unwire those that have (later).
2806 			 */
2807 			vm_offset_t save_start = entry->ba.start;
2808 			vm_offset_t save_end = entry->ba.end;
2809 
2810 			if (entry->wired_count == 1)
2811 				rv = vm_fault_wire(map, entry, FALSE, kmflags);
2812 			if (rv) {
2813 				CLIP_CHECK_BACK(entry, save_start);
2814 				for (;;) {
2815 					KASSERT(entry->wired_count == 1,
2816 					  ("wired_count changed unexpectedly"));
2817 					entry->wired_count = 0;
2818 					if (entry->ba.end == save_end)
2819 						break;
2820 					entry = vm_map_rb_tree_RB_NEXT(entry);
2821 					KASSERT(entry,
2822 					  ("bad entry clip during backout"));
2823 				}
2824 				end = save_start;
2825 				break;
2826 			}
2827 			CLIP_CHECK_FWD(entry, save_end);
2828 			entry = vm_map_rb_tree_RB_NEXT(entry);
2829 		}
2830 
2831 		/*
2832 		 * If a failure occurred, undo everything by falling through
2833 		 * to the unwiring code.  'end' has already been adjusted
2834 		 * appropriately.
2835 		 */
2836 		if (rv)
2837 			kmflags |= KM_PAGEABLE;
2838 
2839 		/*
2840 		 * start_entry is still IN_TRANSITION but may have been
2841 		 * clipped since vm_fault_wire() unlocks and relocks the
2842 		 * map.  No matter how clipped it has gotten there should
2843 		 * be a fragment that is on our start boundary.
2844 		 */
2845 		CLIP_CHECK_BACK(start_entry, start);
2846 	}
2847 
2848 	if (kmflags & KM_PAGEABLE) {
2849 		/*
2850 		 * This is the unwiring case.  We must first ensure that the
2851 		 * range to be unwired is really wired down.  We know there
2852 		 * are no holes.
2853 		 */
2854 		entry = start_entry;
2855 		while (entry && entry->ba.start < end) {
2856 			if (entry->wired_count == 0) {
2857 				rv = KERN_INVALID_ARGUMENT;
2858 				goto done;
2859 			}
2860 			entry = vm_map_rb_tree_RB_NEXT(entry);
2861 		}
2862 
2863 		/*
2864 		 * Now decrement the wiring count for each region. If a region
2865 		 * becomes completely unwired, unwire its physical pages and
2866 		 * mappings.
2867 		 */
2868 		entry = start_entry;
2869 		while (entry && entry->ba.start < end) {
2870 			entry->wired_count--;
2871 			if (entry->wired_count == 0)
2872 				vm_fault_unwire(map, entry);
2873 			entry = vm_map_rb_tree_RB_NEXT(entry);
2874 		}
2875 	}
2876 done:
2877 	vm_map_unclip_range(map, start_entry, start, real_end,
2878 			    &count, MAP_CLIP_NO_HOLES);
2879 	vm_map_unlock(map);
2880 failure:
2881 	if (kmflags & KM_KRESERVE)
2882 		vm_map_entry_krelease(count);
2883 	else
2884 		vm_map_entry_release(count);
2885 	return (rv);
2886 }
2887 
2888 /*
2889  * Mark a newly allocated address range as wired but do not fault in
2890  * the pages.  The caller is expected to load the pages into the object.
2891  *
2892  * The map must be locked on entry and will remain locked on return.
2893  * No other requirements.
2894  */
2895 void
2896 vm_map_set_wired_quick(vm_map_t map, vm_offset_t addr, vm_size_t size,
2897 		       int *countp)
2898 {
2899 	vm_map_entry_t scan;
2900 	vm_map_entry_t entry;
2901 
2902 	entry = vm_map_clip_range(map, addr, addr + size,
2903 				  countp, MAP_CLIP_NO_HOLES);
2904 	scan = entry;
2905 	while (scan && scan->ba.start < addr + size) {
2906 		KKASSERT(scan->wired_count == 0);
2907 		scan->wired_count = 1;
2908 		scan = vm_map_rb_tree_RB_NEXT(scan);
2909 	}
2910 	vm_map_unclip_range(map, entry, addr, addr + size,
2911 			    countp, MAP_CLIP_NO_HOLES);
2912 }
2913 
2914 /*
2915  * Push any dirty cached pages in the address range to their pager.
2916  * If syncio is TRUE, dirty pages are written synchronously.
2917  * If invalidate is TRUE, any cached pages are freed as well.
2918  *
2919  * This routine is called by sys_msync()
2920  *
2921  * Returns an error if any part of the specified range is not mapped.
2922  *
2923  * No requirements.
2924  */
2925 int
2926 vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end,
2927 	     boolean_t syncio, boolean_t invalidate)
2928 {
2929 	vm_map_entry_t current;
2930 	vm_map_entry_t next;
2931 	vm_map_entry_t entry;
2932 	vm_map_backing_t ba;
2933 	vm_size_t size;
2934 	vm_object_t object;
2935 	vm_ooffset_t offset;
2936 
2937 	vm_map_lock_read(map);
2938 	VM_MAP_RANGE_CHECK(map, start, end);
2939 	if (!vm_map_lookup_entry(map, start, &entry)) {
2940 		vm_map_unlock_read(map);
2941 		return (KERN_INVALID_ADDRESS);
2942 	}
2943 	lwkt_gettoken(&map->token);
2944 
2945 	/*
2946 	 * Make a first pass to check for holes.
2947 	 */
2948 	current = entry;
2949 	while (current && current->ba.start < end) {
2950 		if (current->maptype == VM_MAPTYPE_SUBMAP) {
2951 			lwkt_reltoken(&map->token);
2952 			vm_map_unlock_read(map);
2953 			return (KERN_INVALID_ARGUMENT);
2954 		}
2955 		next = vm_map_rb_tree_RB_NEXT(current);
2956 		if (end > current->ba.end &&
2957 		    (next == NULL ||
2958 		     current->ba.end != next->ba.start)) {
2959 			lwkt_reltoken(&map->token);
2960 			vm_map_unlock_read(map);
2961 			return (KERN_INVALID_ADDRESS);
2962 		}
2963 		current = next;
2964 	}
2965 
2966 	if (invalidate)
2967 		pmap_remove(vm_map_pmap(map), start, end);
2968 
2969 	/*
2970 	 * Make a second pass, cleaning/uncaching pages from the indicated
2971 	 * objects as we go.
2972 	 */
2973 	current = entry;
2974 	while (current && current->ba.start < end) {
2975 		offset = current->ba.offset + (start - current->ba.start);
2976 		size = (end <= current->ba.end ? end : current->ba.end) - start;
2977 
2978 		switch(current->maptype) {
2979 		case VM_MAPTYPE_SUBMAP:
2980 		{
2981 			vm_map_t smap;
2982 			vm_map_entry_t tentry;
2983 			vm_size_t tsize;
2984 
2985 			smap = current->ba.sub_map;
2986 			vm_map_lock_read(smap);
2987 			vm_map_lookup_entry(smap, offset, &tentry);
2988 			if (tentry == NULL) {
2989 				tsize = vm_map_max(smap) - offset;
2990 				ba = NULL;
2991 				offset = 0 + (offset - vm_map_min(smap));
2992 			} else {
2993 				tsize = tentry->ba.end - offset;
2994 				ba = &tentry->ba;
2995 				offset = tentry->ba.offset +
2996 					 (offset - tentry->ba.start);
2997 			}
2998 			vm_map_unlock_read(smap);
2999 			if (tsize < size)
3000 				size = tsize;
3001 			break;
3002 		}
3003 		case VM_MAPTYPE_NORMAL:
3004 			ba = &current->ba;
3005 			break;
3006 		default:
3007 			ba = NULL;
3008 			break;
3009 		}
3010 		if (ba) {
3011 			object = ba->object;
3012 			if (object)
3013 				vm_object_hold(object);
3014 		} else {
3015 			object = NULL;
3016 		}
3017 
3018 		/*
3019 		 * Note that there is absolutely no sense in writing out
3020 		 * anonymous objects, so we track down the vnode object
3021 		 * to write out.
3022 		 * We invalidate (remove) all pages from the address space
3023 		 * anyway, for semantic correctness.
3024 		 *
3025 		 * note: certain anonymous maps, such as MAP_NOSYNC maps,
3026 		 * may start out with a NULL object.
3027 		 *
3028 		 * XXX do we really want to stop at the first backing store
3029 		 * here if there are more? XXX
3030 		 */
3031 		if (ba) {
3032 			vm_object_t tobj;
3033 
3034 			tobj = object;
3035 			while (ba->backing_ba != NULL) {
3036 				offset -= ba->offset;
3037 				ba = ba->backing_ba;
3038 				offset += ba->offset;
3039 				tobj = ba->object;
3040 				if (tobj->size < OFF_TO_IDX(offset + size))
3041 					size = IDX_TO_OFF(tobj->size) - offset;
3042 				break; /* XXX this break is not correct */
3043 			}
3044 			if (object != tobj) {
3045 				if (object)
3046 					vm_object_drop(object);
3047 				object = tobj;
3048 				vm_object_hold(object);
3049 			}
3050 		}
3051 
3052 		if (object && (object->type == OBJT_VNODE) &&
3053 		    (current->protection & VM_PROT_WRITE) &&
3054 		    (object->flags & OBJ_NOMSYNC) == 0) {
3055 			/*
3056 			 * Flush pages if writing is allowed, invalidate them
3057 			 * if invalidation requested.  Pages undergoing I/O
3058 			 * will be ignored by vm_object_page_remove().
3059 			 *
3060 			 * We cannot lock the vnode and then wait for paging
3061 			 * to complete without deadlocking against vm_fault.
3062 			 * Instead we simply call vm_object_page_remove() and
3063 			 * allow it to block internally on a page-by-page
3064 			 * basis when it encounters pages undergoing async
3065 			 * I/O.
3066 			 */
3067 			int flags;
3068 
3069 			/* no chain wait needed for vnode objects */
3070 			vm_object_reference_locked(object);
3071 			vn_lock(object->handle, LK_EXCLUSIVE | LK_RETRY);
3072 			flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
3073 			flags |= invalidate ? OBJPC_INVAL : 0;
3074 
3075 			if (current->maptype == VM_MAPTYPE_NORMAL) {
3076 				vm_object_page_clean(object,
3077 				    OFF_TO_IDX(offset),
3078 				    OFF_TO_IDX(offset + size + PAGE_MASK),
3079 				    flags);
3080 			}
3081 			vn_unlock(((struct vnode *)object->handle));
3082 			vm_object_deallocate_locked(object);
3083 		}
3084 		if (object && invalidate &&
3085 		   ((object->type == OBJT_VNODE) ||
3086 		    (object->type == OBJT_DEVICE) ||
3087 		    (object->type == OBJT_MGTDEVICE))) {
3088 			int clean_only =
3089 				((object->type == OBJT_DEVICE) ||
3090 				(object->type == OBJT_MGTDEVICE)) ? FALSE : TRUE;
3091 			/* no chain wait needed for vnode/device objects */
3092 			vm_object_reference_locked(object);
3093 			if (current->maptype == VM_MAPTYPE_NORMAL) {
3094 				vm_object_page_remove(object,
3095 				    OFF_TO_IDX(offset),
3096 				    OFF_TO_IDX(offset + size + PAGE_MASK),
3097 				    clean_only);
3098 			}
3099 			vm_object_deallocate_locked(object);
3100 		}
3101 		start += size;
3102 		if (object)
3103 			vm_object_drop(object);
3104 		current = vm_map_rb_tree_RB_NEXT(current);
3105 	}
3106 
3107 	lwkt_reltoken(&map->token);
3108 	vm_map_unlock_read(map);
3109 
3110 	return (KERN_SUCCESS);
3111 }
3112 
3113 /*
3114  * Make the region specified by this entry pageable.
3115  *
3116  * The vm_map must be exclusively locked.
3117  */
3118 static void
3119 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
3120 {
3121 	entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3122 	entry->wired_count = 0;
3123 	vm_fault_unwire(map, entry);
3124 }
3125 
3126 /*
3127  * Deallocate the given entry from the target map.
3128  *
3129  * The vm_map must be exclusively locked.
3130  */
3131 static void
3132 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry, int *countp)
3133 {
3134 	vm_map_entry_unlink(map, entry);
3135 	map->size -= entry->ba.end - entry->ba.start;
3136 	vm_map_entry_dispose(map, entry, countp);
3137 }
3138 
3139 /*
3140  * Deallocates the given address range from the target map.
3141  *
3142  * The vm_map must be exclusively locked.
3143  */
3144 int
3145 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end, int *countp)
3146 {
3147 	vm_object_t object;
3148 	vm_map_entry_t entry;
3149 	vm_map_entry_t first_entry;
3150 	vm_offset_t hole_start;
3151 
3152 	ASSERT_VM_MAP_LOCKED(map);
3153 	lwkt_gettoken(&map->token);
3154 again:
3155 	/*
3156 	 * Find the start of the region, and clip it.  Set entry to point
3157 	 * at the first record containing the requested address or, if no
3158 	 * such record exists, the next record with a greater address.  The
3159 	 * loop will run from this point until a record beyond the termination
3160 	 * address is encountered.
3161 	 *
3162 	 * Adjust freehint[] for either the clip case or the extension case.
3163 	 *
3164 	 * GGG see other GGG comment.
3165 	 */
3166 	if (vm_map_lookup_entry(map, start, &first_entry)) {
3167 		entry = first_entry;
3168 		vm_map_clip_start(map, entry, start, countp);
3169 		hole_start = start;
3170 	} else {
3171 		if (first_entry) {
3172 			entry = vm_map_rb_tree_RB_NEXT(first_entry);
3173 			if (entry == NULL)
3174 				hole_start = first_entry->ba.start;
3175 			else
3176 				hole_start = first_entry->ba.end;
3177 		} else {
3178 			entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
3179 			if (entry == NULL)
3180 				hole_start = vm_map_min(map);
3181 			else
3182 				hole_start = vm_map_max(map);
3183 		}
3184 	}
3185 
3186 	/*
3187 	 * Step through all entries in this region
3188 	 */
3189 	while (entry && entry->ba.start < end) {
3190 		vm_map_entry_t next;
3191 		vm_offset_t s, e;
3192 		vm_pindex_t offidxstart, offidxend, count;
3193 
3194 		/*
3195 		 * If we hit an in-transition entry we have to sleep and
3196 		 * retry.  It's easier (and not really slower) to just retry
3197 		 * since this case occurs so rarely and the hint is already
3198 		 * pointing at the right place.  We have to reset the
3199 		 * start offset so as not to accidentally delete an entry
3200 		 * another process just created in vacated space.
3201 		 */
3202 		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
3203 			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
3204 			start = entry->ba.start;
3205 			++mycpu->gd_cnt.v_intrans_coll;
3206 			++mycpu->gd_cnt.v_intrans_wait;
3207 			vm_map_transition_wait(map, 1);
3208 			goto again;
3209 		}
3210 		vm_map_clip_end(map, entry, end, countp);
3211 
3212 		s = entry->ba.start;
3213 		e = entry->ba.end;
3214 		next = vm_map_rb_tree_RB_NEXT(entry);
3215 
3216 		offidxstart = OFF_TO_IDX(entry->ba.offset);
3217 		count = OFF_TO_IDX(e - s);
3218 
3219 		switch(entry->maptype) {
3220 		case VM_MAPTYPE_NORMAL:
3221 		case VM_MAPTYPE_SUBMAP:
3222 			object = entry->ba.object;
3223 			break;
3224 		default:
3225 			object = NULL;
3226 			break;
3227 		}
3228 
3229 		/*
3230 		 * Unwire before removing addresses from the pmap; otherwise,
3231 		 * unwiring will put the entries back in the pmap.
3232 		 *
3233 		 * Generally speaking, doing a bulk pmap_remove() before
3234 		 * removing the pages from the VM object is better at
3235 		 * reducing unnecessary IPIs.  The pmap code is now optimized
3236 		 * to not blindly iterate the range when pt and pd pages
3237 		 * are missing.
3238 		 */
3239 		if (entry->wired_count != 0)
3240 			vm_map_entry_unwire(map, entry);
3241 
3242 		offidxend = offidxstart + count;
3243 
3244 		if (object == kernel_object) {
3245 			pmap_remove(map->pmap, s, e);
3246 			vm_object_hold(object);
3247 			vm_object_page_remove(object, offidxstart,
3248 					      offidxend, FALSE);
3249 			vm_object_drop(object);
3250 		} else if (object && object->type != OBJT_DEFAULT &&
3251 			   object->type != OBJT_SWAP) {
3252 			/*
3253 			 * vnode object routines cannot be chain-locked,
3254 			 * but since we aren't removing pages from the
3255 			 * object here we can use a shared hold.
3256 			 */
3257 			vm_object_hold_shared(object);
3258 			pmap_remove(map->pmap, s, e);
3259 			vm_object_drop(object);
3260 		} else if (object) {
3261 			vm_object_hold(object);
3262 			pmap_remove(map->pmap, s, e);
3263 
3264 			if (object != NULL &&
3265 			    object->ref_count != 1 &&
3266 			    (object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) ==
3267 			     OBJ_ONEMAPPING &&
3268 			    (object->type == OBJT_DEFAULT ||
3269 			     object->type == OBJT_SWAP)) {
3270 				/*
3271 				 * When ONEMAPPING is set we can destroy the
3272 				 * pages underlying the entry's range.
3273 				 */
3274 				vm_object_page_remove(object, offidxstart,
3275 						      offidxend, FALSE);
3276 				if (object->type == OBJT_SWAP) {
3277 					swap_pager_freespace(object,
3278 							     offidxstart,
3279 							     count);
3280 				}
3281 				if (offidxend >= object->size &&
3282 				    offidxstart < object->size) {
3283 					object->size = offidxstart;
3284 				}
3285 			}
3286 			vm_object_drop(object);
3287 		} else if (entry->maptype == VM_MAPTYPE_UKSMAP) {
3288 			pmap_remove(map->pmap, s, e);
3289 		}
3290 
3291 		/*
3292 		 * Delete the entry (which may delete the object) only after
3293 		 * removing all pmap entries pointing to its pages.
3294 		 * (Otherwise, its page frames may be reallocated, and any
3295 		 * modify bits will be set in the wrong object!)
3296 		 */
3297 		vm_map_entry_delete(map, entry, countp);
3298 		entry = next;
3299 	}
3300 
3301 	/*
3302 	 * We either reached the end and use vm_map_max as the end
3303 	 * address, or we didn't and we use the next entry as the
3304 	 * end address.
3305 	 */
3306 	if (entry == NULL) {
3307 		vm_map_freehint_hole(map, hole_start,
3308 				     vm_map_max(map) - hole_start);
3309 	} else {
3310 		vm_map_freehint_hole(map, hole_start,
3311 				     entry->ba.start - hole_start);
3312 	}
3313 
3314 	lwkt_reltoken(&map->token);
3315 
3316 	return (KERN_SUCCESS);
3317 }
3318 
3319 /*
3320  * Remove the given address range from the target map.
3321  * This is the exported form of vm_map_delete.
3322  *
3323  * No requirements.
3324  */
3325 int
3326 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
3327 {
3328 	int result;
3329 	int count;
3330 
3331 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3332 	vm_map_lock(map);
3333 	VM_MAP_RANGE_CHECK(map, start, end);
3334 	result = vm_map_delete(map, start, end, &count);
3335 	vm_map_unlock(map);
3336 	vm_map_entry_release(count);
3337 
3338 	return (result);
3339 }
3340 
3341 /*
3342  * Assert that the target map allows the specified privilege on the
3343  * entire address region given.  The entire region must be allocated.
3344  *
3345  * The caller must specify whether the vm_map is already locked or not.
3346  */
3347 boolean_t
3348 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
3349 			vm_prot_t protection, boolean_t have_lock)
3350 {
3351 	vm_map_entry_t entry;
3352 	vm_map_entry_t tmp_entry;
3353 	boolean_t result;
3354 
3355 	if (have_lock == FALSE)
3356 		vm_map_lock_read(map);
3357 
3358 	if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
3359 		if (have_lock == FALSE)
3360 			vm_map_unlock_read(map);
3361 		return (FALSE);
3362 	}
3363 	entry = tmp_entry;
3364 
3365 	result = TRUE;
3366 	while (start < end) {
3367 		if (entry == NULL) {
3368 			result = FALSE;
3369 			break;
3370 		}
3371 
3372 		/*
3373 		 * No holes allowed!
3374 		 */
3375 
3376 		if (start < entry->ba.start) {
3377 			result = FALSE;
3378 			break;
3379 		}
3380 		/*
3381 		 * Check protection associated with entry.
3382 		 */
3383 
3384 		if ((entry->protection & protection) != protection) {
3385 			result = FALSE;
3386 			break;
3387 		}
3388 		/* go to next entry */
3389 		start = entry->ba.end;
3390 		entry = vm_map_rb_tree_RB_NEXT(entry);
3391 	}
3392 	if (have_lock == FALSE)
3393 		vm_map_unlock_read(map);
3394 	return (result);
3395 }
3396 
3397 /*
3398  * vm_map_backing structures are not shared across forks and must be
3399  * replicated.
3400  *
3401  * Generally speaking we must reallocate the backing_ba sequence and
3402  * also adjust it for any changes made to the base entry->ba.start and
3403  * entry->ba.end.  The first ba in the chain is of course &entry->ba,
3404  * so we only need to adjust subsequent ba's start, end, and offset.
3405  *
3406  * MAP_BACK_CLIPPED	- Called as part of a clipping replication.
3407  *			  Do not clear OBJ_ONEMAPPING.
3408  *
3409  * MAP_BACK_BASEOBJREFD - Called from vm_map_insert().  The base object
3410  *			  has already been referenced.
3411  */
3412 static
3413 void
3414 vm_map_backing_replicated(vm_map_t map, vm_map_entry_t entry, int flags)
3415 {
3416 	vm_map_backing_t ba;
3417 	vm_map_backing_t nba;
3418 	vm_object_t object;
3419 
3420 	ba = &entry->ba;
3421 	for (;;) {
3422 		ba->pmap = map->pmap;
3423 
3424 		if (ba->map_object) {
3425 			switch(entry->maptype) {
3426 			case VM_MAPTYPE_NORMAL:
3427 				object = ba->object;
3428 				if (ba != &entry->ba ||
3429 				    (flags & MAP_BACK_BASEOBJREFD) == 0) {
3430 					vm_object_reference_quick(object);
3431 				}
3432 				vm_map_backing_attach(entry, ba);
3433 				if ((flags & MAP_BACK_CLIPPED) == 0 &&
3434 				    object->ref_count > 1) {
3435 					vm_object_clear_flag(object,
3436 							     OBJ_ONEMAPPING);
3437 				}
3438 				break;
3439 			case VM_MAPTYPE_UKSMAP:
3440 				vm_map_backing_attach(entry, ba);
3441 				break;
3442 			default:
3443 				break;
3444 			}
3445 		}
3446 		if (ba->backing_ba == NULL)
3447 			break;
3448 
3449 		/*
3450 		 * NOTE: The aux_info field is retained.
3451 		 */
3452 		nba = kmalloc(sizeof(*nba), M_MAP_BACKING, M_INTWAIT);
3453 		*nba = *ba->backing_ba;
3454 		nba->offset += (ba->start - nba->start);  /* += (new - old) */
3455 		nba->start = ba->start;
3456 		nba->end = ba->end;
3457 		ba->backing_ba = nba;
3458 		ba = nba;
3459 		/* pmap is replaced at the top of the loop */
3460 	}
3461 }
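
/*
 * Worked example (editorial addition, not part of the original source)
 * for the backing_ba replication above: if the backing ba being copied
 * originally covered [0x10000, 0x30000) with offset 0, and the entry
 * being replicated covers only [0x18000, 0x20000), then the copy is
 * adjusted as
 *
 *	nba->offset += (0x18000 - 0x10000);	// offset becomes 0x8000
 *	nba->start = 0x18000;
 *	nba->end   = 0x20000;
 *
 * so the new backing ba describes exactly the slice of the backing
 * object that sits under the (possibly clipped) entry.
 */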
3462 
3463 static
3464 void
3465 vm_map_backing_adjust_start(vm_map_entry_t entry, vm_ooffset_t start)
3466 {
3467 	vm_map_backing_t ba;
3468 
3469 	if (entry->maptype == VM_MAPTYPE_NORMAL) {
3470 		for (ba = &entry->ba; ba; ba = ba->backing_ba) {
3471 			if (ba->object) {
3472 				lockmgr(&ba->object->backing_lk, LK_EXCLUSIVE);
3473 				ba->offset += (start - ba->start);
3474 				ba->start = start;
3475 				lockmgr(&ba->object->backing_lk, LK_RELEASE);
3476 			} else {
3477 				ba->offset += (start - ba->start);
3478 				ba->start = start;
3479 			}
3480 		}
3481 	} else {
3482 		/* not an object and can't be shadowed */
3483 	}
3484 }
3485 
3486 static
3487 void
3488 vm_map_backing_adjust_end(vm_map_entry_t entry, vm_ooffset_t end)
3489 {
3490 	vm_map_backing_t ba;
3491 
3492 	if (entry->maptype == VM_MAPTYPE_NORMAL) {
3493 		for (ba = &entry->ba; ba; ba = ba->backing_ba) {
3494 			if (ba->object) {
3495 				lockmgr(&ba->object->backing_lk, LK_EXCLUSIVE);
3496 				ba->end = end;
3497 				lockmgr(&ba->object->backing_lk, LK_RELEASE);
3498 			} else {
3499 				ba->end = end;
3500 			}
3501 		}
3502 	} /* else not an object and/or can't be shadowed */
3503 }
3504 
3505 /*
3506  * Handles the dirty work of making src_entry and dst_entry copy-on-write
3507  * after src_entry has been cloned to dst_entry.  For normal entries only.
3508  *
3509  * The vm_maps must be exclusively locked.
3510  * The vm_map's token must be held.
3511  *
3512  * Because the maps are locked no faults can be in progress during the
3513  * operation.
3514  */
3515 static void
3516 vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map,
3517 		  vm_map_entry_t src_entry, vm_map_entry_t dst_entry)
3518 {
3519 	vm_object_t obj;
3520 
3521 	KKASSERT(dst_entry->maptype == VM_MAPTYPE_NORMAL);
3522 
3523 	if (src_entry->wired_count) {
3524 		/*
3525 		 * Of course, wired down pages can't be set copy-on-write.
3526 		 * Cause wired pages to be copied into the new map by
3527 		 * simulating faults (the new pages are pageable)
3528 		 *
3529 		 * Scrap ba.object (its ref-count has not yet been adjusted
3530 		 * so we can just NULL out the field).  Remove the backing
3531 		 * store.
3532 		 *
3533 		 * Then call vm_fault_copy_entry() to create a new object
3534 		 * in dst_entry and copy the wired pages from src to dst.
3535 		 *
3536 		 * The fault-copy code doesn't work with virtual page
3537 		 * tables.
3538 		 *
3539 		 * NOTE: obj is not actually an object for all MAPTYPEs,
3540 		 *	 just test against NULL.
3541 		 */
3542 		if (dst_entry->ba.map_object != NULL) {
3543 			vm_map_backing_detach(dst_entry, &dst_entry->ba);
3544 			dst_entry->ba.map_object = NULL;
3545 			vm_map_entry_dispose_ba(dst_entry,
3546 						dst_entry->ba.backing_ba);
3547 			dst_entry->ba.backing_ba = NULL;
3548 			dst_entry->ba.backing_count = 0;
3549 		}
3550 		vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry);
3551 	} else {
3552 		if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
3553 			/*
3554 			 * If the source entry is not already marked NEEDS_COPY
3555 			 * we need to write-protect the PTEs.
3556 			 */
3557 			pmap_protect(src_map->pmap,
3558 				     src_entry->ba.start,
3559 				     src_entry->ba.end,
3560 				     src_entry->protection & ~VM_PROT_WRITE);
3561 		}
3562 
3563 		/*
3564 		 * dst_entry->ba.object might be stale.  Update it (its
3565 		 * ref-count has not yet been updated so just overwrite
3566 		 * the field).
3567 		 *
3568 		 * If there is no object then we are golden.  Also, in
3569 		 * this situation if there are no backing_ba linkages then
3570 		 * we can set ba.offset to whatever we want.  For now we
3571 		 * set the offset for 0 for make debugging object sizes
3572 		 * set the offset to 0 to make debugging object sizes
3573 		 */
3574 		obj = src_entry->ba.object;
3575 
3576 		if (obj) {
3577 			src_entry->eflags |= (MAP_ENTRY_COW |
3578 					      MAP_ENTRY_NEEDS_COPY);
3579 			dst_entry->eflags |= (MAP_ENTRY_COW |
3580 					      MAP_ENTRY_NEEDS_COPY);
3581 			KKASSERT(dst_entry->ba.offset == src_entry->ba.offset);
3582 		} else {
3583 			dst_entry->ba.offset = 0;
3584 		}
3585 
3586 		/*
3587 		 * Normal, allow the backing_ba link depth to
3588 		 * increase.
3589 		 */
3590 		pmap_copy(dst_map->pmap, src_map->pmap,
3591 			  dst_entry->ba.start,
3592 			  dst_entry->ba.end - dst_entry->ba.start,
3593 			  src_entry->ba.start);
3594 	}
3595 }
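
/*
 * In short: wired source entries have their pages duplicated immediately
 * via vm_fault_copy_entry() (wired pages cannot be left copy-on-write),
 * while pageable entries get their source PTEs write-protected and, when
 * a VM object is present, both entries are marked MAP_ENTRY_COW |
 * MAP_ENTRY_NEEDS_COPY; pmap_copy() then lets the child share the pages
 * until the first write fault performs the actual copy.
 */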
3596 
3597 /*
3598  * Create a vmspace for a new process and its related vm_map based on an
3599  * existing vmspace.  The new map inherits information from the old map
3600  * according to inheritance settings.
3601  *
3602  * The source map must not be locked.
3603  * No requirements.
3604  */
3605 static void vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map,
3606 			  vm_map_entry_t old_entry, int *countp);
3607 static void vmspace_fork_uksmap_entry(struct proc *p2, struct lwp *lp2,
3608 			  vm_map_t old_map, vm_map_t new_map,
3609 			  vm_map_entry_t old_entry, int *countp);
3610 
3611 struct vmspace *
3612 vmspace_fork(struct vmspace *vm1, struct proc *p2, struct lwp *lp2)
3613 {
3614 	struct vmspace *vm2;
3615 	vm_map_t old_map = &vm1->vm_map;
3616 	vm_map_t new_map;
3617 	vm_map_entry_t old_entry;
3618 	int count;
3619 
3620 	lwkt_gettoken(&vm1->vm_map.token);
3621 	vm_map_lock(old_map);
3622 
3623 	vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map));
3624 	lwkt_gettoken(&vm2->vm_map.token);
3625 
3626 	/*
3627 	 * We must bump the timestamp to force any concurrent fault
3628 	 * to retry.
3629 	 */
3630 	bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy,
3631 	      (caddr_t)&vm1->vm_endcopy - (caddr_t)&vm1->vm_startcopy);
3632 	new_map = &vm2->vm_map;	/* XXX */
3633 	new_map->timestamp = 1;
3634 
3635 	vm_map_lock(new_map);
3636 
3637 	count = old_map->nentries;
3638 	count = vm_map_entry_reserve(count + MAP_RESERVE_COUNT);
3639 
3640 	RB_FOREACH(old_entry, vm_map_rb_tree, &old_map->rb_root) {
3641 		switch(old_entry->maptype) {
3642 		case VM_MAPTYPE_SUBMAP:
3643 			panic("vm_map_fork: encountered a submap");
3644 			break;
3645 		case VM_MAPTYPE_UKSMAP:
3646 			vmspace_fork_uksmap_entry(p2, lp2,
3647 						  old_map, new_map,
3648 						  old_entry, &count);
3649 			break;
3650 		case VM_MAPTYPE_NORMAL:
3651 			vmspace_fork_normal_entry(old_map, new_map,
3652 						  old_entry, &count);
3653 			break;
3654 		default:
3655 			/* nothing to do */
3656 			break;
3657 		}
3658 	}
3659 
3660 	new_map->size = old_map->size;
3661 	vm_map_unlock(new_map);
3662 	vm_map_unlock(old_map);
3663 	vm_map_entry_release(count);
3664 
3665 	lwkt_reltoken(&vm2->vm_map.token);
3666 	lwkt_reltoken(&vm1->vm_map.token);
3667 
3668 	return (vm2);
3669 }
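
/*
 * Illustrative only: a minimal sketch (not compiled) of how a fork path
 * might use vmspace_fork().  The surrounding process and lwp setup is
 * assumed and not shown; "fork_vmspace_sketch" is a hypothetical helper,
 * not part of this file's API.
 */
#if 0
static void
fork_vmspace_sketch(struct proc *p1, struct proc *p2, struct lwp *lp2)
{
	/*
	 * Clone the parent's address space according to the per-entry
	 * inheritance settings; the child receives its own
	 * vm_map_backing chains via vm_map_backing_replicated().
	 */
	p2->p_vmspace = vmspace_fork(p1->p_vmspace, p2, lp2);
}
#endif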
3670 
3671 static
3672 void
3673 vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map,
3674 			  vm_map_entry_t old_entry, int *countp)
3675 {
3676 	vm_map_entry_t new_entry;
3677 	vm_map_backing_t ba;
3678 	vm_object_t object;
3679 
3680 	/*
3681 	 * If the backing_ba link list gets too long then fault it
3682 	 * all into the head object and dispose of the list.  We do
3683 	 * this in old_entry prior to cloning in order to benefit both
3684 	 * parent and child.
3685 	 *
3686 	 * We can test our fronting object's size against its
3687 	 * resident_page_count for a really cheap (but probably not perfect)
3688 	 * all-shadowed test, allowing us to disconnect the backing_ba
3689 	 * link list early.
3690 	 */
3691 	object = old_entry->ba.object;
3692 	if (old_entry->ba.backing_ba &&
3693 	    (old_entry->ba.backing_count >= vm_map_backing_limit ||
3694 	     (vm_map_backing_shadow_test && object &&
3695 	      object->size == object->resident_page_count))) {
3696 		/*
3697 		 * If there are too many backing_ba linkages we
3698 		 * If there are too many backing_ba linkages, or the chain
3699 		 * appears fully shadowed, we collapse everything into the head.
3700 		 * This will also remove all the pte's.
3701 		 */
3702 		if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY)
3703 			vm_map_entry_shadow(old_entry);
3704 		if (object == NULL)
3705 			vm_map_entry_allocate_object(old_entry);
3706 		if (vm_fault_collapse(old_map, old_entry) == KERN_SUCCESS) {
3707 			ba = old_entry->ba.backing_ba;
3708 			old_entry->ba.backing_ba = NULL;
3709 			old_entry->ba.backing_count = 0;
3710 			vm_map_entry_dispose_ba(old_entry, ba);
3711 		}
3712 	}
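	/*
	 * Note on the heuristic above: object->size is a page count, so
	 * when every page of the fronting object is resident the deeper
	 * backing objects can no longer be seen through it and the chain
	 * can be collapsed.  The test is an approximation; when it misses
	 * we simply keep the longer chain.
	 */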
3713 	object = NULL;	/* object variable is now invalid */
3714 
3715 	/*
3716 	 * Fork the entry
3717 	 */
3718 	switch (old_entry->inheritance) {
3719 	case VM_INHERIT_NONE:
3720 		break;
3721 	case VM_INHERIT_SHARE:
3722 		/*
3723 		 * Clone the entry as a shared entry.  This will look like
3724 		 * shared memory across the old and the new process.  We must
3725 		 * ensure that the object is allocated.
3726 		 */
3727 		if (old_entry->ba.object == NULL)
3728 			vm_map_entry_allocate_object(old_entry);
3729 
3730 		if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3731 			/*
3732 			 * Create the fronting vm_map_backing for
3733 			 * an entry which needs a copy, plus an extra
3734 			 * ref because we are going to duplicate it
3735 			 * in the fork.
3736 			 *
3737 			 * The call to vm_map_entry_shadow() will also clear
3738 			 * OBJ_ONEMAPPING.
3739 			 *
3740 			 * XXX no more collapse.  Still need extra ref
3741 			 * for the fork.
3742 			 */
3743 			vm_map_entry_shadow(old_entry);
3744 		} else if (old_entry->ba.object) {
3745 			object = old_entry->ba.object;
3746 		}
3747 
3748 		/*
3749 		 * Clone the entry.  We've already bumped the ref on
3750 		 * the vm_object for our new entry.
3751 		 */
3752 		new_entry = vm_map_entry_create(countp);
3753 		*new_entry = *old_entry;
3754 
3755 		new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3756 		new_entry->wired_count = 0;
3757 
3758 		/*
3759 		 * Replicate and index the vm_map_backing.  Don't share
3760 		 * the vm_map_backing across vm_map's (only across clips).
3761 		 *
3762 		 * Insert the entry into the new map -- we know we're
3763 		 * inserting at the end of the new map.
3764 		 */
3765 		vm_map_backing_replicated(new_map, new_entry, 0);
3766 		vm_map_entry_link(new_map, new_entry);
3767 
3768 		/*
3769 		 * Update the physical map
3770 		 */
3771 		pmap_copy(new_map->pmap, old_map->pmap,
3772 			  new_entry->ba.start,
3773 			  (old_entry->ba.end - old_entry->ba.start),
3774 			  old_entry->ba.start);
3775 		break;
3776 	case VM_INHERIT_COPY:
3777 		/*
3778 		 * Clone the entry and link the copy into the new map.
3779 		 *
3780 		 * Note that ref-counting adjustment for old_entry->ba.object
3781 		 * (if it isn't a special map that is) is handled by
3782 		 * vm_map_copy_entry().
3783 		 */
3784 		new_entry = vm_map_entry_create(countp);
3785 		*new_entry = *old_entry;
3786 
3787 		new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3788 		new_entry->wired_count = 0;
3789 
3790 		vm_map_backing_replicated(new_map, new_entry, 0);
3791 		vm_map_entry_link(new_map, new_entry);
3792 
3793 		/*
3794 		 * This does the actual dirty work of making both entries
3795 		 * copy-on-write, and will also handle the fronting object.
3796 		 */
3797 		vm_map_copy_entry(old_map, new_map, old_entry, new_entry);
3798 		break;
3799 	}
3800 }
3801 
3802 /*
3803  * When forking user-kernel shared maps, the map might change in the
3804  * child so do not try to copy the underlying pmap entries.
3805  */
3806 static
3807 void
3808 vmspace_fork_uksmap_entry(struct proc *p2, struct lwp *lp2,
3809 			  vm_map_t old_map, vm_map_t new_map,
3810 			  vm_map_entry_t old_entry, int *countp)
3811 {
3812 	vm_map_entry_t new_entry;
3813 
3814 	/*
3815 	 * Do not fork lpmap entries whose TID does not match lp2's tid.
3816 	 *
3817 	 * XXX if p2 is NULL and lp2 is non-NULL, we retain the lpmap entry
3818 	 * (this is for e.g. resident'ing vmspaces) but set the field to
3819 	 * NULL; it should be re-pointed on restore.  XXX NOT IMPL YET
3820 	 */
3821 	if (old_entry->aux.dev) {
3822 		switch(minor(old_entry->aux.dev)) {
3823 		case 5:
3824 			break;
3825 		case 6:
3826 			break;
3827 		case 7:
3828 			if (lp2 == NULL)
3829 				return;
3830 			if (old_entry->ba.aux_info == NULL)
3831 				return;
3832 			if (((struct lwp *)old_entry->ba.aux_info)->lwp_tid !=
3833 			    lp2->lwp_tid)
3834 				return;
3835 			break;
3836 		}
3837 	}
3838 
3839 	new_entry = vm_map_entry_create(countp);
3840 	*new_entry = *old_entry;
3841 
3842 	new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3843 	new_entry->wired_count = 0;
3844 	KKASSERT(new_entry->ba.backing_ba == NULL);
3845 
3846 	if (new_entry->aux.dev) {
3847 		switch(minor(new_entry->aux.dev)) {
3848 		case 5:
3849 			/*
3850 			 * upmap
3851 			 */
3852 			new_entry->ba.aux_info = p2;
3853 			break;
3854 		case 6:
3855 			/*
3856 			 * kpmap
3857 			 */
3858 			new_entry->ba.aux_info = NULL;
3859 			break;
3860 		case 7:
3861 			/*
3862 			 * lpmap
3863 			 */
3864 			new_entry->ba.aux_info = lp2;
3865 			break;
3866 		}
3867 	} else {
3868 		new_entry->ba.aux_info = NULL;
3869 	}
3870 
3871 	vm_map_backing_replicated(new_map, new_entry, 0);
3872 
3873 	vm_map_entry_link(new_map, new_entry);
3874 }
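
/*
 * The minor numbers tested above select the per-process upmap (5), the
 * shared kpmap (6), and the per-lwp lpmap (7); aux_info is re-pointed
 * at the child proc for the upmap, cleared for the shared kpmap, and
 * re-pointed at the child lwp for the lpmap.
 */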
3875 
3876 /*
3877  * Create an auto-grow stack entry
3878  *
3879  * No requirements.
3880  */
3881 int
3882 vm_map_stack (vm_map_t map, vm_offset_t *addrbos, vm_size_t max_ssize,
3883 	      int flags, vm_prot_t prot, vm_prot_t max, int cow)
3884 {
3885 	vm_map_entry_t	prev_entry;
3886 	vm_map_entry_t	next;
3887 	vm_size_t	init_ssize;
3888 	int		rv;
3889 	int		count;
3890 	vm_offset_t	tmpaddr;
3891 
3892 	cow |= MAP_IS_STACK;
3893 
3894 	if (max_ssize < sgrowsiz)
3895 		init_ssize = max_ssize;
3896 	else
3897 		init_ssize = sgrowsiz;
3898 
3899 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3900 	vm_map_lock(map);
3901 
3902 	/*
3903 	 * Find space for the mapping
3904 	 */
3905 	if ((flags & (MAP_FIXED | MAP_TRYFIXED)) == 0) {
3906 		if (vm_map_findspace(map, *addrbos, max_ssize, 1,
3907 				     flags, &tmpaddr)) {
3908 			vm_map_unlock(map);
3909 			vm_map_entry_release(count);
3910 			return (KERN_NO_SPACE);
3911 		}
3912 		*addrbos = tmpaddr;
3913 	}
3914 
3915 	/* If addr is already mapped, no go */
3916 	if (vm_map_lookup_entry(map, *addrbos, &prev_entry)) {
3917 		vm_map_unlock(map);
3918 		vm_map_entry_release(count);
3919 		return (KERN_NO_SPACE);
3920 	}
3921 
3922 #if 0
3923 	/* XXX already handled by kern_mmap() */
3924 	/* If we would blow our VMEM resource limit, no go */
3925 	if (map->size + init_ssize >
3926 	    curproc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
3927 		vm_map_unlock(map);
3928 		vm_map_entry_release(count);
3929 		return (KERN_NO_SPACE);
3930 	}
3931 #endif
3932 
3933 	/*
3934 	 * If we can't accommodate max_ssize in the current mapping,
3935 	 * no go.  However, we need to be aware that subsequent user
3936 	 * mappings might map into the space we have reserved for
3937 	 * stack, and currently this space is not protected.
3938 	 *
3939 	 * Hopefully we will at least detect this condition
3940 	 * when we try to grow the stack.
3941 	 */
3942 	if (prev_entry)
3943 		next = vm_map_rb_tree_RB_NEXT(prev_entry);
3944 	else
3945 		next = RB_MIN(vm_map_rb_tree, &map->rb_root);
3946 
3947 	if (next && next->ba.start < *addrbos + max_ssize) {
3948 		vm_map_unlock(map);
3949 		vm_map_entry_release(count);
3950 		return (KERN_NO_SPACE);
3951 	}
3952 
3953 	/*
3954 	 * We initially map a stack of only init_ssize.  We will
3955 	 * grow as needed later.  Since this is to be a grow
3956 	 * down stack, we map at the top of the range.
3957 	 *
3958 	 * Note: we would normally expect prot and max to be
3959 	 * VM_PROT_ALL, and cow to be 0.  Possibly we should
3960 	 * eliminate these as input parameters, and just
3961 	 * pass these values here in the insert call.
3962 	 */
3963 	rv = vm_map_insert(map, &count,
3964 			   NULL, NULL,
3965 			   0, NULL,
3966 			   *addrbos + max_ssize - init_ssize,
3967 	                   *addrbos + max_ssize,
3968 			   VM_MAPTYPE_NORMAL,
3969 			   VM_SUBSYS_STACK, prot, max, cow);
3970 
3971 	/* Now set the avail_ssize amount */
3972 	if (rv == KERN_SUCCESS) {
3973 		if (prev_entry)
3974 			next = vm_map_rb_tree_RB_NEXT(prev_entry);
3975 		else
3976 			next = RB_MIN(vm_map_rb_tree, &map->rb_root);
3977 		if (prev_entry != NULL) {
3978 			vm_map_clip_end(map,
3979 					prev_entry,
3980 					*addrbos + max_ssize - init_ssize,
3981 					&count);
3982 		}
3983 		if (next->ba.end   != *addrbos + max_ssize ||
3984 		    next->ba.start != *addrbos + max_ssize - init_ssize){
3985 			panic ("Bad entry start/end for new stack entry");
3986 		} else {
3987 			next->aux.avail_ssize = max_ssize - init_ssize;
3988 		}
3989 	}
3990 
3991 	vm_map_unlock(map);
3992 	vm_map_entry_release(count);
3993 	return (rv);
3994 }
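
/*
 * Illustrative only: a minimal sketch (not compiled) of creating an
 * auto-grow stack with vm_map_stack().  The size is a hypothetical
 * example value; with no MAP_FIXED/MAP_TRYFIXED flag the address is
 * chosen by vm_map_findspace() using *addrbos as a hint.
 */
#if 0
static int
stack_create_sketch(vm_map_t map)
{
	vm_offset_t addrbos = 0;		/* let the VM pick a range */
	vm_size_t max_ssize = 8UL * 1024 * 1024;

	return (vm_map_stack(map, &addrbos, max_ssize, 0,
			     VM_PROT_ALL, VM_PROT_ALL, 0));
}
#endif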
3995 
3996 /*
3997  * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if the
3998  * desired address is already mapped, or if we successfully grow
3999  * the stack.  Also returns KERN_SUCCESS if addr is outside the
4000  * stack range (this is strange, but preserves compatibility with
4001  * the grow function in vm_machdep.c).
4002  *
4003  * No requirements.
4004  */
4005 int
4006 vm_map_growstack (vm_map_t map, vm_offset_t addr)
4007 {
4008 	vm_map_entry_t prev_entry;
4009 	vm_map_entry_t stack_entry;
4010 	vm_map_entry_t next;
4011 	struct vmspace *vm;
4012 	struct lwp *lp;
4013 	struct proc *p;
4014 	vm_offset_t    end;
4015 	int grow_amount;
4016 	int rv = KERN_SUCCESS;
4017 	int is_procstack;
4018 	int use_read_lock = 1;
4019 	int count;
4020 
4021 	/*
4022 	 * Find the vm
4023 	 */
4024 	lp = curthread->td_lwp;
4025 	p = curthread->td_proc;
4026 	KKASSERT(lp != NULL);
4027 	vm = lp->lwp_vmspace;
4028 
4029 	/*
4030 	 * Growstack is only allowed on the current process.  We disallow
4031 	 * other use cases, e.g. trying to access memory via procfs that
4032 	 * the stack hasn't grown into.
4033 	 */
4034 	if (map != &vm->vm_map) {
4035 		return KERN_FAILURE;
4036 	}
4037 
4038 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
4039 Retry:
4040 	if (use_read_lock)
4041 		vm_map_lock_read(map);
4042 	else
4043 		vm_map_lock(map);
4044 
4045 	/*
4046 	 * If addr is already in the entry range, no need to grow.
4047 	 * prev_entry returns NULL if addr is at the head.
4048 	 */
4049 	if (vm_map_lookup_entry(map, addr, &prev_entry))
4050 		goto done;
4051 	if (prev_entry)
4052 		stack_entry = vm_map_rb_tree_RB_NEXT(prev_entry);
4053 	else
4054 		stack_entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
4055 
4056 	if (stack_entry == NULL)
4057 		goto done;
4058 	if (prev_entry == NULL)
4059 		end = stack_entry->ba.start - stack_entry->aux.avail_ssize;
4060 	else
4061 		end = prev_entry->ba.end;
4062 
4063 	/*
4064 	 * This next test mimics the old grow function in vm_machdep.c.
4065 	 * It really doesn't quite make sense, but we do it anyway
4066 	 * for compatibility.
4067 	 *
4068 	 * If the stack is not growable, return success.  This signals the
4069 	 * caller to proceed as it normally would with normal vm.
4070 	 */
4071 	if (stack_entry->aux.avail_ssize < 1 ||
4072 	    addr >= stack_entry->ba.start ||
4073 	    addr <  stack_entry->ba.start - stack_entry->aux.avail_ssize) {
4074 		goto done;
4075 	}
4076 
4077 	/* Find the minimum grow amount */
4078 	grow_amount = roundup (stack_entry->ba.start - addr, PAGE_SIZE);
4079 	if (grow_amount > stack_entry->aux.avail_ssize) {
4080 		rv = KERN_NO_SPACE;
4081 		goto done;
4082 	}
4083 
4084 	/*
4085 	 * If there is no longer enough space between the entries,
4086 	 * no go; adjust the available space.  Note: this
4087 	 * should only happen if the user has mapped into the
4088 	 * stack area after the stack was created, and is
4089 	 * probably an error.
4090 	 *
4091 	 * This also effectively destroys any guard page the user
4092 	 * might have intended by limiting the stack size.
4093 	 */
4094 	if (grow_amount > stack_entry->ba.start - end) {
4095 		if (use_read_lock && vm_map_lock_upgrade(map)) {
4096 			/* lost lock */
4097 			use_read_lock = 0;
4098 			goto Retry;
4099 		}
4100 		use_read_lock = 0;
4101 		stack_entry->aux.avail_ssize = stack_entry->ba.start - end;
4102 		rv = KERN_NO_SPACE;
4103 		goto done;
4104 	}
4105 
4106 	is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr;
4107 
4108 	/* If this is the main process stack, see if we're over the
4109 	 * stack limit.
4110 	 */
4111 	if (is_procstack && (vm->vm_ssize + grow_amount >
4112 			     p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
4113 		rv = KERN_NO_SPACE;
4114 		goto done;
4115 	}
4116 
4117 	/* Round the grow amount up to a multiple of sgrowsiz */
4118 	grow_amount = roundup (grow_amount, sgrowsiz);
4119 	if (grow_amount > stack_entry->aux.avail_ssize) {
4120 		grow_amount = stack_entry->aux.avail_ssize;
4121 	}
4122 	if (is_procstack && (vm->vm_ssize + grow_amount >
4123 	                     p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
4124 		grow_amount = p->p_rlimit[RLIMIT_STACK].rlim_cur - vm->vm_ssize;
4125 	}
4126 
4127 	/* If we would blow our VMEM resource limit, no go */
4128 	if (map->size + grow_amount > p->p_rlimit[RLIMIT_VMEM].rlim_cur) {
4129 		rv = KERN_NO_SPACE;
4130 		goto done;
4131 	}
4132 
4133 	if (use_read_lock && vm_map_lock_upgrade(map)) {
4134 		/* lost lock */
4135 		use_read_lock = 0;
4136 		goto Retry;
4137 	}
4138 	use_read_lock = 0;
4139 
4140 	/* Get the preliminary new entry start value */
4141 	addr = stack_entry->ba.start - grow_amount;
4142 
4143 	/* If this puts us into the previous entry, cut back our growth
4144 	 * to the available space.  Also, see the note above.
4145 	 */
4146 	if (addr < end) {
4147 		stack_entry->aux.avail_ssize = stack_entry->ba.start - end;
4148 		addr = end;
4149 	}
4150 
4151 	rv = vm_map_insert(map, &count,
4152 			   NULL, NULL,
4153 			   0, NULL,
4154 			   addr, stack_entry->ba.start,
4155 			   VM_MAPTYPE_NORMAL,
4156 			   VM_SUBSYS_STACK, VM_PROT_ALL, VM_PROT_ALL, 0);
4157 
4158 	/* Adjust the available stack space by the amount we grew. */
4159 	if (rv == KERN_SUCCESS) {
4160 		if (prev_entry) {
4161 			vm_map_clip_end(map, prev_entry, addr, &count);
4162 			next = vm_map_rb_tree_RB_NEXT(prev_entry);
4163 		} else {
4164 			next = RB_MIN(vm_map_rb_tree, &map->rb_root);
4165 		}
4166 		if (next->ba.end != stack_entry->ba.start  ||
4167 		    next->ba.start != addr) {
4168 			panic ("Bad stack grow start/end in new stack entry");
4169 		} else {
4170 			next->aux.avail_ssize =
4171 				stack_entry->aux.avail_ssize -
4172 				(next->ba.end - next->ba.start);
4173 			if (is_procstack) {
4174 				vm->vm_ssize += next->ba.end -
4175 						next->ba.start;
4176 			}
4177 		}
4178 
4179 		if (map->flags & MAP_WIREFUTURE) {
4180 			vm_map_user_wiring(map,
4181 					   next->ba.start,
4182 					   next->ba.end,
4183 					   FALSE);
4184 		}
4185 	}
4186 
4187 done:
4188 	if (use_read_lock)
4189 		vm_map_unlock_read(map);
4190 	else
4191 		vm_map_unlock(map);
4192 	vm_map_entry_release(count);
4193 	return (rv);
4194 }
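
/*
 * Worked example of the sizing above (illustrative numbers): a fault
 * 0x1800 bytes below ba.start first yields grow_amount = 0x2000 (page
 * roundup), which must fit in aux.avail_ssize; it is then rounded up to
 * sgrowsiz, clamped back to avail_ssize and to the RLIMIT_STACK budget,
 * and the request fails outright if the result would push the map past
 * RLIMIT_VMEM.  The new entry is inserted covering
 * [ba.start - grow_amount, ba.start).
 */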
4195 
4196 /*
4197  * Unshare the specified VM space for exec.  If other processes are
4198  * mapped to it, then create a new one.  The new vmspace starts out empty.
4199  *
4200  * No requirements.
4201  */
4202 void
4203 vmspace_exec(struct proc *p, struct vmspace *vmcopy)
4204 {
4205 	struct vmspace *oldvmspace = p->p_vmspace;
4206 	struct vmspace *newvmspace;
4207 	vm_map_t map = &p->p_vmspace->vm_map;
4208 
4209 	/*
4210 	 * If we are execing a resident vmspace we fork it, otherwise
4211 	 * we create a new vmspace.  Note that exitingcnt is not
4212 	 * copied to the new vmspace.
4213 	 */
4214 	lwkt_gettoken(&oldvmspace->vm_map.token);
4215 	if (vmcopy)  {
4216 		newvmspace = vmspace_fork(vmcopy, NULL, NULL);
4217 		lwkt_gettoken(&newvmspace->vm_map.token);
4218 	} else {
4219 		newvmspace = vmspace_alloc(vm_map_min(map), vm_map_max(map));
4220 		lwkt_gettoken(&newvmspace->vm_map.token);
4221 		bcopy(&oldvmspace->vm_startcopy, &newvmspace->vm_startcopy,
4222 		      (caddr_t)&oldvmspace->vm_endcopy -
4223 		       (caddr_t)&oldvmspace->vm_startcopy);
4224 	}
4225 
4226 	/*
4227 	 * Finish initializing the vmspace before assigning it
4228 	 * to the process.  The vmspace will become the current vmspace
4229 	 * if p == curproc.
4230 	 */
4231 	pmap_pinit2(vmspace_pmap(newvmspace));
4232 	pmap_replacevm(p, newvmspace, 0);
4233 	lwkt_reltoken(&newvmspace->vm_map.token);
4234 	lwkt_reltoken(&oldvmspace->vm_map.token);
4235 	vmspace_rel(oldvmspace);
4236 }
4237 
4238 /*
4239  * Unshare the specified VM space for forcing COW.  This
4240  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
4241  */
4242 void
4243 vmspace_unshare(struct proc *p)
4244 {
4245 	struct vmspace *oldvmspace = p->p_vmspace;
4246 	struct vmspace *newvmspace;
4247 
4248 	lwkt_gettoken(&oldvmspace->vm_map.token);
4249 	if (vmspace_getrefs(oldvmspace) == 1) {
4250 		lwkt_reltoken(&oldvmspace->vm_map.token);
4251 		return;
4252 	}
4253 	newvmspace = vmspace_fork(oldvmspace, NULL, NULL);
4254 	lwkt_gettoken(&newvmspace->vm_map.token);
4255 	pmap_pinit2(vmspace_pmap(newvmspace));
4256 	pmap_replacevm(p, newvmspace, 0);
4257 	lwkt_reltoken(&newvmspace->vm_map.token);
4258 	lwkt_reltoken(&oldvmspace->vm_map.token);
4259 	vmspace_rel(oldvmspace);
4260 }
4261 
4262 /*
4263  * vm_map_hint: return the beginning of the best area suitable for
4264  * creating a new mapping with "prot" protection.
4265  *
4266  * No requirements.
4267  */
4268 vm_offset_t
4269 vm_map_hint(struct proc *p, vm_offset_t addr, vm_prot_t prot)
4270 {
4271 	struct vmspace *vms = p->p_vmspace;
4272 	struct rlimit limit;
4273 	rlim_t dsiz;
4274 
4275 	/*
4276 	 * Acquire datasize limit for mmap() operation,
4277 	 * calculate nearest power of 2.
4278 	 */
4279 	if (kern_getrlimit(RLIMIT_DATA, &limit))
4280 		limit.rlim_cur = maxdsiz;
4281 	dsiz = limit.rlim_cur;
4282 
4283 	if (!randomize_mmap || addr != 0) {
4284 		/*
4285 		 * Set a reasonable start point for the hint if it was
4286 		 * not specified or if it falls within the heap space.
4287 		 * Hinted mmap()s do not allocate out of the heap space.
4288 		 */
4289 		if (addr == 0 ||
4290 		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
4291 		     addr < round_page((vm_offset_t)vms->vm_daddr + dsiz))) {
4292 			addr = round_page((vm_offset_t)vms->vm_daddr + dsiz);
4293 		}
4294 
4295 		return addr;
4296 	}
4297 
4298 	/*
4299 	 * randomize_mmap && addr == 0.  For now randomize the
4300 	 * address within a dsiz range beyond the data limit.
4301 	 */
4302 	addr = (vm_offset_t)vms->vm_daddr + dsiz;
4303 	if (dsiz)
4304 		addr += (karc4random64() & 0x7FFFFFFFFFFFFFFFLU) % dsiz;
4305 	return (round_page(addr));
4306 }
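
/*
 * Example (illustrative numbers): with vm_daddr at 0x40000000 and a
 * 1GB RLIMIT_DATA, a NULL hint (or one landing inside the heap range)
 * is pushed up to round_page(0x40000000 + 1GB) = 0x80000000; with
 * randomize_mmap enabled and no hint, the result is further offset by
 * a random amount of up to dsiz bytes before being page-rounded.
 */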
4307 
4308 /*
4309  * Finds the VM object, offset, and protection for a given virtual address
4310  * in the specified map, assuming a page fault of the type specified.
4311  *
4312  * Leaves the map in question locked for read; return values are guaranteed
4313  * until a vm_map_lookup_done call is performed.  Note that the map argument
4314  * is in/out; the returned map must be used in the call to vm_map_lookup_done.
4315  *
4316  * A handle (out_entry) is returned for use in vm_map_lookup_done, to make
4317  * that fast.
4318  *
4319  * If a lookup is requested with "write protection" specified, the map may
4320  * be changed to perform virtual copying operations, although the data
4321  * referenced will remain the same.
4322  *
4323  * No requirements.
4324  */
4325 int
4326 vm_map_lookup(vm_map_t *var_map,		/* IN/OUT */
4327 	      vm_offset_t vaddr,
4328 	      vm_prot_t fault_typea,
4329 	      vm_map_entry_t *out_entry,	/* OUT */
4330 	      struct vm_map_backing **bap,	/* OUT */
4331 	      vm_pindex_t *pindex,		/* OUT */
4332 	      vm_pindex_t *pcount,		/* OUT */
4333 	      vm_prot_t *out_prot,		/* OUT */
4334 	      int *wflags)			/* OUT */
4335 {
4336 	vm_map_entry_t entry;
4337 	vm_map_t map = *var_map;
4338 	vm_prot_t prot;
4339 	vm_prot_t fault_type = fault_typea;
4340 	int use_read_lock = 1;
4341 	int rv = KERN_SUCCESS;
4342 	int count;
4343 	thread_t td = curthread;
4344 
4345 	/*
4346 	 * vm_map_entry_reserve() implements an important mitigation
4347 	 * against mmap() spam running the kernel out of vm_map_entry
4348 	 * structures, but it can also cause an infinite call recursion.
4349 	 * Use td_nest_count to prevent an infinite recursion (allows
4350 	 * the vm_map code to dig into the pcpu vm_map_entry reserve).
4351 	 */
4352 	count = 0;
4353 	if (td->td_nest_count == 0) {
4354 		++td->td_nest_count;
4355 		count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
4356 		--td->td_nest_count;
4357 	}
4358 RetryLookup:
4359 	if (use_read_lock)
4360 		vm_map_lock_read(map);
4361 	else
4362 		vm_map_lock(map);
4363 
4364 	/*
4365 	 * Always do a full lookup.  The hint doesn't get us much anymore
4366 	 * now that the map is RB'd.
4367 	 */
4368 	cpu_ccfence();
4369 	*out_entry = NULL;
4370 	*bap = NULL;
4371 
4372 	{
4373 		vm_map_entry_t tmp_entry;
4374 
4375 		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
4376 			rv = KERN_INVALID_ADDRESS;
4377 			goto done;
4378 		}
4379 		entry = tmp_entry;
4380 		*out_entry = entry;
4381 	}
4382 
4383 	/*
4384 	 * Handle submaps.
4385 	 */
4386 	if (entry->maptype == VM_MAPTYPE_SUBMAP) {
4387 		vm_map_t old_map = map;
4388 
4389 		*var_map = map = entry->ba.sub_map;
4390 		if (use_read_lock)
4391 			vm_map_unlock_read(old_map);
4392 		else
4393 			vm_map_unlock(old_map);
4394 		use_read_lock = 1;
4395 		goto RetryLookup;
4396 	}
4397 
4398 	/*
4399 	 * Check whether this task is allowed to have this page.
4400 	 * Note the special case for MAP_ENTRY_COW pages with an override.
4401 	 * This is to implement a forced COW for debuggers.
4402 	 */
4403 	if (fault_type & VM_PROT_OVERRIDE_WRITE)
4404 		prot = entry->max_protection;
4405 	else
4406 		prot = entry->protection;
4407 
4408 	fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
4409 	if ((fault_type & prot) != fault_type) {
4410 		rv = KERN_PROTECTION_FAILURE;
4411 		goto done;
4412 	}
4413 
4414 	if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
4415 	    (entry->eflags & MAP_ENTRY_COW) &&
4416 	    (fault_type & VM_PROT_WRITE) &&
4417 	    (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) {
4418 		rv = KERN_PROTECTION_FAILURE;
4419 		goto done;
4420 	}
4421 
4422 	/*
4423 	 * Flag regular pages that are supposed to be wired.  Remove prior
4424 	 * semantics that disallowed protection changes for such pages.
4425 	 *
4426 	 * The prior semantics are not used by modern systems.  Applications
4427 	 * do not assume an inability to change protection modes and may
4428 	 * operate incorrectly if we try to prevent protection changes.
4429 	 *
4430 	 * Modern applications are aware that even for locked memory,
4431 	 * changing protection modes, modifying MAP_PRIVATE mappings,
4432 	 * or fork() may still cause page faults on the locked memory.
4433 	 */
4434 	*wflags = 0;
4435 	if (entry->wired_count) {
4436 		*wflags |= FW_WIRED;
4437 #if 0
4438 		prot = fault_type = entry->protection;
4439 #endif
4440 	}
4441 
4442 	if (curthread->td_lwp && curthread->td_lwp->lwp_vmspace &&
4443 	    pmap_emulate_ad_bits(&curthread->td_lwp->lwp_vmspace->vm_pmap)) {
4444 		if ((prot & VM_PROT_WRITE) == 0)
4445 			fault_type |= VM_PROT_WRITE;
4446 	}
4447 
4448 	/*
4449 	 * Only NORMAL maps are object-based.  UKSMAPs are not.
4450 	 */
4451 	if (entry->maptype != VM_MAPTYPE_NORMAL) {
4452 		*bap = NULL;
4453 		goto skip;
4454 	}
4455 
4456 	/*
4457 	 * If the entry was copy-on-write, we either shadow it now or demote access.
4458 	 */
4459 	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4460 		/*
4461 		 * If we want to write the page, we may as well handle that
4462 		 * now since we've got the map locked.
4463 		 *
4464 		 * If we don't need to write the page, we just demote the
4465 		 * permissions allowed.
4466 		 */
4467 		if (fault_type & VM_PROT_WRITE) {
4468 			/*
4469 			 * Not allowed if TDF_NOFAULT is set as the shadowing
4470 			 * operation can deadlock against the faulting
4471 			 * function due to the copy-on-write.
4472 			 */
4473 			if (curthread->td_flags & TDF_NOFAULT) {
4474 				rv = KERN_FAILURE_NOFAULT;
4475 				goto done;
4476 			}
4477 
4478 			/*
4479 			 * Make a new vm_map_backing + object, and place it
4480 			 * in the object chain.  Note that no new references
4481 			 * have appeared -- one just moved from the map to
4482 			 * the new object.
4483 			 */
4484 			if (use_read_lock && vm_map_lock_upgrade(map)) {
4485 				/* lost lock */
4486 				use_read_lock = 0;
4487 				goto RetryLookup;
4488 			}
4489 			use_read_lock = 0;
4490 			vm_map_entry_shadow(entry);
4491 			*wflags |= FW_DIDCOW;
4492 		} else {
4493 			/*
4494 			 * We're attempting to read a copy-on-write page --
4495 			 * don't allow writes.
4496 			 */
4497 			prot &= ~VM_PROT_WRITE;
4498 		}
4499 	}
4500 
4501 	/*
4502 	 * Create an object if necessary.  This code also handles
4503 	 * partitioning large entries to improve vm_fault performance.
4504 	 */
4505 	if (entry->ba.object == NULL && !map->system_map) {
4506 		if (use_read_lock && vm_map_lock_upgrade(map))  {
4507 			/* lost lock */
4508 			use_read_lock = 0;
4509 			goto RetryLookup;
4510 		}
4511 		use_read_lock = 0;
4512 
4513 		/*
4514 		 * Partition large entries, giving each its own VM object,
4515 		 * to improve concurrent fault performance.  This is only
4516 		 * applicable to userspace.
4517 		 */
4518 		if (map != kernel_map &&
4519 		    entry->maptype == VM_MAPTYPE_NORMAL &&
4520 		    ((entry->ba.start ^ entry->ba.end) &
4521 		     ~MAP_ENTRY_PARTITION_MASK) &&
4522 		    vm_map_partition_enable) {
4523 			if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
4524 				entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
4525 				++mycpu->gd_cnt.v_intrans_coll;
4526 				++mycpu->gd_cnt.v_intrans_wait;
4527 				vm_map_transition_wait(map, 0);
4528 				goto RetryLookup;
4529 			}
4530 			vm_map_entry_partition(map, entry, vaddr, &count);
4531 		}
4532 		vm_map_entry_allocate_object(entry);
4533 	}
4534 
4535 	/*
4536 	 * Return the object/offset from this entry.  If the entry was
4537 	 * copy-on-write or empty, it has been fixed up.
4538 	 */
4539 	*bap = &entry->ba;
4540 
4541 skip:
4542 	*pindex = OFF_TO_IDX((vaddr - entry->ba.start) + entry->ba.offset);
4543 	*pcount = OFF_TO_IDX(entry->ba.end - trunc_page(vaddr));
4544 
4545 	/*
4546 	 * Return whether this is the only map sharing this data.  On
4547 	 * success we return with a read lock held on the map.  On failure
4548 	 * we return with the map unlocked.
4549 	 */
4550 	*out_prot = prot;
4551 done:
4552 	if (rv == KERN_SUCCESS) {
4553 		if (use_read_lock == 0)
4554 			vm_map_lock_downgrade(map);
4555 	} else if (use_read_lock) {
4556 		vm_map_unlock_read(map);
4557 	} else {
4558 		vm_map_unlock(map);
4559 	}
4560 	if (count > 0)
4561 		vm_map_entry_release(count);
4562 
4563 	return (rv);
4564 }
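
/*
 * Illustrative only: a minimal sketch (not compiled) of the
 * vm_map_lookup() / vm_map_lookup_done() pairing a fault path might use.
 * Error handling and the actual paging work are omitted;
 * "lookup_sketch" is a hypothetical helper.
 */
#if 0
static int
lookup_sketch(vm_map_t map, vm_offset_t vaddr)
{
	vm_map_entry_t entry;
	struct vm_map_backing *ba;
	vm_pindex_t pindex;
	vm_pindex_t pcount;
	vm_prot_t prot;
	int wflags;
	int rv;

	rv = vm_map_lookup(&map, vaddr, VM_PROT_READ, &entry, &ba,
			   &pindex, &pcount, &prot, &wflags);
	if (rv != KERN_SUCCESS)
		return (rv);

	/* ... fault in the page named by (ba, pindex) here ... */

	/* Releases the read lock acquired by the successful lookup */
	vm_map_lookup_done(map, entry, 0);
	return (KERN_SUCCESS);
}
#endif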
4565 
4566 /*
4567  * Releases locks acquired by a vm_map_lookup()
4568  * (according to the handle returned by that lookup).
4569  *
4570  * No other requirements.
4571  */
4572 void
4573 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry, int count)
4574 {
4575 	/*
4576 	 * Unlock the main-level map
4577 	 */
4578 	vm_map_unlock_read(map);
4579 	if (count)
4580 		vm_map_entry_release(count);
4581 }
4582 
4583 static void
4584 vm_map_entry_partition(vm_map_t map, vm_map_entry_t entry,
4585 		       vm_offset_t vaddr, int *countp)
4586 {
4587 	vaddr &= ~MAP_ENTRY_PARTITION_MASK;
4588 	vm_map_clip_start(map, entry, vaddr, countp);
4589 	vaddr += MAP_ENTRY_PARTITION_SIZE;
4590 	vm_map_clip_end(map, entry, vaddr, countp);
4591 }
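
/*
 * The clip above confines the faulting entry to the single partition-
 * aligned, partition-sized chunk containing vaddr, so concurrent faults
 * on distant addresses within one huge mapping land on distinct entries
 * (and hence distinct VM objects) instead of serializing on one object.
 */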
4592 
4593 /*
4594  * Quick hack, needs some help to make it more SMP friendly.
4595  */
4596 void
4597 vm_map_interlock(vm_map_t map, struct vm_map_ilock *ilock,
4598 		 vm_offset_t ran_beg, vm_offset_t ran_end)
4599 {
4600 	struct vm_map_ilock *scan;
4601 
4602 	ilock->ran_beg = ran_beg;
4603 	ilock->ran_end = ran_end;
4604 	ilock->flags = 0;
4605 
4606 	spin_lock(&map->ilock_spin);
4607 restart:
4608 	for (scan = map->ilock_base; scan; scan = scan->next) {
4609 		if (ran_end > scan->ran_beg && ran_beg < scan->ran_end) {
4610 			scan->flags |= ILOCK_WAITING;
4611 			ssleep(scan, &map->ilock_spin, 0, "ilock", 0);
4612 			goto restart;
4613 		}
4614 	}
4615 	ilock->next = map->ilock_base;
4616 	map->ilock_base = ilock;
4617 	spin_unlock(&map->ilock_spin);
4618 }
4619 
4620 void
4621 vm_map_deinterlock(vm_map_t map, struct  vm_map_ilock *ilock)
4622 {
4623 	struct vm_map_ilock *scan;
4624 	struct vm_map_ilock **scanp;
4625 
4626 	spin_lock(&map->ilock_spin);
4627 	scanp = &map->ilock_base;
4628 	while ((scan = *scanp) != NULL) {
4629 		if (scan == ilock) {
4630 			*scanp = ilock->next;
4631 			spin_unlock(&map->ilock_spin);
4632 			if (ilock->flags & ILOCK_WAITING)
4633 				wakeup(ilock);
4634 			return;
4635 		}
4636 		scanp = &scan->next;
4637 	}
4638 	spin_unlock(&map->ilock_spin);
4639 	panic("vm_map_deinterlock: missing ilock!");
4640 }
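
/*
 * Illustrative only: a minimal sketch (not compiled) of bracketing a
 * ranged operation with the address-range interlock above;
 * "interlock_sketch" is a hypothetical helper.
 */
#if 0
static void
interlock_sketch(vm_map_t map, vm_offset_t beg, vm_offset_t end)
{
	struct vm_map_ilock ilock;

	vm_map_interlock(map, &ilock, beg, end);
	/* ... operate on [beg, end) excluding overlapping interlockers ... */
	vm_map_deinterlock(map, &ilock);
}
#endif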
4641 
4642 #include "opt_ddb.h"
4643 #ifdef DDB
4644 #include <ddb/ddb.h>
4645 
4646 /*
4647  * Debugging only
4648  */
4649 DB_SHOW_COMMAND(map, vm_map_print)
4650 {
4651 	static int nlines;
4652 	/* XXX convert args. */
4653 	vm_map_t map = (vm_map_t)addr;
4654 	boolean_t full = have_addr;
4655 
4656 	vm_map_entry_t entry;
4657 
4658 	db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
4659 	    (void *)map,
4660 	    (void *)map->pmap, map->nentries, map->timestamp);
4661 	nlines++;
4662 
4663 	if (!full && db_indent)
4664 		return;
4665 
4666 	db_indent += 2;
4667 	RB_FOREACH(entry, vm_map_rb_tree, &map->rb_root) {
4668 		db_iprintf("map entry %p: start=%p, end=%p\n",
4669 		    (void *)entry,
4670 		    (void *)entry->ba.start, (void *)entry->ba.end);
4671 		nlines++;
4672 		{
4673 			static char *inheritance_name[4] =
4674 			{"share", "copy", "none", "donate_copy"};
4675 
4676 			db_iprintf(" prot=%x/%x/%s",
4677 			    entry->protection,
4678 			    entry->max_protection,
4679 			    inheritance_name[(int)(unsigned char)
4680 						entry->inheritance]);
4681 			if (entry->wired_count != 0)
4682 				db_printf(", wired");
4683 		}
4684 		switch(entry->maptype) {
4685 		case VM_MAPTYPE_SUBMAP:
4686 			/* XXX no %qd in kernel.  Truncate entry->ba.offset. */
4687 			db_printf(", share=%p, offset=0x%lx\n",
4688 			    (void *)entry->ba.sub_map,
4689 			    (long)entry->ba.offset);
4690 			nlines++;
4691 
4692 			db_indent += 2;
4693 			vm_map_print((db_expr_t)(intptr_t)entry->ba.sub_map,
4694 				     full, 0, NULL);
4695 			db_indent -= 2;
4696 			break;
4697 		case VM_MAPTYPE_NORMAL:
4698 			/* XXX no %qd in kernel.  Truncate entry->ba.offset. */
4699 			db_printf(", object=%p, offset=0x%lx",
4700 			    (void *)entry->ba.object,
4701 			    (long)entry->ba.offset);
4702 			if (entry->eflags & MAP_ENTRY_COW)
4703 				db_printf(", copy (%s)",
4704 				    ((entry->eflags & MAP_ENTRY_NEEDS_COPY) ?
4705 				     "needed" : "done"));
4706 			db_printf("\n");
4707 			nlines++;
4708 
4709 			if (entry->ba.object) {
4710 				db_indent += 2;
4711 				vm_object_print((db_expr_t)(intptr_t)
4712 						entry->ba.object,
4713 						full, 0, NULL);
4714 				nlines += 4;
4715 				db_indent -= 2;
4716 			}
4717 			break;
4718 		case VM_MAPTYPE_UKSMAP:
4719 			db_printf(", uksmap=%p, offset=0x%lx",
4720 			    (void *)entry->ba.uksmap,
4721 			    (long)entry->ba.offset);
4722 			if (entry->eflags & MAP_ENTRY_COW)
4723 				db_printf(", copy (%s)",
4724 				    (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
4725 			db_printf("\n");
4726 			nlines++;
4727 			break;
4728 		default:
4729 			break;
4730 		}
4731 	}
4732 	db_indent -= 2;
4733 	if (db_indent == 0)
4734 		nlines = 0;
4735 }
4736 
4737 /*
4738  * Debugging only
4739  */
4740 DB_SHOW_COMMAND(procvm, procvm)
4741 {
4742 	struct proc *p;
4743 
4744 	if (have_addr) {
4745 		p = (struct proc *) addr;
4746 	} else {
4747 		p = curproc;
4748 	}
4749 
4750 	db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
4751 	    (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
4752 	    (void *)vmspace_pmap(p->p_vmspace));
4753 
4754 	vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
4755 }
4756 
4757 #endif /* DDB */
4758