xref: /dragonfly/sys/vm/vm_map.c (revision 0982c5b8)
1 /*
2  * Copyright (c) 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * Copyright (c) 2003-2019 The DragonFly Project.  All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * The Mach Operating System project at Carnegie-Mellon University.
8  *
9  * This code is derived from software contributed to The DragonFly Project
10  * by Matthew Dillon <dillon@backplane.com>
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	from: @(#)vm_map.c	8.3 (Berkeley) 1/12/94
37  *
38  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
39  * All rights reserved.
40  *
41  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
42  *
43  * Permission to use, copy, modify and distribute this software and
44  * its documentation is hereby granted, provided that both the copyright
45  * notice and this permission notice appear in all copies of the
46  * software, derivative works or modified versions, and any portions
47  * thereof, and that both notices appear in supporting documentation.
48  *
49  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
50  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
51  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
52  *
53  * Carnegie Mellon requests users of this software to return to
54  *
55  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
56  *  School of Computer Science
57  *  Carnegie Mellon University
58  *  Pittsburgh PA 15213-3890
59  *
60  * any improvements or extensions that they make and grant Carnegie the
61  * rights to redistribute these changes.
62  */
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/kernel.h>
66 #include <sys/proc.h>
67 #include <sys/serialize.h>
68 #include <sys/lock.h>
69 #include <sys/vmmeter.h>
70 #include <sys/mman.h>
71 #include <sys/vnode.h>
72 #include <sys/resourcevar.h>
73 #include <sys/shm.h>
74 #include <sys/tree.h>
75 #include <sys/malloc.h>
76 #include <sys/objcache.h>
77 #include <sys/kern_syscall.h>
78 
79 #include <vm/vm.h>
80 #include <vm/vm_param.h>
81 #include <vm/pmap.h>
82 #include <vm/vm_map.h>
83 #include <vm/vm_page.h>
84 #include <vm/vm_object.h>
85 #include <vm/vm_pager.h>
86 #include <vm/vm_kern.h>
87 #include <vm/vm_extern.h>
88 #include <vm/swap_pager.h>
89 #include <vm/vm_zone.h>
90 
91 #include <sys/random.h>
92 #include <sys/sysctl.h>
93 #include <sys/spinlock.h>
94 
95 #include <sys/thread2.h>
96 #include <sys/spinlock2.h>
97 
98 /*
99  * Virtual memory maps provide for the mapping, protection, and sharing
100  * of virtual memory objects.  In addition, this module provides for an
101  * efficient virtual copy of memory from one map to another.
102  *
103  * Synchronization is required prior to most operations.
104  *
105  * Maps consist of an ordered doubly-linked list of simple entries.
106  * A hint and an RB tree are used to speed up lookups.
107  *
108  * Callers looking to modify maps specify start/end addresses which cause
109  * the related map entry to be clipped if necessary, and then later
110  * recombined if the pieces remain compatible.
111  *
112  * Virtual copy operations are performed by copying VM object references
113  * from one map to another, and then marking both regions as copy-on-write.
114  */
115 static boolean_t vmspace_ctor(void *obj, void *privdata, int ocflags);
116 static void vmspace_dtor(void *obj, void *privdata);
117 static void vmspace_terminate(struct vmspace *vm, int final);
118 
119 MALLOC_DEFINE(M_VMSPACE, "vmspace", "vmspace objcache backingstore");
120 MALLOC_DEFINE(M_MAP_BACKING, "map_backing", "vm_map_backing to entry");
121 static struct objcache *vmspace_cache;
122 
123 /*
124  * per-cpu page table cross mappings are initialized in early boot
125  * and might require a considerable number of vm_map_entry structures.
126  */
127 #define MAPENTRYBSP_CACHE	(MAXCPU+1)
128 #define MAPENTRYAP_CACHE	8
129 
130 /*
131  * Partitioning threaded programs with large anonymous memory areas can
132  * improve concurrent fault performance.
133  */
134 #define MAP_ENTRY_PARTITION_SIZE	((vm_offset_t)(32 * 1024 * 1024))
135 #define MAP_ENTRY_PARTITION_MASK	(MAP_ENTRY_PARTITION_SIZE - 1)
136 
137 #define VM_MAP_ENTRY_WITHIN_PARTITION(entry)	\
138 	((((entry)->ba.start ^ (entry)->ba.end) & ~MAP_ENTRY_PARTITION_MASK) == 0)
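/*
 * Note on the macro above: XORing ba.start and ba.end and masking off the
 * low 25 bits (MAP_ENTRY_PARTITION_SIZE is 32MB == 1 << 25) leaves only the
 * bits that select the 32MB partition, so a zero result means both
 * addresses lie in the same partition window.  For example,
 * 0x02100000 ^ 0x02180000 == 0x00080000, which the mask clears entirely
 * (same window), while 0x01f00000 ^ 0x02100000 == 0x03e00000 leaves
 * 0x02000000 set (the range straddles a 32MB boundary).
 */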
139 
140 static struct vm_zone mapentzone_store;
141 __read_mostly static vm_zone_t mapentzone;
142 
143 static struct vm_map_entry map_entry_init[MAX_MAPENT];
144 static struct vm_map_entry cpu_map_entry_init_bsp[MAPENTRYBSP_CACHE];
145 static struct vm_map_entry cpu_map_entry_init_ap[MAXCPU][MAPENTRYAP_CACHE];
146 
147 __read_mostly static int randomize_mmap;
148 SYSCTL_INT(_vm, OID_AUTO, randomize_mmap, CTLFLAG_RW, &randomize_mmap, 0,
149     "Randomize mmap offsets");
150 __read_mostly static int vm_map_relock_enable = 1;
151 SYSCTL_INT(_vm, OID_AUTO, map_relock_enable, CTLFLAG_RW,
152 	   &vm_map_relock_enable, 0, "insert pop pgtable optimization");
153 __read_mostly static int vm_map_partition_enable = 1;
154 SYSCTL_INT(_vm, OID_AUTO, map_partition_enable, CTLFLAG_RW,
155 	   &vm_map_partition_enable, 0, "Break up larger vm_map_entry's");
156 __read_mostly static int vm_map_backing_limit = 5;
157 SYSCTL_INT(_vm, OID_AUTO, map_backing_limit, CTLFLAG_RW,
158 	   &vm_map_backing_limit, 0, "ba.backing_ba link depth");
159 __read_mostly static int vm_map_backing_shadow_test = 1;
160 SYSCTL_INT(_vm, OID_AUTO, map_backing_shadow_test, CTLFLAG_RW,
161 	   &vm_map_backing_shadow_test, 0, "ba.object shadow test");
162 
163 static void vmspace_drop_notoken(struct vmspace *vm);
164 static void vm_map_entry_shadow(vm_map_entry_t entry);
165 static vm_map_entry_t vm_map_entry_create(int *);
166 static void vm_map_entry_dispose (vm_map_t map, vm_map_entry_t entry, int *);
167 static void vm_map_entry_dispose_ba (vm_map_entry_t entry, vm_map_backing_t ba);
168 static void vm_map_backing_replicated(vm_map_t map,
169 		vm_map_entry_t entry, int flags);
170 static void vm_map_backing_adjust_start(vm_map_entry_t entry,
171 		vm_ooffset_t start);
172 static void vm_map_backing_adjust_end(vm_map_entry_t entry,
173 		vm_ooffset_t end);
174 static void vm_map_backing_attach (vm_map_entry_t entry, vm_map_backing_t ba);
175 static void vm_map_backing_detach (vm_map_entry_t entry, vm_map_backing_t ba);
176 static void _vm_map_clip_end (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
177 static void _vm_map_clip_start (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
178 static void vm_map_entry_delete (vm_map_t, vm_map_entry_t, int *);
179 static void vm_map_entry_unwire (vm_map_t, vm_map_entry_t);
180 static void vm_map_copy_entry (vm_map_t, vm_map_t, vm_map_entry_t,
181 		vm_map_entry_t);
182 static void vm_map_unclip_range (vm_map_t map, vm_map_entry_t start_entry,
183 		vm_offset_t start, vm_offset_t end, int *countp, int flags);
184 static void vm_map_entry_partition(vm_map_t map, vm_map_entry_t entry,
185 		vm_offset_t vaddr, int *countp);
186 
187 #define MAP_BACK_CLIPPED	0x0001
188 #define MAP_BACK_BASEOBJREFD	0x0002
189 
190 /*
191  * Initialize the vm_map module.  Must be called before any other vm_map
192  * routines.
193  *
194  * Map and entry structures are allocated from the general purpose
195  * memory pool with some exceptions:
196  *
197  *	- The kernel map is allocated statically.
198  *	- Initial kernel map entries are allocated out of a static pool.
199  *	- We must set ZONE_SPECIAL here or the early boot code can get
200  *	  stuck if there are >63 cores.
201  *
202  *	These restrictions are necessary since malloc() uses the
203  *	maps and requires map entries.
204  *
205  * Called from the low level boot code only.
206  */
207 void
208 vm_map_startup(void)
209 {
210 	mapentzone = &mapentzone_store;
211 	zbootinit(mapentzone, "MAP ENTRY", sizeof (struct vm_map_entry),
212 		  map_entry_init, MAX_MAPENT);
213 	mapentzone_store.zflags |= ZONE_SPECIAL;
214 }
215 
216 /*
217  * Called prior to any vmspace allocations.
218  *
219  * Called from the low level boot code only.
220  */
221 void
222 vm_init2(void)
223 {
224 	vmspace_cache = objcache_create_mbacked(M_VMSPACE,
225 						sizeof(struct vmspace),
226 						0, ncpus * 4,
227 						vmspace_ctor, vmspace_dtor,
228 						NULL);
229 	zinitna(mapentzone, NULL, 0, 0, ZONE_USE_RESERVE | ZONE_SPECIAL);
230 	pmap_init2();
231 	vm_object_init2();
232 }
233 
234 /*
235  * objcache support.  We leave the pmap root cached as long as possible
236  * for performance reasons.
237  */
238 static
239 boolean_t
240 vmspace_ctor(void *obj, void *privdata, int ocflags)
241 {
242 	struct vmspace *vm = obj;
243 
244 	bzero(vm, sizeof(*vm));
245 	vm->vm_refcnt = VM_REF_DELETED;
246 
247 	return 1;
248 }
249 
250 static
251 void
252 vmspace_dtor(void *obj, void *privdata)
253 {
254 	struct vmspace *vm = obj;
255 
256 	KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
257 	pmap_puninit(vmspace_pmap(vm));
258 }
259 
260 /*
261  * Red black tree functions
262  *
263  * The caller must hold the related map lock.
264  */
265 static int rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b);
266 RB_GENERATE(vm_map_rb_tree, vm_map_entry, rb_entry, rb_vm_map_compare);
267 
268 /* a->ba.start is the address, and the only field which must be initialized */
269 static int
270 rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b)
271 {
272 	if (a->ba.start < b->ba.start)
273 		return(-1);
274 	else if (a->ba.start > b->ba.start)
275 		return(1);
276 	return(0);
277 }
278 
279 /*
280  * Initialize the ref/hold counts of a vmspace, including vmspace0.  There
281  * is a holdcnt for every refcnt.
282  */
283 void
284 vmspace_initrefs(struct vmspace *vm)
285 {
286 	vm->vm_refcnt = 1;
287 	vm->vm_holdcnt = 1;
288 }
289 
290 /*
291  * Allocate a vmspace structure, including a vm_map and pmap.
292  * Initialize numerous fields.  While the initial allocation is zeroed,
293  * subsequent reuse from the objcache leaves elements of the structure
294  * intact (particularly the pmap), so portions must be zeroed.
295  *
296  * Returns a referenced vmspace.
297  *
298  * No requirements.
299  */
300 struct vmspace *
301 vmspace_alloc(vm_offset_t min, vm_offset_t max)
302 {
303 	struct vmspace *vm;
304 
305 	vm = objcache_get(vmspace_cache, M_WAITOK);
306 
307 	bzero(&vm->vm_startcopy,
308 	      (char *)&vm->vm_endcopy - (char *)&vm->vm_startcopy);
309 	vm_map_init(&vm->vm_map, min, max, NULL);	/* initializes token */
310 
311 	/*
312 	 * NOTE: we hold to acquire the token for safety.
313 	 *
314 	 * On return vmspace is referenced (refs=1, hold=1).  That is,
315 	 * each refcnt also has a holdcnt.  There can be additional holds
316 	 * (holdcnt) above and beyond the refcnt.  Finalization is handled in
317 	 * two stages, one on refs 1->0, and the second on hold 1->0.
318 	 */
319 	KKASSERT(vm->vm_holdcnt == 0);
320 	KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
321 	vmspace_initrefs(vm);
322 	vmspace_hold(vm);
323 	pmap_pinit(vmspace_pmap(vm));		/* (some fields reused) */
324 	vm->vm_map.pmap = vmspace_pmap(vm);	/* XXX */
325 	vm->vm_shm = NULL;
326 	vm->vm_flags = 0;
327 	cpu_vmspace_alloc(vm);
328 	vmspace_drop(vm);
329 
330 	return (vm);
331 }
332 
333 /*
334  * NOTE: Can return 0 if the vmspace is exiting.
335  */
336 int
337 vmspace_getrefs(struct vmspace *vm)
338 {
339 	int32_t n;
340 
341 	n = vm->vm_refcnt;
342 	cpu_ccfence();
343 	if (n & VM_REF_DELETED)
344 		n = -1;
345 	return n;
346 }
347 
348 void
349 vmspace_hold(struct vmspace *vm)
350 {
351 	atomic_add_int(&vm->vm_holdcnt, 1);
352 	lwkt_gettoken(&vm->vm_map.token);
353 }
354 
355 /*
356  * Drop with final termination interlock.
357  */
358 void
359 vmspace_drop(struct vmspace *vm)
360 {
361 	lwkt_reltoken(&vm->vm_map.token);
362 	vmspace_drop_notoken(vm);
363 }
364 
365 static void
366 vmspace_drop_notoken(struct vmspace *vm)
367 {
368 	if (atomic_fetchadd_int(&vm->vm_holdcnt, -1) == 1) {
369 		if (vm->vm_refcnt & VM_REF_DELETED)
370 			vmspace_terminate(vm, 1);
371 	}
372 }
373 
374 /*
375  * A vmspace object must not be in a terminated state to be able to obtain
376  * additional refs on it.
377  *
378  * These are official references to the vmspace, the count is used to check
379  * for vmspace sharing.  Foreign accessors should use 'hold' and not 'ref'.
380  *
381  * XXX we need to combine hold & ref together into one 64-bit field to allow
382  * holds to prevent stage-1 termination.
383  */
384 void
385 vmspace_ref(struct vmspace *vm)
386 {
387 	uint32_t n;
388 
389 	atomic_add_int(&vm->vm_holdcnt, 1);
390 	n = atomic_fetchadd_int(&vm->vm_refcnt, 1);
391 	KKASSERT((n & VM_REF_DELETED) == 0);
392 }
393 
394 /*
395  * Release a ref on the vmspace.  On the 1->0 transition we do stage-1
396  * termination of the vmspace.  Then, on the final drop of the hold we
397  * will do stage-2 final termination.
398  */
399 void
400 vmspace_rel(struct vmspace *vm)
401 {
402 	uint32_t n;
403 
404 	/*
405 	 * Drop refs.  Each ref also has a hold which is also dropped.
406 	 *
407 	 * When refs hits 0, compete to get the VM_REF_DELETED flag (the hold
408 	 * prevents finalization) to start termination processing.
409 	 * Finalization occurs when the last hold count drops to 0.
410 	 */
411 	n = atomic_fetchadd_int(&vm->vm_refcnt, -1) - 1;
412 	while (n == 0) {
413 		if (atomic_cmpset_int(&vm->vm_refcnt, 0, VM_REF_DELETED)) {
414 			vmspace_terminate(vm, 0);
415 			break;
416 		}
417 		n = vm->vm_refcnt;
418 		cpu_ccfence();
419 	}
420 	vmspace_drop_notoken(vm);
421 }
422 
423 /*
424  * This is called during exit to indicate that the vmspace is no
425  * longer in use by the exiting process, but the process has not yet
426  * been reaped.
427  *
428  * We drop refs, allowing for stage-1 termination, but maintain a holdcnt
429  * to prevent stage-2 until the process is reaped.  Note the order of
430  * operations: we must hold first.
431  *
432  * No requirements.
433  */
434 void
435 vmspace_relexit(struct vmspace *vm)
436 {
437 	atomic_add_int(&vm->vm_holdcnt, 1);
438 	vmspace_rel(vm);
439 }
440 
441 /*
442  * Called during reap to disconnect the remainder of the vmspace from
443  * the process.  On the hold drop the vmspace termination is finalized.
444  *
445  * No requirements.
446  */
447 void
448 vmspace_exitfree(struct proc *p)
449 {
450 	struct vmspace *vm;
451 
452 	vm = p->p_vmspace;
453 	p->p_vmspace = NULL;
454 	vmspace_drop_notoken(vm);
455 }
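
/*
 * Taken together, vmspace_relexit(), vmspace_exitfree() and
 * vmspace_terminate() give the following exit/reap sequence for a
 * process vmspace:
 *
 *	at exit:  vmspace_relexit(vm)  - refs 1->0 runs stage-1 termination
 *					 (VMSPACE_EXIT1); the extra hold taken
 *					 here keeps stage-2 from running.
 *	at reap:  vmspace_exitfree(p)  - drops that hold; on the final hold
 *					 1->0 transition vmspace_terminate()
 *					 runs stage-2 (VMSPACE_EXIT2) and the
 *					 vmspace returns to the objcache.
 */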
456 
457 /*
458  * Called in two cases:
459  *
460  * (1) When the last refcnt is dropped and the vmspace becomes inactive,
461  *     called with final == 0.  refcnt will be (u_int)-1 at this point,
462  *     and holdcnt will still be non-zero.
463  *
464  * (2) When holdcnt becomes 0, called with final == 1.  There should no
465  *     longer be anyone with access to the vmspace.
466  *
467  * VMSPACE_EXIT1 flags the primary deactivation
468  * VMSPACE_EXIT2 flags the last reap
469  */
470 static void
471 vmspace_terminate(struct vmspace *vm, int final)
472 {
473 	int count;
474 
475 	lwkt_gettoken(&vm->vm_map.token);
476 	if (final == 0) {
477 		KKASSERT((vm->vm_flags & VMSPACE_EXIT1) == 0);
478 		vm->vm_flags |= VMSPACE_EXIT1;
479 
480 		/*
481 		 * Get rid of most of the resources.  Leave the kernel pmap
482 		 * intact.
483 		 *
484 		 * If the pmap does not contain wired pages we can bulk-delete
485 		 * the pmap as a performance optimization before removing the
486 		 * related mappings.
487 		 *
488 		 * If the pmap contains wired pages we cannot do this
489 		 * pre-optimization because currently vm_fault_unwire()
490 		 * expects the pmap pages to exist and will not decrement
491 		 * p->wire_count if they do not.
492 		 */
493 		shmexit(vm);
494 		if (vmspace_pmap(vm)->pm_stats.wired_count) {
495 			vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
496 				      VM_MAX_USER_ADDRESS);
497 			pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
498 					  VM_MAX_USER_ADDRESS);
499 		} else {
500 			pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
501 					  VM_MAX_USER_ADDRESS);
502 			vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
503 				      VM_MAX_USER_ADDRESS);
504 		}
505 		lwkt_reltoken(&vm->vm_map.token);
506 	} else {
507 		KKASSERT((vm->vm_flags & VMSPACE_EXIT1) != 0);
508 		KKASSERT((vm->vm_flags & VMSPACE_EXIT2) == 0);
509 
510 		/*
511 		 * Get rid of remaining basic resources.
512 		 */
513 		vm->vm_flags |= VMSPACE_EXIT2;
514 		shmexit(vm);
515 
516 		count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
517 		vm_map_lock(&vm->vm_map);
518 		cpu_vmspace_free(vm);
519 
520 		/*
521 		 * Lock the map, to wait out all other references to it.
522 		 * Delete all of the mappings and pages they hold, then call
523 		 * the pmap module to reclaim anything left.
524 		 */
525 		vm_map_delete(&vm->vm_map,
526 			      vm_map_min(&vm->vm_map),
527 			      vm_map_max(&vm->vm_map),
528 			      &count);
529 		vm_map_unlock(&vm->vm_map);
530 		vm_map_entry_release(count);
531 
532 		pmap_release(vmspace_pmap(vm));
533 		lwkt_reltoken(&vm->vm_map.token);
534 		objcache_put(vmspace_cache, vm);
535 	}
536 }
537 
538 /*
539  * Swap usage is determined by taking the proportional swap used by
540  * VM objects backing the VM map.  To make up for fractional losses,
541  * if the VM object has any swap use at all, the associated map entries
542  * count for at least 1 swap page.
543  *
544  * No requirements.
545  */
546 vm_offset_t
547 vmspace_swap_count(struct vmspace *vm)
548 {
549 	vm_map_t map = &vm->vm_map;
550 	vm_map_entry_t cur;
551 	vm_object_t object;
552 	vm_offset_t count = 0;
553 	vm_offset_t n;
554 
555 	vmspace_hold(vm);
556 
557 	RB_FOREACH(cur, vm_map_rb_tree, &map->rb_root) {
558 		switch(cur->maptype) {
559 		case VM_MAPTYPE_NORMAL:
560 			if ((object = cur->ba.object) == NULL)
561 				break;
562 			if (object->swblock_count) {
563 				n = (cur->ba.end - cur->ba.start) / PAGE_SIZE;
564 				count += object->swblock_count *
565 				    SWAP_META_PAGES * n / object->size + 1;
566 			}
567 			break;
568 		default:
569 			break;
570 		}
571 	}
572 	vmspace_drop(vm);
573 
574 	return(count);
575 }
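
/*
 * The proportional charge computed above works out to:
 *
 *	n      = entry pages = (ba.end - ba.start) / PAGE_SIZE
 *	count += (swblock_count * SWAP_META_PAGES) * n / object->size + 1
 *
 * i.e. the object's swap blocks (each covering up to SWAP_META_PAGES
 * pages) are charged to the entry in proportion to the fraction of the
 * object (object->size is in pages) that the entry maps, plus one page
 * so that any object with swap use counts for at least one page.
 */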
576 
577 /*
578  * Calculate the approximate number of anonymous pages in use by
579  * this vmspace.  To make up for fractional losses, we count each
580  * VM object as having at least 1 anonymous page.
581  *
582  * No requirements.
583  */
584 vm_offset_t
585 vmspace_anonymous_count(struct vmspace *vm)
586 {
587 	vm_map_t map = &vm->vm_map;
588 	vm_map_entry_t cur;
589 	vm_object_t object;
590 	vm_offset_t count = 0;
591 
592 	vmspace_hold(vm);
593 	RB_FOREACH(cur, vm_map_rb_tree, &map->rb_root) {
594 		switch(cur->maptype) {
595 		case VM_MAPTYPE_NORMAL:
596 			if ((object = cur->ba.object) == NULL)
597 				break;
598 			if (object->type != OBJT_DEFAULT &&
599 			    object->type != OBJT_SWAP) {
600 				break;
601 			}
602 			count += object->resident_page_count;
603 			break;
604 		default:
605 			break;
606 		}
607 	}
608 	vmspace_drop(vm);
609 
610 	return(count);
611 }
612 
613 /*
614  * Initialize an existing vm_map structure such as that in the vmspace
615  * structure.  The pmap is initialized elsewhere.
616  *
617  * No requirements.
618  */
619 void
620 vm_map_init(struct vm_map *map, vm_offset_t min_addr, vm_offset_t max_addr,
621 	    pmap_t pmap)
622 {
623 	RB_INIT(&map->rb_root);
624 	spin_init(&map->ilock_spin, "ilock");
625 	map->ilock_base = NULL;
626 	map->nentries = 0;
627 	map->size = 0;
628 	map->system_map = 0;
629 	vm_map_min(map) = min_addr;
630 	vm_map_max(map) = max_addr;
631 	map->pmap = pmap;
632 	map->timestamp = 0;
633 	map->flags = 0;
634 	bzero(&map->freehint, sizeof(map->freehint));
635 	lwkt_token_init(&map->token, "vm_map");
636 	lockinit(&map->lock, "vm_maplk", (hz + 9) / 10, 0);
637 }
638 
639 /*
640  * Find the first possible free address for the specified request length.
641  * Returns 0 if we don't have one cached.
642  */
643 static
644 vm_offset_t
645 vm_map_freehint_find(vm_map_t map, vm_size_t length, vm_size_t align)
646 {
647 	vm_map_freehint_t *scan;
648 
649 	scan = &map->freehint[0];
650 	while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
651 		if (scan->length == length && scan->align == align)
652 			return(scan->start);
653 		++scan;
654 	}
655 	return 0;
656 }
657 
658 /*
659  * Unconditionally set the freehint.  Called by vm_map_findspace() after
660  * it finds an address.  This will help us iterate optimally on the next
661  * similar findspace.
662  */
663 static
664 void
665 vm_map_freehint_update(vm_map_t map, vm_offset_t start,
666 		       vm_size_t length, vm_size_t align)
667 {
668 	vm_map_freehint_t *scan;
669 
670 	scan = &map->freehint[0];
671 	while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
672 		if (scan->length == length && scan->align == align) {
673 			scan->start = start;
674 			return;
675 		}
676 		++scan;
677 	}
678 	scan = &map->freehint[map->freehint_newindex & VM_MAP_FFMASK];
679 	scan->start = start;
680 	scan->align = align;
681 	scan->length = length;
682 	++map->freehint_newindex;
683 }
684 
685 /*
686  * Update any existing freehints (for any alignment), for the hole we just
687  * added.
688  */
689 static
690 void
691 vm_map_freehint_hole(vm_map_t map, vm_offset_t start, vm_size_t length)
692 {
693 	vm_map_freehint_t *scan;
694 
695 	scan = &map->freehint[0];
696 	while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
697 		if (scan->length <= length && scan->start > start)
698 			scan->start = start;
699 		++scan;
700 	}
701 }
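
/*
 * Taken together the freehint routines implement a small fixed-size
 * (VM_MAP_FFCOUNT) cache of (length, align) -> start hints:
 *
 *	vm_map_freehint_find()   - return a cached starting point for a
 *				   given length/alignment, or 0 if none.
 *	vm_map_freehint_update() - record the address vm_map_findspace()
 *				   actually found, recycling slots
 *				   round-robin via freehint_newindex.
 *	vm_map_freehint_hole()   - pull matching hints back down when a
 *				   hole is created below their cached start.
 */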
702 
703 /*
704  * This function handles MAP_ENTRY_NEEDS_COPY by inserting a fronting
705  * object in the entry for COW faults.
706  *
707  * The entire chain including entry->ba (prior to inserting the fronting
708  * object) essentially becomes set in stone... elements of it can be paged
709  * in or out, but cannot be further modified.
710  *
711  * NOTE: If we do not optimize the backing chain then a unique copy is not
712  *	 needed.  Note, however, that because portions of the chain are
713  *	 shared across pmaps we cannot make any changes to the vm_map_backing
714  *	 elements themselves.
715  *
716  * If the map segment is governed by a virtual page table then it is
717  * possible to address offsets beyond the mapped area.  Just allocate
718  * a maximally sized object for this case.
719  *
720  * If addref is non-zero an additional reference is added to the returned
721  * entry.  This mechanic exists because the additional reference might have
722  * to be added atomically and not after return to prevent a premature
723  * collapse.  XXX currently there is no collapse code.
724  *
725  * The vm_map must be exclusively locked.
726  * No other requirements.
727  */
728 static
729 void
730 vm_map_entry_shadow(vm_map_entry_t entry)
731 {
732 	vm_map_backing_t ba;
733 	vm_size_t length;
734 	vm_object_t source;
735 	vm_object_t result;
736 
737 	/*
738 	 * Number of pages we have to shadow
739 	 */
740 	length = atop(entry->ba.end - entry->ba.start);
741 
742 	/*
743 	 * Don't create the new object if the old object isn't shared.
744 	 * This case occurs quite often when programs fork/exec/wait.
745 	 *
746 	 * Caller ensures source exists (all backing_ba's must have objects),
747 	 * typically indirectly by virtue of the NEEDS_COPY flag being set.
748 	 * We have a ref on source by virtue of the entry and do not need
749 	 * to lock it to do this test.
750 	 */
751 	source = entry->ba.object;
752 	KKASSERT(source);
753 
754 	if (source->type != OBJT_VNODE) {
755 		if (source->ref_count == 1 &&
756 		    source->handle == NULL &&
757 		    (source->type == OBJT_DEFAULT ||
758 		     source->type == OBJT_SWAP)) {
759 			goto done;
760 		}
761 	}
762 	ba = kmalloc(sizeof(*ba), M_MAP_BACKING, M_INTWAIT); /* copied later */
763 	vm_object_hold_shared(source);
764 
765 	/*
766 	 * Once it becomes part of a backing_ba chain it can wind up anywhere,
767 	 * drop the ONEMAPPING flag now.
768 	 */
769 	vm_object_clear_flag(source, OBJ_ONEMAPPING);
770 
771 	/*
772 	 * Allocate a new object with the given length.  The new object
773 	 * is returned referenced but we may have to add another one.
774 	 * If we are adding a second reference we must clear OBJ_ONEMAPPING.
775 	 * (typically because the caller is about to clone a vm_map_entry).
776 	 *
777 	 * The source object currently has an extra reference to prevent
778 	 * collapses into it while we mess with its shadow list, which
779 	 * we will remove later in this routine.
780 	 *
781 	 * The target object may require a second reference if asked for one
782 	 * by the caller.
783 	 */
784 	result = vm_object_allocate_hold(OBJT_DEFAULT, length);
785 	if (result == NULL)
786 		panic("vm_object_shadow: no object for shadowing");
787 
788 	/*
789 	 * The new object shadows the source object.
790 	 *
791 	 * Try to optimize the result object's page color when shadowing
792 	 * in order to maintain page coloring consistency in the combined
793 	 * shadowed object.
794 	 *
795 	 * The source object is moved to ba, retaining its existing ref-count.
796 	 * No additional ref is needed.
797 	 *
798 	 * SHADOWING IS NOT APPLICABLE TO OBJT_VNODE OBJECTS
799 	 */
800 	vm_map_backing_detach(entry, &entry->ba);
801 	*ba = entry->ba;		/* previous ba */
802 	entry->ba.object = result;	/* new ba (at head of entry) */
803 	entry->ba.backing_ba = ba;
804 	entry->ba.backing_count = ba->backing_count + 1;
805 	entry->ba.offset = 0;
806 
807 	/* cpu localization twist */
808 	result->pg_color = vm_quickcolor();
809 
810 	vm_map_backing_attach(entry, &entry->ba);
811 	vm_map_backing_attach(entry, ba);
812 
813 	/*
814 	 * Adjust the return storage.  Drop the ref on source before
815 	 * returning.
816 	 */
817 	vm_object_drop(result);
818 	vm_object_drop(source);
819 done:
820 	entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
821 }
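
/*
 * Before/after sketch of the backing chain manipulation performed by
 * vm_map_entry_shadow() above (layout only):
 *
 *	before:	entry->ba { object = source, backing_ba = OLD }
 *
 *	after:	entry->ba { object = new OBJT_DEFAULT, offset = 0,
 *			    backing_ba = ba }
 *		    ba    { copy of the old entry->ba, object = source,
 *			    backing_ba = OLD }
 *
 * Subsequent COW faults populate the new anonymous object at the head of
 * the chain; the chain behind it is effectively frozen as described in
 * the comment above the function.
 */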
822 
823 /*
824  * Allocate an object for a vm_map_entry.
825  *
826  * Object allocation for anonymous mappings is deferred as long as possible.
827  * This function is called when we can defer no longer, generally when a map
828  * entry might be split or forked or takes a page fault.
829  *
830  * If the map segment is governed by a virtual page table then it is
831  * possible to address offsets beyond the mapped area.  Just allocate
832  * a maximally sized object for this case.
833  *
834  * The vm_map must be exclusively locked.
835  * No other requirements.
836  */
837 void
838 vm_map_entry_allocate_object(vm_map_entry_t entry)
839 {
840 	vm_object_t obj;
841 
842 	/*
843 	 * ba.offset is NOT cumulatively added in the backing_ba scan like
844 	 * it was in the old object chain, so we can assign whatever offset
845 	 * we like to the new object.
846 	 *
847 	 * For now assign a value of 0 to make debugging object sizes
848 	 * easier.
849 	 */
850 	entry->ba.offset = 0;
851 
852 	obj = vm_object_allocate(OBJT_DEFAULT,
853 				 atop(entry->ba.end - entry->ba.start) +
854 				 entry->ba.offset);
855 	entry->ba.object = obj;
856 	vm_map_backing_attach(entry, &entry->ba);
857 }
858 
859 /*
860  * Set an initial negative count so the first attempt to reserve
861  * space preloads a bunch of vm_map_entry's for this cpu.  Also
862  * pre-allocate 2 vm_map_entries which will be needed by zalloc() to
863  * map a new page for vm_map_entry structures.  SMP systems are
864  * particularly sensitive.
865  *
866  * This routine is called in early boot so we cannot just call
867  * vm_map_entry_reserve().
868  *
869  * Called from the low level boot code only (for each cpu)
870  *
871  * WARNING! Take care not to have too-big a static/BSS structure here
872  *	    as MAXCPU can be 256+, otherwise the loader's 64MB heap
873  *	    can get blown out by the kernel plus the initrd image.
874  */
875 void
876 vm_map_entry_reserve_cpu_init(globaldata_t gd)
877 {
878 	vm_map_entry_t entry;
879 	int count;
880 	int i;
881 
882 	atomic_add_int(&gd->gd_vme_avail, -MAP_RESERVE_COUNT * 2);
883 	if (gd->gd_cpuid == 0) {
884 		entry = &cpu_map_entry_init_bsp[0];
885 		count = MAPENTRYBSP_CACHE;
886 	} else {
887 		entry = &cpu_map_entry_init_ap[gd->gd_cpuid][0];
888 		count = MAPENTRYAP_CACHE;
889 	}
890 	for (i = 0; i < count; ++i, ++entry) {
891 		MAPENT_FREELIST(entry) = gd->gd_vme_base;
892 		gd->gd_vme_base = entry;
893 	}
894 }
895 
896 /*
897  * Reserves vm_map_entry structures so code later-on can manipulate
898  * map_entry structures within a locked map without blocking trying
899  * to allocate a new vm_map_entry.
900  *
901  * No requirements.
902  *
903  * WARNING!  We must not decrement gd_vme_avail until after we have
904  *	     ensured that sufficient entries exist, otherwise we can
905  *	     get into an endless call recursion in the zalloc code
906  *	     itself.
907  */
908 int
909 vm_map_entry_reserve(int count)
910 {
911 	struct globaldata *gd = mycpu;
912 	vm_map_entry_t entry;
913 
914 	/*
915 	 * Make sure we have enough structures in gd_vme_base to handle
916 	 * the reservation request.
917 	 *
918 	 * Use a critical section to protect against VM faults.  It might
919 	 * not be needed, but we have to be careful here.
920 	 */
921 	if (gd->gd_vme_avail < count) {
922 		crit_enter();
923 		while (gd->gd_vme_avail < count) {
924 			entry = zalloc(mapentzone);
925 			MAPENT_FREELIST(entry) = gd->gd_vme_base;
926 			gd->gd_vme_base = entry;
927 			atomic_add_int(&gd->gd_vme_avail, 1);
928 		}
929 		crit_exit();
930 	}
931 	atomic_add_int(&gd->gd_vme_avail, -count);
932 
933 	return(count);
934 }
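
/*
 * Typical reserve/release usage, as seen elsewhere in this file
 * (e.g. vm_map_find() and vmspace_terminate()):
 *
 *	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
 *	vm_map_lock(map);
 *	... clipping/insertion/deletion consuming entries from the
 *	    reservation via vm_map_entry_create(&count) ...
 *	vm_map_unlock(map);
 *	vm_map_entry_release(count);
 */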
935 
936 /*
937  * Releases previously reserved vm_map_entry structures that were not
938  * used.  If we have too much junk in our per-cpu cache clean some of
939  * it out.
940  *
941  * No requirements.
942  */
943 void
944 vm_map_entry_release(int count)
945 {
946 	struct globaldata *gd = mycpu;
947 	vm_map_entry_t entry;
948 	vm_map_entry_t efree;
949 
950 	count = atomic_fetchadd_int(&gd->gd_vme_avail, count) + count;
951 	if (gd->gd_vme_avail > MAP_RESERVE_SLOP) {
952 		efree = NULL;
953 		crit_enter();
954 		while (gd->gd_vme_avail > MAP_RESERVE_HYST) {
955 			entry = gd->gd_vme_base;
956 			KKASSERT(entry != NULL);
957 			gd->gd_vme_base = MAPENT_FREELIST(entry);
958 			atomic_add_int(&gd->gd_vme_avail, -1);
959 			MAPENT_FREELIST(entry) = efree;
960 			efree = entry;
961 		}
962 		crit_exit();
963 		while ((entry = efree) != NULL) {
964 			efree = MAPENT_FREELIST(efree);
965 			zfree(mapentzone, entry);
966 		}
967 	}
968 }
969 
970 /*
971  * Reserve map entry structures for use in kernel_map itself.  These
972  * entries have *ALREADY* been reserved on a per-cpu basis when the map
973  * was inited.  This function is used by zalloc() to avoid a recursion
974  * when zalloc() itself needs to allocate additional kernel memory.
975  *
976  * This function works like the normal reserve but does not load the
977  * vm_map_entry cache (because that would result in an infinite
978  * recursion).  Note that gd_vme_avail may go negative.  This is expected.
979  *
980  * Any caller of this function must be sure to renormalize after
981  * potentially eating entries to ensure that the reserve supply
982  * remains intact.
983  *
984  * No requirements.
985  */
986 int
987 vm_map_entry_kreserve(int count)
988 {
989 	struct globaldata *gd = mycpu;
990 
991 	atomic_add_int(&gd->gd_vme_avail, -count);
992 	KASSERT(gd->gd_vme_base != NULL,
993 		("no reserved entries left, gd_vme_avail = %d",
994 		gd->gd_vme_avail));
995 	return(count);
996 }
997 
998 /*
999  * Release previously reserved map entries for kernel_map.  We do not
1000  * attempt to clean up like the normal release function as this would
1001  * cause an unnecessary (but probably not fatal) deep procedure call.
1002  *
1003  * No requirements.
1004  */
1005 void
1006 vm_map_entry_krelease(int count)
1007 {
1008 	struct globaldata *gd = mycpu;
1009 
1010 	atomic_add_int(&gd->gd_vme_avail, count);
1011 }
1012 
1013 /*
1014  * Allocates a VM map entry for insertion.  No entry fields are filled in.
1015  *
1016  * The entries should have previously been reserved.  The reservation count
1017  * is tracked in (*countp).
1018  *
1019  * No requirements.
1020  */
1021 static vm_map_entry_t
1022 vm_map_entry_create(int *countp)
1023 {
1024 	struct globaldata *gd = mycpu;
1025 	vm_map_entry_t entry;
1026 
1027 	KKASSERT(*countp > 0);
1028 	--*countp;
1029 	crit_enter();
1030 	entry = gd->gd_vme_base;
1031 	KASSERT(entry != NULL, ("gd_vme_base NULL! count %d", *countp));
1032 	gd->gd_vme_base = MAPENT_FREELIST(entry);
1033 	crit_exit();
1034 
1035 	return(entry);
1036 }
1037 
1038 /*
1039  * Attach and detach backing store elements
1040  */
1041 static void
1042 vm_map_backing_attach(vm_map_entry_t entry, vm_map_backing_t ba)
1043 {
1044 	vm_object_t obj;
1045 
1046 	switch(entry->maptype) {
1047 	case VM_MAPTYPE_NORMAL:
1048 		obj = ba->object;
1049 		lockmgr(&obj->backing_lk, LK_EXCLUSIVE);
1050 		TAILQ_INSERT_TAIL(&obj->backing_list, ba, entry);
1051 		lockmgr(&obj->backing_lk, LK_RELEASE);
1052 		break;
1053 	case VM_MAPTYPE_UKSMAP:
1054 		ba->uksmap(ba, UKSMAPOP_ADD, entry->aux.dev, NULL);
1055 		break;
1056 	}
1057 }
1058 
1059 static void
1060 vm_map_backing_detach(vm_map_entry_t entry, vm_map_backing_t ba)
1061 {
1062 	vm_object_t obj;
1063 
1064 	switch(entry->maptype) {
1065 	case VM_MAPTYPE_NORMAL:
1066 		obj = ba->object;
1067 		lockmgr(&obj->backing_lk, LK_EXCLUSIVE);
1068 		TAILQ_REMOVE(&obj->backing_list, ba, entry);
1069 		lockmgr(&obj->backing_lk, LK_RELEASE);
1070 		break;
1071 	case VM_MAPTYPE_UKSMAP:
1072 		ba->uksmap(ba, UKSMAPOP_REM, entry->aux.dev, NULL);
1073 		break;
1074 	}
1075 }
1076 
1077 /*
1078  * Dispose of the dynamically allocated backing_ba chain associated
1079  * with a vm_map_entry.
1080  *
1081  * We decrement the (possibly shared) element and kfree() on the
1082  * 1->0 transition.  We only iterate to the next backing_ba when
1083  * the previous one went through a 1->0 transition.
1084  *
1085  * These can only be normal vm_object based backings.
1086  */
1087 static void
1088 vm_map_entry_dispose_ba(vm_map_entry_t entry, vm_map_backing_t ba)
1089 {
1090 	vm_map_backing_t next;
1091 
1092 	while (ba) {
1093 		if (ba->map_object) {
1094 			vm_map_backing_detach(entry, ba);
1095 			vm_object_deallocate(ba->object);
1096 		}
1097 		next = ba->backing_ba;
1098 		kfree(ba, M_MAP_BACKING);
1099 		ba = next;
1100 	}
1101 }
1102 
1103 /*
1104  * Dispose of a vm_map_entry that is no longer being referenced.
1105  *
1106  * No requirements.
1107  */
1108 static void
1109 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry, int *countp)
1110 {
1111 	struct globaldata *gd = mycpu;
1112 
1113 	/*
1114 	 * Dispose of the base object and the backing link.
1115 	 */
1116 	switch(entry->maptype) {
1117 	case VM_MAPTYPE_NORMAL:
1118 		if (entry->ba.map_object) {
1119 			vm_map_backing_detach(entry, &entry->ba);
1120 			vm_object_deallocate(entry->ba.object);
1121 		}
1122 		break;
1123 	case VM_MAPTYPE_SUBMAP:
1124 		break;
1125 	case VM_MAPTYPE_UKSMAP:
1126 		vm_map_backing_detach(entry, &entry->ba);
1127 		break;
1128 	default:
1129 		break;
1130 	}
1131 	vm_map_entry_dispose_ba(entry, entry->ba.backing_ba);
1132 
1133 	/*
1134 	 * Cleanup for safety.
1135 	 */
1136 	entry->ba.backing_ba = NULL;
1137 	entry->ba.object = NULL;
1138 	entry->ba.offset = 0;
1139 
1140 	++*countp;
1141 	crit_enter();
1142 	MAPENT_FREELIST(entry) = gd->gd_vme_base;
1143 	gd->gd_vme_base = entry;
1144 	crit_exit();
1145 }
1146 
1147 
1148 /*
1149  * Insert/remove entries from maps.
1150  *
1151  * The related map must be exclusively locked.
1152  * The caller must hold map->token
1153  * No other requirements.
1154  */
1155 static __inline void
1156 vm_map_entry_link(vm_map_t map, vm_map_entry_t entry)
1157 {
1158 	ASSERT_VM_MAP_LOCKED(map);
1159 
1160 	map->nentries++;
1161 	if (vm_map_rb_tree_RB_INSERT(&map->rb_root, entry))
1162 		panic("vm_map_entry_link: dup addr map %p ent %p", map, entry);
1163 }
1164 
1165 static __inline void
1166 vm_map_entry_unlink(vm_map_t map,
1167 		    vm_map_entry_t entry)
1168 {
1169 	ASSERT_VM_MAP_LOCKED(map);
1170 
1171 	if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1172 		panic("vm_map_entry_unlink: attempt to mess with "
1173 		      "locked entry! %p", entry);
1174 	}
1175 	vm_map_rb_tree_RB_REMOVE(&map->rb_root, entry);
1176 	map->nentries--;
1177 }
1178 
1179 /*
1180  * Finds the map entry containing (or immediately preceding) the specified
1181  * address in the given map.  The entry is returned in (*entry).
1182  *
1183  * The boolean result indicates whether the address is actually contained
1184  * in the map.
1185  *
1186  * The related map must be locked.
1187  * No other requirements.
1188  */
1189 boolean_t
1190 vm_map_lookup_entry(vm_map_t map, vm_offset_t address, vm_map_entry_t *entry)
1191 {
1192 	vm_map_entry_t tmp;
1193 	vm_map_entry_t last;
1194 
1195 	ASSERT_VM_MAP_LOCKED(map);
1196 
1197 	/*
1198 	 * Locate the record from the top of the tree.  'last' tracks the
1199 	 * closest prior record and is returned if no match is found, which
1200 	 * in binary tree terms means tracking the most recent right-branch
1201 	 * taken.  If there is no prior record, *entry is set to NULL.
1202 	 */
1203 	last = NULL;
1204 	tmp = RB_ROOT(&map->rb_root);
1205 
1206 	while (tmp) {
1207 		if (address >= tmp->ba.start) {
1208 			if (address < tmp->ba.end) {
1209 				*entry = tmp;
1210 				return(TRUE);
1211 			}
1212 			last = tmp;
1213 			tmp = RB_RIGHT(tmp, rb_entry);
1214 		} else {
1215 			tmp = RB_LEFT(tmp, rb_entry);
1216 		}
1217 	}
1218 	*entry = last;
1219 	return (FALSE);
1220 }
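
/*
 * Callers use the boolean result to distinguish "address is mapped" from
 * "insert after *entry", e.g. as in vm_map_insert() below:
 *
 *	if (vm_map_lookup_entry(map, start, &temp_entry))
 *		return (KERN_NO_SPACE);		(start is already mapped)
 *	prev_entry = temp_entry;		(may be NULL if start lies
 *						 before every entry)
 */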
1221 
1222 /*
1223  * Inserts the given whole VM object into the target map at the specified
1224  * address range.  The object's size should match that of the address range.
1225  *
1226  * The map must be exclusively locked.
1227  * The object must be held.
1228  * The caller must have reserved sufficient vm_map_entry structures.
1229  *
1230  * If object is non-NULL, ref count must be bumped by caller prior to
1231  * making call to account for the new entry.  XXX API is a bit messy.
1232  */
1233 int
1234 vm_map_insert(vm_map_t map, int *countp,
1235 	      void *map_object, void *map_aux,
1236 	      vm_ooffset_t offset, void *aux_info,
1237 	      vm_offset_t start, vm_offset_t end,
1238 	      vm_maptype_t maptype, vm_subsys_t id,
1239 	      vm_prot_t prot, vm_prot_t max, int cow)
1240 {
1241 	vm_map_entry_t new_entry;
1242 	vm_map_entry_t prev_entry;
1243 	vm_map_entry_t next;
1244 	vm_map_entry_t temp_entry;
1245 	vm_eflags_t protoeflags;
1246 	vm_object_t object;
1247 	int must_drop = 0;
1248 
1249 	if (maptype == VM_MAPTYPE_UKSMAP)
1250 		object = NULL;
1251 	else
1252 		object = map_object;
1253 
1254 	ASSERT_VM_MAP_LOCKED(map);
1255 	if (object)
1256 		ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1257 
1258 	/*
1259 	 * Check that the start and end points are not bogus.
1260 	 */
1261 	if ((start < vm_map_min(map)) || (end > vm_map_max(map)) ||
1262 	    (start >= end)) {
1263 		return (KERN_INVALID_ADDRESS);
1264 	}
1265 
1266 	/*
1267 	 * Find the entry prior to the proposed starting address; if it's part
1268 	 * of an existing entry, this range is bogus.
1269 	 */
1270 	if (vm_map_lookup_entry(map, start, &temp_entry))
1271 		return (KERN_NO_SPACE);
1272 	prev_entry = temp_entry;
1273 
1274 	/*
1275 	 * Assert that the next entry doesn't overlap the end point.
1276 	 */
1277 	if (prev_entry)
1278 		next = vm_map_rb_tree_RB_NEXT(prev_entry);
1279 	else
1280 		next = RB_MIN(vm_map_rb_tree, &map->rb_root);
1281 	if (next && next->ba.start < end)
1282 		return (KERN_NO_SPACE);
1283 
1284 	protoeflags = 0;
1285 
1286 	if (cow & MAP_COPY_ON_WRITE)
1287 		protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY;
1288 
1289 	if (cow & MAP_NOFAULT) {
1290 		protoeflags |= MAP_ENTRY_NOFAULT;
1291 
1292 		KASSERT(object == NULL,
1293 			("vm_map_insert: paradoxical MAP_NOFAULT request"));
1294 	}
1295 	if (cow & MAP_DISABLE_SYNCER)
1296 		protoeflags |= MAP_ENTRY_NOSYNC;
1297 	if (cow & MAP_DISABLE_COREDUMP)
1298 		protoeflags |= MAP_ENTRY_NOCOREDUMP;
1299 	if (cow & MAP_IS_STACK)
1300 		protoeflags |= MAP_ENTRY_STACK;
1301 	if (cow & MAP_IS_KSTACK)
1302 		protoeflags |= MAP_ENTRY_KSTACK;
1303 
1304 	lwkt_gettoken(&map->token);
1305 
1306 	if (object) {
1307 		;
1308 	} else if (prev_entry &&
1309 		 (prev_entry->eflags == protoeflags) &&
1310 		 (prev_entry->ba.end == start) &&
1311 		 (prev_entry->wired_count == 0) &&
1312 		 (prev_entry->id == id) &&
1313 		 prev_entry->maptype == maptype &&
1314 		 maptype == VM_MAPTYPE_NORMAL &&
1315 		 prev_entry->ba.backing_ba == NULL &&	/* not backed */
1316 		 ((prev_entry->ba.object == NULL) ||
1317 		  vm_object_coalesce(prev_entry->ba.object,
1318 				     OFF_TO_IDX(prev_entry->ba.offset),
1319 				     (vm_size_t)(prev_entry->ba.end - prev_entry->ba.start),
1320 				     (vm_size_t)(end - prev_entry->ba.end)))) {
1321 		/*
1322 		 * We were able to extend the object.  Determine if we
1323 		 * can extend the previous map entry to include the
1324 		 * new range as well.
1325 		 */
1326 		if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) &&
1327 		    (prev_entry->protection == prot) &&
1328 		    (prev_entry->max_protection == max)) {
1329 			map->size += (end - prev_entry->ba.end);
1330 			vm_map_backing_adjust_end(prev_entry, end);
1331 			vm_map_simplify_entry(map, prev_entry, countp);
1332 			lwkt_reltoken(&map->token);
1333 			return (KERN_SUCCESS);
1334 		}
1335 
1336 		/*
1337 		 * If we can extend the object but cannot extend the
1338 		 * map entry, we have to create a new map entry.  We
1339 		 * must bump the ref count on the extended object to
1340 		 * account for it.  object may be NULL.
1341 		 */
1342 		object = prev_entry->ba.object;
1343 		offset = prev_entry->ba.offset +
1344 			(prev_entry->ba.end - prev_entry->ba.start);
1345 		if (object) {
1346 			vm_object_hold(object);
1347 			vm_object_lock_swap(); /* map->token order */
1348 			vm_object_reference_locked(object);
1349 			map_object = object;
1350 			must_drop = 1;
1351 		}
1352 	}
1353 
1354 	/*
1355 	 * NOTE: if conditionals fail, object can be NULL here.  This occurs
1356 	 * in things like the buffer map where we manage kva but do not manage
1357 	 * backing objects.
1358 	 */
1359 
1360 	/*
1361 	 * Create a new entry
1362 	 */
1363 	new_entry = vm_map_entry_create(countp);
1364 	new_entry->ba.pmap = map->pmap;
1365 	new_entry->ba.start = start;
1366 	new_entry->ba.end = end;
1367 	new_entry->id = id;
1368 
1369 	new_entry->maptype = maptype;
1370 	new_entry->eflags = protoeflags;
1371 	new_entry->aux.master_pde = 0;		/* in case size is different */
1372 	new_entry->aux.map_aux = map_aux;
1373 	new_entry->ba.map_object = map_object;
1374 	new_entry->ba.backing_ba = NULL;
1375 	new_entry->ba.backing_count = 0;
1376 	new_entry->ba.offset = offset;
1377 	new_entry->ba.aux_info = aux_info;
1378 	new_entry->ba.flags = 0;
1379 	new_entry->ba.pmap = map->pmap;
1380 
1381 	new_entry->inheritance = VM_INHERIT_DEFAULT;
1382 	new_entry->protection = prot;
1383 	new_entry->max_protection = max;
1384 	new_entry->wired_count = 0;
1385 
1386 	/*
1387 	 * Insert the new entry into the list
1388 	 */
1389 	vm_map_backing_replicated(map, new_entry, MAP_BACK_BASEOBJREFD);
1390 	vm_map_entry_link(map, new_entry);
1391 	map->size += new_entry->ba.end - new_entry->ba.start;
1392 
1393 	/*
1394 	 * Don't worry about updating freehint[] when inserting, allow
1395 	 * addresses to be lower than the actual first free spot.
1396 	 */
1397 #if 0
1398 	/*
1399 	 * Temporarily removed to avoid MAP_STACK panic, due to
1400 	 * MAP_STACK being a huge hack.  Will be added back in
1401 	 * when MAP_STACK (and the user stack mapping) is fixed.
1402 	 */
1403 	/*
1404 	 * It may be possible to simplify the entry
1405 	 */
1406 	vm_map_simplify_entry(map, new_entry, countp);
1407 #endif
1408 
1409 	/*
1410 	 * Try to pre-populate the page table.  Mappings governed by virtual
1411 	 * page tables cannot be prepopulated without a lot of work, so
1412 	 * don't try.
1413 	 */
1414 	if ((cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) &&
1415 	    maptype != VM_MAPTYPE_UKSMAP) {
1416 		int dorelock = 0;
1417 		if (vm_map_relock_enable && (cow & MAP_PREFAULT_RELOCK)) {
1418 			dorelock = 1;
1419 			vm_object_lock_swap();
1420 			vm_object_drop(object);
1421 		}
1422 		pmap_object_init_pt(map->pmap, new_entry,
1423 				    new_entry->ba.start,
1424 				    new_entry->ba.end - new_entry->ba.start,
1425 				    cow & MAP_PREFAULT_PARTIAL);
1426 		if (dorelock) {
1427 			vm_object_hold(object);
1428 			vm_object_lock_swap();
1429 		}
1430 	}
1431 	lwkt_reltoken(&map->token);
1432 	if (must_drop)
1433 		vm_object_drop(object);
1434 
1435 	return (KERN_SUCCESS);
1436 }
1437 
1438 /*
1439  * Find sufficient space for `length' bytes in the given map, starting at
1440  * `start'.  Returns 0 on success, 1 on no space.
1441  *
1442  * This function will return an arbitrarily aligned pointer.  If no
1443  * particular alignment is required you should pass align as 1.  Note that
1444  * the map may return PAGE_SIZE aligned pointers if all the lengths used in
1445  * the map are a multiple of PAGE_SIZE, even if you pass a smaller align
1446  * argument.
1447  *
1448  * 'align' should be a power of 2 but is not required to be.
1449  *
1450  * The map must be exclusively locked.
1451  * No other requirements.
1452  */
1453 int
1454 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length,
1455 		 vm_size_t align, int flags, vm_offset_t *addr)
1456 {
1457 	vm_map_entry_t entry;
1458 	vm_map_entry_t tmp;
1459 	vm_offset_t hole_start;
1460 	vm_offset_t end;
1461 	vm_offset_t align_mask;
1462 
1463 	if (start < vm_map_min(map))
1464 		start = vm_map_min(map);
1465 	if (start > vm_map_max(map))
1466 		return (1);
1467 
1468 	/*
1469 	 * If the alignment is not a power of 2 we will have to use
1470 	 * a mod/division, set align_mask to a special value.
1471 	 */
1472 	if ((align | (align - 1)) + 1 != (align << 1))
1473 		align_mask = (vm_offset_t)-1;
1474 	else
1475 		align_mask = align - 1;
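	/*
	 * (The expression above is a power-of-2 test: for align = 8,
	 *  (8 | 7) + 1 == 16 == 8 << 1, so the cheap mask can be used;
	 *  for align = 12, (12 | 11) + 1 == 16 != 24, so the slower
	 *  roundup() path is taken instead.)
	 */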
1476 
1477 	/*
1478 	 * Use freehint to adjust the start point, hopefully reducing
1479 	 * the iteration to O(1).
1480 	 */
1481 	hole_start = vm_map_freehint_find(map, length, align);
1482 	if (start < hole_start)
1483 		start = hole_start;
1484 	if (vm_map_lookup_entry(map, start, &tmp))
1485 		start = tmp->ba.end;
1486 	entry = tmp;	/* may be NULL */
1487 
1488 	/*
1489 	 * Look through the rest of the map, trying to fit a new region in the
1490 	 * gap between existing regions, or after the very last region.
1491 	 */
1492 	for (;;) {
1493 		/*
1494 		 * Adjust the proposed start by the requested alignment,
1495 		 * be sure that we didn't wrap the address.
1496 		 */
1497 		if (align_mask == (vm_offset_t)-1)
1498 			end = roundup(start, align);
1499 		else
1500 			end = (start + align_mask) & ~align_mask;
1501 		if (end < start)
1502 			return (1);
1503 		start = end;
1504 
1505 		/*
1506 		 * Find the end of the proposed new region.  Be sure we didn't
1507 		 * go beyond the end of the map, or wrap around the address.
1508 		 * Then check to see if this is the last entry or if the
1509 		 * proposed end fits in the gap between this and the next
1510 		 * entry.
1511 		 */
1512 		end = start + length;
1513 		if (end > vm_map_max(map) || end < start)
1514 			return (1);
1515 
1516 		/*
1517 		 * Locate the next entry, we can stop if this is the
1518 		 * last entry (we know we are in-bounds so that would
1519 		 * be a success).
1520 		 */
1521 		if (entry)
1522 			entry = vm_map_rb_tree_RB_NEXT(entry);
1523 		else
1524 			entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
1525 		if (entry == NULL)
1526 			break;
1527 
1528 		/*
1529 		 * Determine if the proposed area would overlap the
1530 		 * next entry.
1531 		 *
1532 		 * When matching against a STACK entry, only allow the
1533 		 * memory map to intrude on the ungrown portion of the
1534 		 * STACK entry when MAP_TRYFIXED is set.
1535 		 */
1536 		if (entry->ba.start >= end) {
1537 			if ((entry->eflags & MAP_ENTRY_STACK) == 0)
1538 				break;
1539 			if (flags & MAP_TRYFIXED)
1540 				break;
1541 			if (entry->ba.start - entry->aux.avail_ssize >= end)
1542 				break;
1543 		}
1544 		start = entry->ba.end;
1545 	}
1546 
1547 	/*
1548 	 * Update the freehint
1549 	 */
1550 	vm_map_freehint_update(map, start, length, align);
1551 
1552 	/*
1553 	 * Grow the kernel_map if necessary.  pmap_growkernel() will panic
1554 	 * if it fails.  The kernel_map is locked and nothing can steal
1555 	 * our address space if pmap_growkernel() blocks.
1556 	 *
1557 	 * NOTE: This may be unconditionally called for kldload areas on
1558 	 *	 x86_64 because these do not bump kernel_vm_end (which would
1559 	 *	 fill 128G worth of page tables!).  Therefore we must not
1560 	 *	 retry.
1561 	 */
1562 	if (map == &kernel_map) {
1563 		vm_offset_t kstop;
1564 
1565 		kstop = round_page(start + length);
1566 		if (kstop > kernel_vm_end)
1567 			pmap_growkernel(start, kstop);
1568 	}
1569 	*addr = start;
1570 	return (0);
1571 }
1572 
1573 /*
1574  * vm_map_find finds an unallocated region in the target address map with
1575  * the given length and allocates it.  The search is defined to be first-fit
1576  * from the specified address; the region found is returned in the same
1577  * parameter.
1578  *
1579  * If object is non-NULL, ref count must be bumped by caller
1580  * prior to making call to account for the new entry.
1581  *
1582  * No requirements.  This function will lock the map temporarily.
1583  */
1584 int
1585 vm_map_find(vm_map_t map, void *map_object, void *map_aux,
1586 	    vm_ooffset_t offset, vm_offset_t *addr,
1587 	    vm_size_t length, vm_size_t align, boolean_t fitit,
1588 	    vm_maptype_t maptype, vm_subsys_t id,
1589 	    vm_prot_t prot, vm_prot_t max, int cow)
1590 {
1591 	vm_offset_t start;
1592 	vm_object_t object;
1593 	void *aux_info;
1594 	int result;
1595 	int count;
1596 
1597 	/*
1598 	 * Certain UKSMAPs may need aux_info.
1599 	 *
1600 	 * (map_object is the callback function, aux_info is the process
1601 	 *  or thread, if necessary).
1602 	 */
1603 	aux_info = NULL;
1604 	if (maptype == VM_MAPTYPE_UKSMAP) {
1605 		KKASSERT(map_aux != NULL && map_object != NULL);
1606 
1607 		switch(minor(((struct cdev *)map_aux))) {
1608 		case 5:
1609 			/*
1610 			 * /dev/upmap
1611 			 */
1612 			aux_info = curproc;
1613 			break;
1614 		case 6:
1615 			/*
1616 			 * /dev/kpmap
1617 			 */
1618 			break;
1619 		case 7:
1620 			/*
1621 			 * /dev/lpmap
1622 			 */
1623 			aux_info = curthread->td_lwp;
1624 			break;
1625 		}
1626 		object = NULL;
1627 	} else {
1628 		object = map_object;
1629 	}
1630 
1631 	start = *addr;
1632 
1633 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1634 	vm_map_lock(map);
1635 	if (object)
1636 		vm_object_hold_shared(object);
1637 	if (fitit) {
1638 		if (vm_map_findspace(map, start, length, align, 0, addr)) {
1639 			if (object)
1640 				vm_object_drop(object);
1641 			vm_map_unlock(map);
1642 			vm_map_entry_release(count);
1643 			return (KERN_NO_SPACE);
1644 		}
1645 		start = *addr;
1646 	}
1647 	result = vm_map_insert(map, &count,
1648 			       map_object, map_aux,
1649 			       offset, aux_info,
1650 			       start, start + length,
1651 			       maptype, id, prot, max, cow);
1652 	if (object)
1653 		vm_object_drop(object);
1654 	vm_map_unlock(map);
1655 	vm_map_entry_release(count);
1656 
1657 	return (result);
1658 }
1659 
1660 /*
1661  * Simplify the given map entry by merging with either neighbor.  This
1662  * routine also has the ability to merge with both neighbors.
1663  *
1664  * This routine guarantees that the passed entry remains valid (though
1665  * possibly extended).  When merging, this routine may delete one or
1666  * both neighbors.  No action is taken on entries which have their
1667  * in-transition flag set.
1668  *
1669  * The map must be exclusively locked.
1670  */
1671 void
1672 vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry, int *countp)
1673 {
1674 	vm_map_entry_t next, prev;
1675 	vm_size_t prevsize, esize;
1676 
1677 	if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1678 		++mycpu->gd_cnt.v_intrans_coll;
1679 		return;
1680 	}
1681 
1682 	if (entry->maptype == VM_MAPTYPE_SUBMAP)
1683 		return;
1684 	if (entry->maptype == VM_MAPTYPE_UKSMAP)
1685 		return;
1686 
1687 	prev = vm_map_rb_tree_RB_PREV(entry);
1688 	if (prev) {
1689 		prevsize = prev->ba.end - prev->ba.start;
1690 		if ( (prev->ba.end == entry->ba.start) &&
1691 		     (prev->maptype == entry->maptype) &&
1692 		     (prev->ba.object == entry->ba.object) &&
1693 		     (prev->ba.backing_ba == entry->ba.backing_ba) &&
1694 		     (!prev->ba.object ||
1695 			(prev->ba.offset + prevsize == entry->ba.offset)) &&
1696 		     (prev->eflags == entry->eflags) &&
1697 		     (prev->protection == entry->protection) &&
1698 		     (prev->max_protection == entry->max_protection) &&
1699 		     (prev->inheritance == entry->inheritance) &&
1700 		     (prev->id == entry->id) &&
1701 		     (prev->wired_count == entry->wired_count)) {
1702 			/*
1703 			 * NOTE: order important.  Unlink before gumming up
1704 			 *	 the RBTREE w/adjust, adjust before disposal
1705 			 *	 of prior entry, to avoid pmap snafus.
1706 			 */
1707 			vm_map_entry_unlink(map, prev);
1708 			vm_map_backing_adjust_start(entry, prev->ba.start);
1709 			if (entry->ba.object == NULL)
1710 				entry->ba.offset = 0;
1711 			vm_map_entry_dispose(map, prev, countp);
1712 		}
1713 	}
1714 
1715 	next = vm_map_rb_tree_RB_NEXT(entry);
1716 	if (next) {
1717 		esize = entry->ba.end - entry->ba.start;
1718 		if ((entry->ba.end == next->ba.start) &&
1719 		    (next->maptype == entry->maptype) &&
1720 		    (next->ba.object == entry->ba.object) &&
1721 		    (next->ba.backing_ba == entry->ba.backing_ba) &&
1722 		    (!entry->ba.object ||
1723 			(entry->ba.offset + esize == next->ba.offset)) &&
1724 		    (next->eflags == entry->eflags) &&
1725 		    (next->protection == entry->protection) &&
1726 		    (next->max_protection == entry->max_protection) &&
1727 		    (next->inheritance == entry->inheritance) &&
1728 		    (next->id == entry->id) &&
1729 		    (next->wired_count == entry->wired_count)) {
1730 			/*
1731 			 * NOTE: order important.  Unlink before gumming up
1732 			 *	 the RBTREE w/adjust, adjust before disposal
1733 			 *	 of prior entry, to avoid pmap snafus.
1734 			 */
1735 			vm_map_entry_unlink(map, next);
1736 			vm_map_backing_adjust_end(entry, next->ba.end);
1737 			vm_map_entry_dispose(map, next, countp);
1738 	        }
1739 	}
1740 }
1741 
1742 /*
1743  * Asserts that the given entry begins at or after the specified address.
1744  * If necessary, it splits the entry into two.
1745  */
1746 #define vm_map_clip_start(map, entry, startaddr, countp)		\
1747 {									\
1748 	if (startaddr > entry->ba.start)				\
1749 		_vm_map_clip_start(map, entry, startaddr, countp);	\
1750 }
1751 
1752 /*
1753  * This routine is called only when it is known that the entry must be split.
1754  *
1755  * The map must be exclusively locked.
1756  */
1757 static void
1758 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start,
1759 		   int *countp)
1760 {
1761 	vm_map_entry_t new_entry;
1762 
1763 	/*
1764 	 * Split off the front portion -- note that we must insert the new
1765 	 * entry BEFORE this one, so that this entry has the specified
1766 	 * starting address.
1767 	 */
1768 
1769 	vm_map_simplify_entry(map, entry, countp);
1770 
1771 	/*
1772 	 * If there is no object backing this entry, we might as well create
1773 	 * one now.  If we defer it, an object can get created after the map
1774 	 * is clipped, and individual objects will be created for the split-up
1775 	 * map.  This is a bit of a hack, but is also about the best place to
1776 	 * put this improvement.
1777 	 */
1778 	if (entry->ba.object == NULL && !map->system_map &&
1779 	    VM_MAP_ENTRY_WITHIN_PARTITION(entry)) {
1780 		vm_map_entry_allocate_object(entry);
1781 	}
1782 
1783 	/*
1784 	 * NOTE: The replicated function will adjust start, end, and offset
1785 	 *	 for the remainder of the backing_ba linkages.  We must fixup
1786 	 *	 the embedded ba.
1787 	 */
1788 	new_entry = vm_map_entry_create(countp);
1789 	*new_entry = *entry;
1790 	new_entry->ba.end = start;
1791 
1792 	/*
1793 	 * Ordering is important: make sure the new entry is replicated
1794 	 * before we cut the existing entry.
1795 	 */
1796 	vm_map_backing_replicated(map, new_entry, MAP_BACK_CLIPPED);
1797 	vm_map_backing_adjust_start(entry, start);
1798 	vm_map_entry_link(map, new_entry);
1799 }
1800 
1801 /*
1802  * Asserts that the given entry ends at or before the specified address.
1803  * If necessary, it splits the entry into two.
1804  *
1805  * The map must be exclusively locked.
1806  */
1807 #define vm_map_clip_end(map, entry, endaddr, countp)		\
1808 {								\
1809 	if (endaddr < entry->ba.end)				\
1810 		_vm_map_clip_end(map, entry, endaddr, countp);	\
1811 }
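
/*
 * Illustrative sketch (hypothetical helper, not compiled): the usual
 * pattern is to clip the entry containing 'start' on both sides so a
 * subsequent attribute change is confined to [start, end).  The caller
 * is assumed to hold the exclusive map lock and a map entry
 * reservation (countp).
 */
#if 0
static void
example_clip_one(vm_map_t map, vm_offset_t start, vm_offset_t end,
		 int *countp)
{
	vm_map_entry_t entry;

	if (vm_map_lookup_entry(map, start, &entry)) {
		vm_map_clip_start(map, entry, start, countp);
		vm_map_clip_end(map, entry, end, countp);
		/* entry now covers at most [start, end) */
	}
}
#endif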
1812 
1813 /*
1814  * This routine is called only when it is known that the entry must be split.
1815  *
1816  * The map must be exclusively locked.
1817  */
1818 static void
1819 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end,
1820 		 int *countp)
1821 {
1822 	vm_map_entry_t new_entry;
1823 
1824 	/*
1825 	 * If there is no object backing this entry, we might as well create
1826 	 * one now.  If we defer it, an object can get created after the map
1827 	 * is clipped, and individual objects will be created for the split-up
1828 	 * map.  This is a bit of a hack, but is also about the best place to
1829 	 * put this improvement.
1830 	 */
1831 
1832 	if (entry->ba.object == NULL && !map->system_map &&
1833 	    VM_MAP_ENTRY_WITHIN_PARTITION(entry)) {
1834 		vm_map_entry_allocate_object(entry);
1835 	}
1836 
1837 	/*
1838 	 * Create a new entry and insert it AFTER the specified entry
1839 	 *
1840 	 * NOTE: The replicated function will adjust start, end, and offset
1841 	 *	 for the remainder of the backing_ba linkages.  We must fixup
1842 	 *	 the embedded ba.
1843 	 */
1844 	new_entry = vm_map_entry_create(countp);
1845 	*new_entry = *entry;
1846 	new_entry->ba.start = end;
1847 	new_entry->ba.offset += (new_entry->ba.start - entry->ba.start);
1848 
1849 	/*
1850 	 * Ordering is important: make sure the new entry is replicated
1851 	 * before we cut the existing entry.
1852 	 */
1853 	vm_map_backing_replicated(map, new_entry, MAP_BACK_CLIPPED);
1854 	vm_map_backing_adjust_end(entry, end);
1855 	vm_map_entry_link(map, new_entry);
1856 }
1857 
1858 /*
1859  * Asserts that the starting and ending region addresses fall within the
1860  * valid range for the map.
1861  */
1862 #define	VM_MAP_RANGE_CHECK(map, start, end)	\
1863 {						\
1864 	if (start < vm_map_min(map))		\
1865 		start = vm_map_min(map);	\
1866 	if (end > vm_map_max(map))		\
1867 		end = vm_map_max(map);		\
1868 	if (start > end)			\
1869 		start = end;			\
1870 }
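
/*
 * Illustrative sketch (not compiled): VM_MAP_RANGE_CHECK() modifies its
 * start/end arguments in place, clamping them to the map's valid range.
 * The helper name below is a placeholder.
 */
#if 0
static void
example_range_clamp(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
	VM_MAP_RANGE_CHECK(map, start, end);
	/*
	 * Here start >= vm_map_min(map), end <= vm_map_max(map), and
	 * start <= end, possibly with start == end for an empty range.
	 */
}
#endif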
1871 
1872 /*
1873  * Used to block when an in-transition collision occurs.  The map
1874  * is unlocked for the sleep and relocked before the return.
1875  */
1876 void
1877 vm_map_transition_wait(vm_map_t map, int relock)
1878 {
1879 	tsleep_interlock(map, 0);
1880 	vm_map_unlock(map);
1881 	tsleep(map, PINTERLOCKED, "vment", 0);
1882 	if (relock)
1883 		vm_map_lock(map);
1884 }
1885 
1886 /*
1887  * When we do blocking operations with the map lock held it is
1888  * possible that a clip might have occurred on our in-transition entry,
1889  * requiring an adjustment to the entry in our loop.  These macros
1890  * help the pageable and clip_range code deal with the case.  The
1891  * conditional costs virtually nothing if no clipping has occurred.
1892  */
1893 
1894 #define CLIP_CHECK_BACK(entry, save_start)			\
1895     do {							\
1896 	    while (entry->ba.start != save_start) {		\
1897 		    entry = vm_map_rb_tree_RB_PREV(entry);	\
1898 		    KASSERT(entry, ("bad entry clip")); 	\
1899 	    }							\
1900     } while(0)
1901 
1902 #define CLIP_CHECK_FWD(entry, save_end)				\
1903     do {							\
1904 	    while (entry->ba.end != save_end) {			\
1905 		    entry = vm_map_rb_tree_RB_NEXT(entry);	\
1906 		    KASSERT(entry, ("bad entry clip")); 	\
1907 	    }							\
1908     } while(0)
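
/*
 * Illustrative sketch (hypothetical helper, not compiled): the typical
 * pattern is to save a boundary of the entry being worked on, call
 * something that may temporarily unlock the map (allowing clips), and
 * then resync to the fragment that still matches the saved boundary.
 * CLIP_CHECK_FWD() is the forward-walking analogue keyed on ba.end.
 */
#if 0
static int
example_wire_one_entry(vm_map_t map, vm_map_entry_t entry)
{
	vm_offset_t save_start = entry->ba.start;
	int rv;

	rv = vm_fault_wire(map, entry, TRUE, 0);  /* may unlock/relock map */
	CLIP_CHECK_BACK(entry, save_start);	  /* resync after any clip */
	return (rv);
}
#endif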
1909 
1910 
1911 /*
1912  * Clip the specified range and return the base entry.  The
1913  * range may cover several entries starting at the returned base
1914  * and the first and last entry in the covering sequence will be
1915  * properly clipped to the requested start and end address.
1916  *
1917  * If no holes are allowed you should pass the MAP_CLIP_NO_HOLES
1918  * flag.
1919  *
1920  * The MAP_ENTRY_IN_TRANSITION flag will be set for the entries
1921  * covered by the requested range.
1922  *
1923  * The map must be exclusively locked on entry and will remain locked
1924  * on return. If no range exists or the range contains holes and you
1925  * specified that no holes were allowed, NULL will be returned.  This
1926  * routine may temporarily unlock the map in order to avoid a deadlock when
1927  * sleeping.
1928  */
1929 static
1930 vm_map_entry_t
1931 vm_map_clip_range(vm_map_t map, vm_offset_t start, vm_offset_t end,
1932 		  int *countp, int flags)
1933 {
1934 	vm_map_entry_t start_entry;
1935 	vm_map_entry_t entry;
1936 	vm_map_entry_t next;
1937 
1938 	/*
1939 	 * Locate the entry and effect initial clipping.  The in-transition
1940 	 * case does not occur very often so do not try to optimize it.
1941 	 */
1942 again:
1943 	if (vm_map_lookup_entry(map, start, &start_entry) == FALSE)
1944 		return (NULL);
1945 	entry = start_entry;
1946 	if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1947 		entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1948 		++mycpu->gd_cnt.v_intrans_coll;
1949 		++mycpu->gd_cnt.v_intrans_wait;
1950 		vm_map_transition_wait(map, 1);
1951 		/*
1952 		 * entry and/or start_entry may have been clipped while
1953 		 * we slept, or may have gone away entirely.  We have
1954 		 * to restart from the lookup.
1955 		 */
1956 		goto again;
1957 	}
1958 
1959 	/*
1960 	 * Since we hold an exclusive map lock we do not have to restart
1961 	 * after clipping, even though clipping may block in zalloc.
1962 	 */
1963 	vm_map_clip_start(map, entry, start, countp);
1964 	vm_map_clip_end(map, entry, end, countp);
1965 	entry->eflags |= MAP_ENTRY_IN_TRANSITION;
1966 
1967 	/*
1968 	 * Scan entries covered by the range.  When working on the next
1969 	 * entry a restart need only re-loop on the current entry which
1970 	 * we have already locked, since 'next' may have changed.  Also,
1971 	 * even though entry is safe, it may have been clipped so we
1972 	 * have to iterate forwards through the clip after sleeping.
1973 	 */
1974 	for (;;) {
1975 		next = vm_map_rb_tree_RB_NEXT(entry);
1976 		if (next == NULL || next->ba.start >= end)
1977 			break;
1978 		if (flags & MAP_CLIP_NO_HOLES) {
1979 			if (next->ba.start > entry->ba.end) {
1980 				vm_map_unclip_range(map, start_entry,
1981 					start, entry->ba.end, countp, flags);
1982 				return(NULL);
1983 			}
1984 		}
1985 
1986 		if (next->eflags & MAP_ENTRY_IN_TRANSITION) {
1987 			vm_offset_t save_end = entry->ba.end;
1988 			next->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1989 			++mycpu->gd_cnt.v_intrans_coll;
1990 			++mycpu->gd_cnt.v_intrans_wait;
1991 			vm_map_transition_wait(map, 1);
1992 
1993 			/*
1994 			 * Clips might have occurred while we blocked.
1995 			 */
1996 			CLIP_CHECK_FWD(entry, save_end);
1997 			CLIP_CHECK_BACK(start_entry, start);
1998 			continue;
1999 		}
2000 
2001 		/*
2002 		 * No restart necessary even though clip_end may block, we
2003 		 * are holding the map lock.
2004 		 */
2005 		vm_map_clip_end(map, next, end, countp);
2006 		next->eflags |= MAP_ENTRY_IN_TRANSITION;
2007 		entry = next;
2008 	}
2009 	if (flags & MAP_CLIP_NO_HOLES) {
2010 		if (entry->ba.end != end) {
2011 			vm_map_unclip_range(map, start_entry,
2012 				start, entry->ba.end, countp, flags);
2013 			return(NULL);
2014 		}
2015 	}
2016 	return(start_entry);
2017 }
2018 
2019 /*
2020  * Undo the effect of vm_map_clip_range().  You should pass the same
2021  * flags and the same range that you passed to vm_map_clip_range().
2022  * This code will clear the in-transition flag on the entries and
2023  * wake up anyone waiting.  This code will also simplify the sequence
2024  * and attempt to merge it with entries before and after the sequence.
2025  *
2026  * The map must be locked on entry and will remain locked on return.
2027  *
2028  * Note that you should also pass the start_entry returned by
2029  * vm_map_clip_range().  However, if you block between the two calls
2030  * with the map unlocked please be aware that the start_entry may
2031  * have been clipped and you may need to scan it backwards to find
2032  * the entry corresponding with the original start address.  You are
2033  * responsible for this, vm_map_unclip_range() expects the correct
2034  * start_entry to be passed to it and will KASSERT otherwise.
2035  */
2036 static
2037 void
2038 vm_map_unclip_range(vm_map_t map, vm_map_entry_t start_entry,
2039 		    vm_offset_t start, vm_offset_t end,
2040 		    int *countp, int flags)
2041 {
2042 	vm_map_entry_t entry;
2043 
2044 	entry = start_entry;
2045 
2046 	KASSERT(entry->ba.start == start, ("unclip_range: illegal base entry"));
2047 	while (entry && entry->ba.start < end) {
2048 		KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
2049 			("in-transition flag not set during unclip on: %p",
2050 			entry));
2051 		KASSERT(entry->ba.end <= end,
2052 			("unclip_range: tail wasn't clipped"));
2053 		entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
2054 		if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
2055 			entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
2056 			wakeup(map);
2057 		}
2058 		entry = vm_map_rb_tree_RB_NEXT(entry);
2059 	}
2060 
2061 	/*
2062 	 * Simplification does not block so there is no restart case.
2063 	 */
2064 	entry = start_entry;
2065 	while (entry && entry->ba.start < end) {
2066 		vm_map_simplify_entry(map, entry, countp);
2067 		entry = vm_map_rb_tree_RB_NEXT(entry);
2068 	}
2069 }
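
/*
 * Illustrative sketch (hypothetical helper, not compiled): the usual
 * bracket is clip the range, operate on each in-transition entry, then
 * unclip with the same range and flags.  The names and the empty loop
 * body below are placeholders.
 */
#if 0
static int
example_operate_on_range(vm_map_t map, vm_offset_t start, vm_offset_t end)
{
	vm_map_entry_t start_entry;
	vm_map_entry_t entry;
	int count;
	int rv = KERN_SUCCESS;

	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	vm_map_lock(map);
	VM_MAP_RANGE_CHECK(map, start, end);
	start_entry = vm_map_clip_range(map, start, end, &count,
					MAP_CLIP_NO_HOLES);
	if (start_entry == NULL) {
		rv = KERN_INVALID_ADDRESS;
	} else {
		for (entry = start_entry;
		     entry && entry->ba.start < end;
		     entry = vm_map_rb_tree_RB_NEXT(entry)) {
			/* operate on the in-transition entry here */
		}
		vm_map_unclip_range(map, start_entry, start, end,
				    &count, MAP_CLIP_NO_HOLES);
	}
	vm_map_unlock(map);
	vm_map_entry_release(count);
	return (rv);
}
#endif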
2070 
2071 /*
2072  * Mark the given range as handled by a subordinate map.
2073  *
2074  * This range must have been created with vm_map_find(), and no other
2075  * operations may have been performed on this range prior to calling
2076  * vm_map_submap().
2077  *
2078  * Submappings cannot be removed.
2079  *
2080  * No requirements.
2081  */
2082 int
2083 vm_map_submap(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap)
2084 {
2085 	vm_map_entry_t entry;
2086 	int result = KERN_INVALID_ARGUMENT;
2087 	int count;
2088 
2089 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2090 	vm_map_lock(map);
2091 
2092 	VM_MAP_RANGE_CHECK(map, start, end);
2093 
2094 	if (vm_map_lookup_entry(map, start, &entry)) {
2095 		vm_map_clip_start(map, entry, start, &count);
2096 	} else if (entry) {
2097 		entry = vm_map_rb_tree_RB_NEXT(entry);
2098 	} else {
2099 		entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2100 	}
2101 
2102 	vm_map_clip_end(map, entry, end, &count);
2103 
2104 	if ((entry->ba.start == start) && (entry->ba.end == end) &&
2105 	    ((entry->eflags & MAP_ENTRY_COW) == 0) &&
2106 	    (entry->ba.object == NULL)) {
2107 		entry->ba.sub_map = submap;
2108 		entry->maptype = VM_MAPTYPE_SUBMAP;
2109 		result = KERN_SUCCESS;
2110 	}
2111 	vm_map_unlock(map);
2112 	vm_map_entry_release(count);
2113 
2114 	return (result);
2115 }
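
/*
 * Illustrative sketch (hypothetical helper, not compiled): a range is
 * first reserved with vm_map_find() and only then marked as handled by
 * a subordinate map.  The helper name and panic message are
 * placeholders.
 */
#if 0
static void
example_install_submap(vm_map_t map, vm_map_t submap,
		       vm_offset_t start, vm_size_t size)
{
	if (vm_map_submap(map, start, start + size, submap) != KERN_SUCCESS)
		panic("example_install_submap: vm_map_submap failed");
}
#endif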
2116 
2117 /*
2118  * Sets the protection of the specified address region in the target map.
2119  * If "set_max" is specified, the maximum protection is to be set;
2120  * otherwise, only the current protection is affected.
2121  *
2122  * The protection is not applicable to submaps, but is applicable to normal
2123  * maps and maps governed by virtual page tables.  For example, when operating
2124  * on a virtual page table our protection basically controls how COW occurs
2125  * on the backing object, whereas the virtual page table abstraction itself
2126  * is an abstraction for userland.
2127  *
2128  * No requirements.
2129  */
2130 int
2131 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
2132 	       vm_prot_t new_prot, boolean_t set_max)
2133 {
2134 	vm_map_entry_t current;
2135 	vm_map_entry_t entry;
2136 	int count;
2137 
2138 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2139 	vm_map_lock(map);
2140 
2141 	VM_MAP_RANGE_CHECK(map, start, end);
2142 
2143 	if (vm_map_lookup_entry(map, start, &entry)) {
2144 		vm_map_clip_start(map, entry, start, &count);
2145 	} else if (entry) {
2146 		entry = vm_map_rb_tree_RB_NEXT(entry);
2147 	} else {
2148 		entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2149 	}
2150 
2151 	/*
2152 	 * Make a first pass to check for protection violations.
2153 	 */
2154 	current = entry;
2155 	while (current && current->ba.start < end) {
2156 		if (current->maptype == VM_MAPTYPE_SUBMAP) {
2157 			vm_map_unlock(map);
2158 			vm_map_entry_release(count);
2159 			return (KERN_INVALID_ARGUMENT);
2160 		}
2161 		if ((new_prot & current->max_protection) != new_prot) {
2162 			vm_map_unlock(map);
2163 			vm_map_entry_release(count);
2164 			return (KERN_PROTECTION_FAILURE);
2165 		}
2166 
2167 		/*
2168 		 * When making a SHARED+RW file mmap writable, update
2169 		 * v_lastwrite_ts.
2170 		 */
2171 		if (new_prot & PROT_WRITE &&
2172 		    (current->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
2173 		    current->maptype == VM_MAPTYPE_NORMAL &&
2174 		    current->ba.object &&
2175 		    current->ba.object->type == OBJT_VNODE) {
2176 			struct vnode *vp;
2177 
2178 			vp = current->ba.object->handle;
2179 			if (vp && vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT) == 0) {
2180 				vfs_timestamp(&vp->v_lastwrite_ts);
2181 				vsetflags(vp, VLASTWRITETS);
2182 				vn_unlock(vp);
2183 			}
2184 		}
2185 		current = vm_map_rb_tree_RB_NEXT(current);
2186 	}
2187 
2188 	/*
2189 	 * Go back and fix up protections. [Note that clipping is not
2190 	 * necessary the second time.]
2191 	 */
2192 	current = entry;
2193 
2194 	while (current && current->ba.start < end) {
2195 		vm_prot_t old_prot;
2196 
2197 		vm_map_clip_end(map, current, end, &count);
2198 
2199 		old_prot = current->protection;
2200 		if (set_max) {
2201 			current->max_protection = new_prot;
2202 			current->protection = new_prot & old_prot;
2203 		} else {
2204 			current->protection = new_prot;
2205 		}
2206 
2207 		/*
2208 		 * Update physical map if necessary. Worry about copy-on-write
2209 		 * here -- CHECK THIS XXX
2210 		 */
2211 		if (current->protection != old_prot) {
2212 #define MASK(entry)	(((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
2213 							VM_PROT_ALL)
2214 
2215 			pmap_protect(map->pmap, current->ba.start,
2216 			    current->ba.end,
2217 			    current->protection & MASK(current));
2218 #undef	MASK
2219 		}
2220 
2221 		vm_map_simplify_entry(map, current, &count);
2222 
2223 		current = vm_map_rb_tree_RB_NEXT(current);
2224 	}
2225 	vm_map_unlock(map);
2226 	vm_map_entry_release(count);
2227 	return (KERN_SUCCESS);
2228 }
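
/*
 * Illustrative sketch (hypothetical helper, not compiled): with
 * set_max == FALSE only the current protection changes (mprotect
 * semantics); with TRUE the maximum protection is lowered and the
 * current protection is intersected with it.
 */
#if 0
static int
example_make_readonly(vm_map_t map, vm_offset_t start, vm_size_t size)
{
	return (vm_map_protect(map, start, start + size,
			       VM_PROT_READ, FALSE));
}
#endif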
2229 
2230 /*
2231  * This routine traverses a process's map handling the madvise
2232  * system call.  Advisories are classified as either those affecting
2233  * the vm_map_entry structure, or those affecting the underlying
2234  * objects.
2235  *
2236  * The <value> argument is used for extended madvise calls.
2237  *
2238  * No requirements.
2239  */
2240 int
2241 vm_map_madvise(vm_map_t map, vm_offset_t start, vm_offset_t end,
2242 	       int behav, off_t value)
2243 {
2244 	vm_map_entry_t current, entry;
2245 	int modify_map = 0;
2246 	int error = 0;
2247 	int count;
2248 
2249 	/*
2250 	 * Some madvise calls directly modify the vm_map_entry, in which case
2251 	 * we need to use an exclusive lock on the map and we need to perform
2252 	 * various clipping operations.  Otherwise we only need a read-lock
2253 	 * on the map.
2254 	 */
2255 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2256 
2257 	switch(behav) {
2258 	case MADV_NORMAL:
2259 	case MADV_SEQUENTIAL:
2260 	case MADV_RANDOM:
2261 	case MADV_NOSYNC:
2262 	case MADV_AUTOSYNC:
2263 	case MADV_NOCORE:
2264 	case MADV_CORE:
2265 	case MADV_SETMAP:
2266 		modify_map = 1;
2267 		vm_map_lock(map);
2268 		break;
2269 	case MADV_INVAL:
2270 	case MADV_WILLNEED:
2271 	case MADV_DONTNEED:
2272 	case MADV_FREE:
2273 		vm_map_lock_read(map);
2274 		break;
2275 	default:
2276 		vm_map_entry_release(count);
2277 		return (EINVAL);
2278 	}
2279 
2280 	/*
2281 	 * Locate starting entry and clip if necessary.
2282 	 */
2283 
2284 	VM_MAP_RANGE_CHECK(map, start, end);
2285 
2286 	if (vm_map_lookup_entry(map, start, &entry)) {
2287 		if (modify_map)
2288 			vm_map_clip_start(map, entry, start, &count);
2289 	} else if (entry) {
2290 		entry = vm_map_rb_tree_RB_NEXT(entry);
2291 	} else {
2292 		entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2293 	}
2294 
2295 	if (modify_map) {
2296 		/*
2297 		 * madvise behaviors that are implemented in the vm_map_entry.
2298 		 *
2299 		 * We clip the vm_map_entry so that behavioral changes are
2300 		 * limited to the specified address range.
2301 		 */
2302 		for (current = entry;
2303 		     current && current->ba.start < end;
2304 		     current = vm_map_rb_tree_RB_NEXT(current)) {
2305 			/*
2306 			 * Ignore submaps
2307 			 */
2308 			if (current->maptype == VM_MAPTYPE_SUBMAP)
2309 				continue;
2310 
2311 			vm_map_clip_end(map, current, end, &count);
2312 
2313 			switch (behav) {
2314 			case MADV_NORMAL:
2315 				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
2316 				break;
2317 			case MADV_SEQUENTIAL:
2318 				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
2319 				break;
2320 			case MADV_RANDOM:
2321 				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
2322 				break;
2323 			case MADV_NOSYNC:
2324 				current->eflags |= MAP_ENTRY_NOSYNC;
2325 				break;
2326 			case MADV_AUTOSYNC:
2327 				current->eflags &= ~MAP_ENTRY_NOSYNC;
2328 				break;
2329 			case MADV_NOCORE:
2330 				current->eflags |= MAP_ENTRY_NOCOREDUMP;
2331 				break;
2332 			case MADV_CORE:
2333 				current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
2334 				break;
2335 			case MADV_SETMAP:
2336 				/*
2337 				 * Set the page directory page for a map
2338 				 * governed by a virtual page table.
2339 				 *
2340 				 * Software virtual page table support has
2341 				 * been removed, this MADV is no longer
2342 				 * supported.
2343 				 */
2344 				error = EINVAL;
2345 				break;
2346 			case MADV_INVAL:
2347 				/*
2348 				 * Invalidate the related pmap entries, used
2349 				 * to flush portions of the real kernel's
2350 				 * pmap when the caller has removed or
2351 				 * modified existing mappings in a virtual
2352 				 * page table.
2353 				 *
2354 				 * (exclusive locked map version does not
2355 				 * need the range interlock).
2356 				 */
2357 				pmap_remove(map->pmap,
2358 					    current->ba.start, current->ba.end);
2359 				break;
2360 			default:
2361 				error = EINVAL;
2362 				break;
2363 			}
2364 			vm_map_simplify_entry(map, current, &count);
2365 		}
2366 		vm_map_unlock(map);
2367 	} else {
2368 		vm_pindex_t pindex;
2369 		vm_pindex_t delta;
2370 
2371 		/*
2372 		 * madvise behaviors that are implemented in the underlying
2373 		 * vm_object.
2374 		 *
2375 		 * Since we don't clip the vm_map_entry, we have to clip
2376 		 * the vm_object pindex and count.
2377 		 *
2378 		 * NOTE!  These functions are only supported on normal maps.
2379 		 *
2380 		 * NOTE!  These functions only apply to the top-most object.
2381 		 *	  It is not applicable to backing objects.
2382 		 */
2383 		for (current = entry;
2384 		     current && current->ba.start < end;
2385 		     current = vm_map_rb_tree_RB_NEXT(current)) {
2386 			vm_offset_t useStart;
2387 
2388 			if (current->maptype != VM_MAPTYPE_NORMAL)
2389 				continue;
2390 
2391 			pindex = OFF_TO_IDX(current->ba.offset);
2392 			delta = atop(current->ba.end - current->ba.start);
2393 			useStart = current->ba.start;
2394 
2395 			if (current->ba.start < start) {
2396 				pindex += atop(start - current->ba.start);
2397 				delta -= atop(start - current->ba.start);
2398 				useStart = start;
2399 			}
2400 			if (current->ba.end > end)
2401 				delta -= atop(current->ba.end - end);
2402 
2403 			if ((vm_spindex_t)delta <= 0)
2404 				continue;
2405 
2406 			if (behav == MADV_INVAL) {
2407 				/*
2408 				 * Invalidate the related pmap entries, used
2409 				 * to flush portions of the real kernel's
2410 				 * pmap when the caller has removed or
2411 				 * modified existing mappings in a virtual
2412 				 * page table.
2413 				 *
2414 				 * (shared locked map version needs the
2415 				 * interlock, see vm_fault()).
2416 				 */
2417 				struct vm_map_ilock ilock;
2418 
2419 				KASSERT(useStart >= VM_MIN_USER_ADDRESS &&
2420 					    useStart + ptoa(delta) <=
2421 					    VM_MAX_USER_ADDRESS,
2422 					 ("Bad range %016jx-%016jx (%016jx)",
2423 					 useStart, useStart + ptoa(delta),
2424 					 delta));
2425 				vm_map_interlock(map, &ilock,
2426 						 useStart,
2427 						 useStart + ptoa(delta));
2428 				pmap_remove(map->pmap,
2429 					    useStart,
2430 					    useStart + ptoa(delta));
2431 				vm_map_deinterlock(map, &ilock);
2432 			} else {
2433 				vm_object_madvise(current->ba.object,
2434 						  pindex, delta, behav);
2435 			}
2436 
2437 			/*
2438 			 * Try to pre-populate the page table.
2439 			 */
2440 			if (behav == MADV_WILLNEED) {
2441 				pmap_object_init_pt(
2442 				    map->pmap, current,
2443 				    useStart,
2444 				    (delta << PAGE_SHIFT),
2445 				    MAP_PREFAULT_MADVISE
2446 				);
2447 			}
2448 		}
2449 		vm_map_unlock_read(map);
2450 	}
2451 	vm_map_entry_release(count);
2452 	return(error);
2453 }
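
/*
 * Illustrative sketch (hypothetical helper, not compiled): prefault a
 * range the caller expects to touch soon.  The value argument is only
 * meaningful for extended advisories and is passed as 0 here.
 */
#if 0
static int
example_willneed(vm_map_t map, vm_offset_t start, vm_size_t size)
{
	return (vm_map_madvise(map, start, start + size, MADV_WILLNEED, 0));
}
#endif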
2454 
2455 
2456 /*
2457  * Sets the inheritance of the specified address range in the target map.
2458  * Inheritance affects how the map will be shared with child maps at the
2459  * time of vm_map_fork.
2460  */
2461 int
2462 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
2463 	       vm_inherit_t new_inheritance)
2464 {
2465 	vm_map_entry_t entry;
2466 	vm_map_entry_t temp_entry;
2467 	int count;
2468 
2469 	switch (new_inheritance) {
2470 	case VM_INHERIT_NONE:
2471 	case VM_INHERIT_COPY:
2472 	case VM_INHERIT_SHARE:
2473 		break;
2474 	default:
2475 		return (KERN_INVALID_ARGUMENT);
2476 	}
2477 
2478 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2479 	vm_map_lock(map);
2480 
2481 	VM_MAP_RANGE_CHECK(map, start, end);
2482 
2483 	if (vm_map_lookup_entry(map, start, &temp_entry)) {
2484 		entry = temp_entry;
2485 		vm_map_clip_start(map, entry, start, &count);
2486 	} else if (temp_entry) {
2487 		entry = vm_map_rb_tree_RB_NEXT(temp_entry);
2488 	} else {
2489 		entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2490 	}
2491 
2492 	while (entry && entry->ba.start < end) {
2493 		vm_map_clip_end(map, entry, end, &count);
2494 
2495 		entry->inheritance = new_inheritance;
2496 
2497 		vm_map_simplify_entry(map, entry, &count);
2498 
2499 		entry = vm_map_rb_tree_RB_NEXT(entry);
2500 	}
2501 	vm_map_unlock(map);
2502 	vm_map_entry_release(count);
2503 	return (KERN_SUCCESS);
2504 }
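
/*
 * Illustrative sketch (hypothetical helper, not compiled): mark a
 * shared window so children created by fork continue to share it
 * rather than receiving a copy-on-write copy.
 */
#if 0
static int
example_share_with_children(vm_map_t map, vm_offset_t start, vm_size_t size)
{
	return (vm_map_inherit(map, start, start + size, VM_INHERIT_SHARE));
}
#endif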
2505 
2506 /*
2507  * Wiring/Unwiring of memory for user-related operation.
2508  *
2509  * Implements the semantics of mlock().
2510  *
2511  * The name of this function is horrid.  It both wires and unwires, using
2512  * user wiring semantics (whereas vm_map_wire() both wires and unwires, using
2513  * kernel wiring semantics).  XXX change name to vm_map_user_wiring().
2514  */
2515 int
2516 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t real_end,
2517 	      boolean_t new_pageable)
2518 {
2519 	vm_map_entry_t entry;
2520 	vm_map_entry_t start_entry;
2521 	vm_offset_t end;
2522 	int rv = KERN_SUCCESS;
2523 	int count;
2524 
2525 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2526 	vm_map_lock(map);
2527 	VM_MAP_RANGE_CHECK(map, start, real_end);
2528 	end = real_end;
2529 
2530 	start_entry = vm_map_clip_range(map, start, end, &count,
2531 					MAP_CLIP_NO_HOLES);
2532 	if (start_entry == NULL) {
2533 		vm_map_unlock(map);
2534 		vm_map_entry_release(count);
2535 		return (KERN_INVALID_ADDRESS);
2536 	}
2537 
2538 	if (new_pageable == 0) {
2539 		entry = start_entry;
2540 		while (entry && entry->ba.start < end) {
2541 			vm_offset_t save_start;
2542 			vm_offset_t save_end;
2543 
2544 			/*
2545 			 * Already user wired or hard wired (trivial cases)
2546 			 */
2547 			if (entry->eflags & MAP_ENTRY_USER_WIRED) {
2548 				entry = vm_map_rb_tree_RB_NEXT(entry);
2549 				continue;
2550 			}
2551 			if (entry->wired_count != 0) {
2552 				entry->wired_count++;
2553 				entry->eflags |= MAP_ENTRY_USER_WIRED;
2554 				entry = vm_map_rb_tree_RB_NEXT(entry);
2555 				continue;
2556 			}
2557 
2558 			/*
2559 			 * A new wiring requires instantiation of appropriate
2560 			 * management structures and the faulting in of the
2561 			 * page.
2562 			 */
2563 			if (entry->maptype == VM_MAPTYPE_NORMAL) {
2564 				int copyflag = entry->eflags &
2565 					       MAP_ENTRY_NEEDS_COPY;
2566 				if (copyflag && ((entry->protection &
2567 						  VM_PROT_WRITE) != 0)) {
2568 					vm_map_entry_shadow(entry);
2569 				} else if (entry->ba.object == NULL &&
2570 					   !map->system_map) {
2571 					vm_map_entry_allocate_object(entry);
2572 				}
2573 			}
2574 			entry->wired_count++;
2575 			entry->eflags |= MAP_ENTRY_USER_WIRED;
2576 
2577 			/*
2578 			 * Now fault in the area.  Note that vm_fault_wire()
2579 			 * may release the map lock temporarily, it will be
2580 			 * relocked on return.  The in-transition
2581 			 * flag protects the entries.
2582 			 */
2583 			save_start = entry->ba.start;
2584 			save_end = entry->ba.end;
2585 			rv = vm_fault_wire(map, entry, TRUE, 0);
2586 			if (rv) {
2587 				CLIP_CHECK_BACK(entry, save_start);
2588 				for (;;) {
2589 					KASSERT(entry->wired_count == 1, ("bad wired_count on entry"));
2590 					entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2591 					entry->wired_count = 0;
2592 					if (entry->ba.end == save_end)
2593 						break;
2594 					entry = vm_map_rb_tree_RB_NEXT(entry);
2595 					KASSERT(entry,
2596 					     ("bad entry clip during backout"));
2597 				}
2598 				end = save_start;	/* unwire the rest */
2599 				break;
2600 			}
2601 			/*
2602 			 * note that even though the entry might have been
2603 			 * clipped, the USER_WIRED flag we set prevents
2604 			 * duplication so we do not have to do a
2605 			 * clip check.
2606 			 */
2607 			entry = vm_map_rb_tree_RB_NEXT(entry);
2608 		}
2609 
2610 		/*
2611 		 * If we failed fall through to the unwiring section to
2612 		 * unwire what we had wired so far.  'end' has already
2613 		 * been adjusted.
2614 		 */
2615 		if (rv)
2616 			new_pageable = 1;
2617 
2618 		/*
2619 		 * start_entry might have been clipped if we unlocked the
2620 		 * map and blocked.  No matter how clipped it has gotten
2621 		 * there should be a fragment that is on our start boundary.
2622 		 */
2623 		CLIP_CHECK_BACK(start_entry, start);
2624 	}
2625 
2626 	/*
2627 	 * Deal with the unwiring case.
2628 	 */
2629 	if (new_pageable) {
2630 		/*
2631 		 * This is the unwiring case.  We must first ensure that the
2632 		 * range to be unwired is really wired down.  We know there
2633 		 * are no holes.
2634 		 */
2635 		entry = start_entry;
2636 		while (entry && entry->ba.start < end) {
2637 			if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
2638 				rv = KERN_INVALID_ARGUMENT;
2639 				goto done;
2640 			}
2641 			KASSERT(entry->wired_count != 0,
2642 				("wired count was 0 with USER_WIRED set! %p",
2643 				 entry));
2644 			entry = vm_map_rb_tree_RB_NEXT(entry);
2645 		}
2646 
2647 		/*
2648 		 * Now decrement the wiring count for each region. If a region
2649 		 * becomes completely unwired, unwire its physical pages and
2650 		 * mappings.
2651 		 */
2652 		/*
2653 		 * The map entries are processed in a loop, checking to
2654 		 * make sure each entry is wired and asserting that it has
2655 		 * a wired count.  Note that the unwiring loop below must
2656 		 * be seeded with start_entry.  If it instead picked up the
2657 		 * terminal value of "entry" left over from the validation
2658 		 * loop above, the second loop would never be entered and
2659 		 * the pages backing the entries would never be unwired,
2660 		 * leading to a leak of wired pages.
2661 		 */
2662 		entry = start_entry;
2663 		while (entry && entry->ba.start < end) {
2664 			KASSERT(entry->eflags & MAP_ENTRY_USER_WIRED,
2665 				("expected USER_WIRED on entry %p", entry));
2666 			entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2667 			entry->wired_count--;
2668 			if (entry->wired_count == 0)
2669 				vm_fault_unwire(map, entry);
2670 			entry = vm_map_rb_tree_RB_NEXT(entry);
2671 		}
2672 	}
2673 done:
2674 	vm_map_unclip_range(map, start_entry, start, real_end, &count,
2675 		MAP_CLIP_NO_HOLES);
2676 	vm_map_unlock(map);
2677 	vm_map_entry_release(count);
2678 
2679 	return (rv);
2680 }
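
/*
 * Illustrative sketch (hypothetical helper, not compiled): mlock()-style
 * wiring of a user range and the matching unwire.  new_pageable == FALSE
 * wires, TRUE unwires.
 */
#if 0
static int
example_user_wire(vm_map_t map, vm_offset_t start, vm_size_t size)
{
	int rv;

	rv = vm_map_unwire(map, start, start + size, FALSE);	/* wire */
	if (rv == KERN_SUCCESS)
		rv = vm_map_unwire(map, start, start + size, TRUE); /* unwire */
	return (rv);
}
#endif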
2681 
2682 /*
2683  * Wiring/Unwiring of memory for kernel-related operation.
2684  *
2685  * XXX the naming is horrid.  Change name to vm_map_wiring().
2686  *
2687  * Sets the pageability of the specified address range in the target map.
2688  * Regions specified as not pageable require locked-down physical
2689  * memory and physical page maps.
2690  *
2691  * The map must not be locked, but a reference must remain to the map
2692  * throughout the call.
2693  *
2694  * This function may be called via the zalloc path and must properly
2695  * reserve map entries for kernel_map.
2696  *
2697  * No requirements.
2698  */
2699 int
2700 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t real_end, int kmflags)
2701 {
2702 	vm_map_entry_t entry;
2703 	vm_map_entry_t start_entry;
2704 	vm_offset_t end;
2705 	int rv = KERN_SUCCESS;
2706 	int count;
2707 
2708 	if (kmflags & KM_KRESERVE)
2709 		count = vm_map_entry_kreserve(MAP_RESERVE_COUNT);
2710 	else
2711 		count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2712 	vm_map_lock(map);
2713 	VM_MAP_RANGE_CHECK(map, start, real_end);
2714 	end = real_end;
2715 
2716 	start_entry = vm_map_clip_range(map, start, end, &count,
2717 					MAP_CLIP_NO_HOLES);
2718 	if (start_entry == NULL) {
2719 		vm_map_unlock(map);
2720 		rv = KERN_INVALID_ADDRESS;
2721 		goto failure;
2722 	}
2723 	if ((kmflags & KM_PAGEABLE) == 0) {
2724 		/*
2725 		 * Wiring.
2726 		 *
2727 		 * 1.  Holding the write lock, we create any shadow or zero-fill
2728 		 * objects that need to be created. Then we clip each map
2729 		 * entry to the region to be wired and increment its wiring
2730 		 * count.  We create objects before clipping the map entries
2731 		 * to avoid object proliferation.
2732 		 *
2733 		 * 2.  We downgrade to a read lock, and call vm_fault_wire to
2734 		 * fault in the pages for any newly wired area (wired_count is
2735 		 * 1).
2736 		 *
2737 		 * Downgrading to a read lock for vm_fault_wire avoids a
2738 		 * possible deadlock with another process that may have faulted
2739 		 * on one of the pages to be wired (it would mark the page busy,
2740 		 * blocking us, then in turn block on the map lock that we
2741 		 * hold).  Because of problems in the recursive lock package,
2742 		 * we cannot upgrade to a write lock in vm_map_lookup.  Thus,
2743 		 * any actions that require the write lock must be done
2744 		 * beforehand.  Because we keep the read lock on the map, the
2745 		 * copy-on-write status of the entries we modify here cannot
2746 		 * change.
2747 		 */
2748 		entry = start_entry;
2749 		while (entry && entry->ba.start < end) {
2750 			/*
2751 			 * Trivial case if the entry is already wired
2752 			 */
2753 			if (entry->wired_count) {
2754 				entry->wired_count++;
2755 				entry = vm_map_rb_tree_RB_NEXT(entry);
2756 				continue;
2757 			}
2758 
2759 			/*
2760 			 * The entry is being newly wired, we have to setup
2761 			 * appropriate management structures.  A shadow
2762 			 * object is required for a copy-on-write region,
2763 			 * or a normal object for a zero-fill region.  We
2764 			 * do not have to do this for entries that point to sub
2765 			 * maps because we won't hold the lock on the sub map.
2766 			 */
2767 			if (entry->maptype == VM_MAPTYPE_NORMAL) {
2768 				int copyflag = entry->eflags &
2769 					       MAP_ENTRY_NEEDS_COPY;
2770 				if (copyflag && ((entry->protection &
2771 						  VM_PROT_WRITE) != 0)) {
2772 					vm_map_entry_shadow(entry);
2773 				} else if (entry->ba.object == NULL &&
2774 					   !map->system_map) {
2775 					vm_map_entry_allocate_object(entry);
2776 				}
2777 			}
2778 			entry->wired_count++;
2779 			entry = vm_map_rb_tree_RB_NEXT(entry);
2780 		}
2781 
2782 		/*
2783 		 * Pass 2.
2784 		 */
2785 
2786 		/*
2787 		 * HACK HACK HACK HACK
2788 		 *
2789 		 * vm_fault_wire() temporarily unlocks the map to avoid
2790 		 * deadlocks.  The in-transition flag from vm_map_clip_range
2791 		 * call should protect us from changes while the map is
2792 		 * unlocked.  T
2793 		 *
2794 		 * NOTE: Previously this comment stated that clipping might
2795 		 *	 still occur while the entry is unlocked, but from
2796 		 *	 what I can tell it actually cannot.
2797 		 *
2798 		 *	 It is unclear whether the CLIP_CHECK_*() calls
2799 		 *	 are still needed but we keep them in anyway.
2800 		 *
2801 		 * HACK HACK HACK HACK
2802 		 */
2803 
2804 		entry = start_entry;
2805 		while (entry && entry->ba.start < end) {
2806 			/*
2807 			 * If vm_fault_wire fails for any page we need to undo
2808 			 * what has been done.  We decrement the wiring count
2809 			 * for those pages which have not yet been wired (now)
2810 			 * and unwire those that have (later).
2811 			 */
2812 			vm_offset_t save_start = entry->ba.start;
2813 			vm_offset_t save_end = entry->ba.end;
2814 
2815 			if (entry->wired_count == 1)
2816 				rv = vm_fault_wire(map, entry, FALSE, kmflags);
2817 			if (rv) {
2818 				CLIP_CHECK_BACK(entry, save_start);
2819 				for (;;) {
2820 					KASSERT(entry->wired_count == 1,
2821 					  ("wired_count changed unexpectedly"));
2822 					entry->wired_count = 0;
2823 					if (entry->ba.end == save_end)
2824 						break;
2825 					entry = vm_map_rb_tree_RB_NEXT(entry);
2826 					KASSERT(entry,
2827 					  ("bad entry clip during backout"));
2828 				}
2829 				end = save_start;
2830 				break;
2831 			}
2832 			CLIP_CHECK_FWD(entry, save_end);
2833 			entry = vm_map_rb_tree_RB_NEXT(entry);
2834 		}
2835 
2836 		/*
2837 		 * If a failure occurred, undo everything by falling through
2838 		 * to the unwiring code.  'end' has already been adjusted
2839 		 * appropriately.
2840 		 */
2841 		if (rv)
2842 			kmflags |= KM_PAGEABLE;
2843 
2844 		/*
2845 		 * start_entry is still IN_TRANSITION but may have been
2846 		 * clipped since vm_fault_wire() unlocks and relocks the
2847 		 * map.  No matter how clipped it has gotten there should
2848 		 * be a fragment that is on our start boundary.
2849 		 */
2850 		CLIP_CHECK_BACK(start_entry, start);
2851 	}
2852 
2853 	if (kmflags & KM_PAGEABLE) {
2854 		/*
2855 		 * This is the unwiring case.  We must first ensure that the
2856 		 * range to be unwired is really wired down.  We know there
2857 		 * are no holes.
2858 		 */
2859 		entry = start_entry;
2860 		while (entry && entry->ba.start < end) {
2861 			if (entry->wired_count == 0) {
2862 				rv = KERN_INVALID_ARGUMENT;
2863 				goto done;
2864 			}
2865 			entry = vm_map_rb_tree_RB_NEXT(entry);
2866 		}
2867 
2868 		/*
2869 		 * Now decrement the wiring count for each region. If a region
2870 		 * becomes completely unwired, unwire its physical pages and
2871 		 * mappings.
2872 		 */
2873 		entry = start_entry;
2874 		while (entry && entry->ba.start < end) {
2875 			entry->wired_count--;
2876 			if (entry->wired_count == 0)
2877 				vm_fault_unwire(map, entry);
2878 			entry = vm_map_rb_tree_RB_NEXT(entry);
2879 		}
2880 	}
2881 done:
2882 	vm_map_unclip_range(map, start_entry, start, real_end,
2883 			    &count, MAP_CLIP_NO_HOLES);
2884 	vm_map_unlock(map);
2885 failure:
2886 	if (kmflags & KM_KRESERVE)
2887 		vm_map_entry_krelease(count);
2888 	else
2889 		vm_map_entry_release(count);
2890 	return (rv);
2891 }
2892 
2893 /*
2894  * Mark a newly allocated address range as wired but do not fault in
2895  * the pages.  The caller is expected to load the pages into the object.
2896  *
2897  * The map must be locked on entry and will remain locked on return.
2898  * No other requirements.
2899  */
2900 void
2901 vm_map_set_wired_quick(vm_map_t map, vm_offset_t addr, vm_size_t size,
2902 		       int *countp)
2903 {
2904 	vm_map_entry_t scan;
2905 	vm_map_entry_t entry;
2906 
2907 	entry = vm_map_clip_range(map, addr, addr + size,
2908 				  countp, MAP_CLIP_NO_HOLES);
2909 	scan = entry;
2910 	while (scan && scan->ba.start < addr + size) {
2911 		KKASSERT(scan->wired_count == 0);
2912 		scan->wired_count = 1;
2913 		scan = vm_map_rb_tree_RB_NEXT(scan);
2914 	}
2915 	vm_map_unclip_range(map, entry, addr, addr + size,
2916 			    countp, MAP_CLIP_NO_HOLES);
2917 }
2918 
2919 /*
2920  * Push any dirty cached pages in the address range to their pager.
2921  * If syncio is TRUE, dirty pages are written synchronously.
2922  * If invalidate is TRUE, any cached pages are freed as well.
2923  *
2924  * This routine is called by sys_msync()
2925  *
2926  * Returns an error if any part of the specified range is not mapped.
2927  *
2928  * No requirements.
2929  */
2930 int
2931 vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end,
2932 	     boolean_t syncio, boolean_t invalidate)
2933 {
2934 	vm_map_entry_t current;
2935 	vm_map_entry_t next;
2936 	vm_map_entry_t entry;
2937 	vm_map_backing_t ba;
2938 	vm_size_t size;
2939 	vm_object_t object;
2940 	vm_ooffset_t offset;
2941 
2942 	vm_map_lock_read(map);
2943 	VM_MAP_RANGE_CHECK(map, start, end);
2944 	if (!vm_map_lookup_entry(map, start, &entry)) {
2945 		vm_map_unlock_read(map);
2946 		return (KERN_INVALID_ADDRESS);
2947 	}
2948 	lwkt_gettoken(&map->token);
2949 
2950 	/*
2951 	 * Make a first pass to check for holes.
2952 	 */
2953 	current = entry;
2954 	while (current && current->ba.start < end) {
2955 		if (current->maptype == VM_MAPTYPE_SUBMAP) {
2956 			lwkt_reltoken(&map->token);
2957 			vm_map_unlock_read(map);
2958 			return (KERN_INVALID_ARGUMENT);
2959 		}
2960 		next = vm_map_rb_tree_RB_NEXT(current);
2961 		if (end > current->ba.end &&
2962 		    (next == NULL ||
2963 		     current->ba.end != next->ba.start)) {
2964 			lwkt_reltoken(&map->token);
2965 			vm_map_unlock_read(map);
2966 			return (KERN_INVALID_ADDRESS);
2967 		}
2968 		current = next;
2969 	}
2970 
2971 	if (invalidate)
2972 		pmap_remove(vm_map_pmap(map), start, end);
2973 
2974 	/*
2975 	 * Make a second pass, cleaning/uncaching pages from the indicated
2976 	 * objects as we go.
2977 	 */
2978 	current = entry;
2979 	while (current && current->ba.start < end) {
2980 		offset = current->ba.offset + (start - current->ba.start);
2981 		size = (end <= current->ba.end ? end : current->ba.end) - start;
2982 
2983 		switch(current->maptype) {
2984 		case VM_MAPTYPE_SUBMAP:
2985 		{
2986 			vm_map_t smap;
2987 			vm_map_entry_t tentry;
2988 			vm_size_t tsize;
2989 
2990 			smap = current->ba.sub_map;
2991 			vm_map_lock_read(smap);
2992 			vm_map_lookup_entry(smap, offset, &tentry);
2993 			if (tentry == NULL) {
2994 				tsize = vm_map_max(smap) - offset;
2995 				ba = NULL;
2996 				offset = 0 + (offset - vm_map_min(smap));
2997 			} else {
2998 				tsize = tentry->ba.end - offset;
2999 				ba = &tentry->ba;
3000 				offset = tentry->ba.offset +
3001 					 (offset - tentry->ba.start);
3002 			}
3003 			vm_map_unlock_read(smap);
3004 			if (tsize < size)
3005 				size = tsize;
3006 			break;
3007 		}
3008 		case VM_MAPTYPE_NORMAL:
3009 			ba = &current->ba;
3010 			break;
3011 		default:
3012 			ba = NULL;
3013 			break;
3014 		}
3015 		if (ba) {
3016 			object = ba->object;
3017 			if (object)
3018 				vm_object_hold(object);
3019 		} else {
3020 			object = NULL;
3021 		}
3022 
3023 		/*
3024 		 * Note that there is absolutely no sense in writing out
3025 		 * anonymous objects, so we track down the vnode object
3026 		 * to write out.
3027 		 * We invalidate (remove) all pages from the address space
3028 		 * anyway, for semantic correctness.
3029 		 *
3030 		 * note: certain anonymous maps, such as MAP_NOSYNC maps,
3031 		 * may start out with a NULL object.
3032 		 *
3033 		 * XXX do we really want to stop at the first backing store
3034 		 * here if there are more? XXX
3035 		 */
3036 		if (ba) {
3037 			vm_object_t tobj;
3038 
3039 			tobj = object;
3040 			while (ba->backing_ba != NULL) {
3041 				offset -= ba->offset;
3042 				ba = ba->backing_ba;
3043 				offset += ba->offset;
3044 				tobj = ba->object;
3045 				if (tobj->size < OFF_TO_IDX(offset + size))
3046 					size = IDX_TO_OFF(tobj->size) - offset;
3047 				break; /* XXX this break is not correct */
3048 			}
3049 			if (object != tobj) {
3050 				if (object)
3051 					vm_object_drop(object);
3052 				object = tobj;
3053 				vm_object_hold(object);
3054 			}
3055 		}
3056 
3057 		if (object && (object->type == OBJT_VNODE) &&
3058 		    (current->protection & VM_PROT_WRITE) &&
3059 		    (object->flags & OBJ_NOMSYNC) == 0) {
3060 			/*
3061 			 * Flush pages if writing is allowed, invalidate them
3062 			 * if invalidation requested.  Pages undergoing I/O
3063 			 * will be ignored by vm_object_page_remove().
3064 			 *
3065 			 * We cannot lock the vnode and then wait for paging
3066 			 * to complete without deadlocking against vm_fault.
3067 			 * Instead we simply call vm_object_page_remove() and
3068 			 * allow it to block internally on a page-by-page
3069 			 * basis when it encounters pages undergoing async
3070 			 * I/O.
3071 			 */
3072 			int flags;
3073 
3074 			/* no chain wait needed for vnode objects */
3075 			vm_object_reference_locked(object);
3076 			vn_lock(object->handle, LK_EXCLUSIVE | LK_RETRY);
3077 			flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
3078 			flags |= invalidate ? OBJPC_INVAL : 0;
3079 
3080 			if (current->maptype == VM_MAPTYPE_NORMAL) {
3081 				vm_object_page_clean(object,
3082 				    OFF_TO_IDX(offset),
3083 				    OFF_TO_IDX(offset + size + PAGE_MASK),
3084 				    flags);
3085 			}
3086 			vn_unlock(((struct vnode *)object->handle));
3087 			vm_object_deallocate_locked(object);
3088 		}
3089 		if (object && invalidate &&
3090 		   ((object->type == OBJT_VNODE) ||
3091 		    (object->type == OBJT_DEVICE) ||
3092 		    (object->type == OBJT_MGTDEVICE))) {
3093 			int clean_only =
3094 				((object->type == OBJT_DEVICE) ||
3095 				(object->type == OBJT_MGTDEVICE)) ? FALSE : TRUE;
3096 			/* no chain wait needed for vnode/device objects */
3097 			vm_object_reference_locked(object);
3098 			if (current->maptype == VM_MAPTYPE_NORMAL) {
3099 				vm_object_page_remove(object,
3100 				    OFF_TO_IDX(offset),
3101 				    OFF_TO_IDX(offset + size + PAGE_MASK),
3102 				    clean_only);
3103 			}
3104 			vm_object_deallocate_locked(object);
3105 		}
3106 		start += size;
3107 		if (object)
3108 			vm_object_drop(object);
3109 		current = vm_map_rb_tree_RB_NEXT(current);
3110 	}
3111 
3112 	lwkt_reltoken(&map->token);
3113 	vm_map_unlock_read(map);
3114 
3115 	return (KERN_SUCCESS);
3116 }
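
/*
 * Illustrative sketch (hypothetical helper, not compiled): roughly the
 * call sys_msync() would make for a synchronous, invalidating flush of
 * a fully-mapped range.
 */
#if 0
static int
example_msync_range(vm_map_t map, vm_offset_t start, vm_size_t size)
{
	/*
	 * syncio=TRUE writes dirty pages synchronously and
	 * invalidate=TRUE also frees any cached pages.
	 */
	return (vm_map_clean(map, start, start + size, TRUE, TRUE));
}
#endif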
3117 
3118 /*
3119  * Make the region specified by this entry pageable.
3120  *
3121  * The vm_map must be exclusively locked.
3122  */
3123 static void
3124 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
3125 {
3126 	entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3127 	entry->wired_count = 0;
3128 	vm_fault_unwire(map, entry);
3129 }
3130 
3131 /*
3132  * Deallocate the given entry from the target map.
3133  *
3134  * The vm_map must be exclusively locked.
3135  */
3136 static void
3137 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry, int *countp)
3138 {
3139 	vm_map_entry_unlink(map, entry);
3140 	map->size -= entry->ba.end - entry->ba.start;
3141 	vm_map_entry_dispose(map, entry, countp);
3142 }
3143 
3144 /*
3145  * Deallocates the given address range from the target map.
3146  *
3147  * The vm_map must be exclusively locked.
3148  */
3149 int
3150 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end, int *countp)
3151 {
3152 	vm_object_t object;
3153 	vm_map_entry_t entry;
3154 	vm_map_entry_t first_entry;
3155 	vm_offset_t hole_start;
3156 
3157 	ASSERT_VM_MAP_LOCKED(map);
3158 	lwkt_gettoken(&map->token);
3159 again:
3160 	/*
3161 	 * Find the start of the region, and clip it.  Set entry to point
3162 	 * at the first record containing the requested address or, if no
3163 	 * such record exists, the next record with a greater address.  The
3164 	 * loop will run from this point until a record beyond the termination
3165 	 * address is encountered.
3166 	 *
3167 	 * Adjust freehint[] for either the clip case or the extension case.
3168 	 *
3169 	 * GGG see other GGG comment.
3170 	 */
3171 	if (vm_map_lookup_entry(map, start, &first_entry)) {
3172 		entry = first_entry;
3173 		vm_map_clip_start(map, entry, start, countp);
3174 		hole_start = start;
3175 	} else {
3176 		if (first_entry) {
3177 			entry = vm_map_rb_tree_RB_NEXT(first_entry);
3178 			if (entry == NULL)
3179 				hole_start = first_entry->ba.start;
3180 			else
3181 				hole_start = first_entry->ba.end;
3182 		} else {
3183 			entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
3184 			if (entry == NULL)
3185 				hole_start = vm_map_min(map);
3186 			else
3187 				hole_start = vm_map_max(map);
3188 		}
3189 	}
3190 
3191 	/*
3192 	 * Step through all entries in this region
3193 	 */
3194 	while (entry && entry->ba.start < end) {
3195 		vm_map_entry_t next;
3196 		vm_offset_t s, e;
3197 		vm_pindex_t offidxstart, offidxend, count;
3198 
3199 		/*
3200 		 * If we hit an in-transition entry we have to sleep and
3201 		 * retry.  It's easier (and not really slower) to just retry
3202 		 * since this case occurs so rarely and the hint is already
3203 		 * pointing at the right place.  We have to reset the
3204 		 * start offset so as not to accidentally delete an entry
3205 		 * another process just created in vacated space.
3206 		 */
3207 		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
3208 			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
3209 			start = entry->ba.start;
3210 			++mycpu->gd_cnt.v_intrans_coll;
3211 			++mycpu->gd_cnt.v_intrans_wait;
3212 			vm_map_transition_wait(map, 1);
3213 			goto again;
3214 		}
3215 		vm_map_clip_end(map, entry, end, countp);
3216 
3217 		s = entry->ba.start;
3218 		e = entry->ba.end;
3219 		next = vm_map_rb_tree_RB_NEXT(entry);
3220 
3221 		offidxstart = OFF_TO_IDX(entry->ba.offset);
3222 		count = OFF_TO_IDX(e - s);
3223 
3224 		switch(entry->maptype) {
3225 		case VM_MAPTYPE_NORMAL:
3226 		case VM_MAPTYPE_SUBMAP:
3227 			object = entry->ba.object;
3228 			break;
3229 		default:
3230 			object = NULL;
3231 			break;
3232 		}
3233 
3234 		/*
3235 		 * Unwire before removing addresses from the pmap; otherwise,
3236 		 * unwiring will put the entries back in the pmap.
3237 		 *
3238 		 * Generally speaking, doing a bulk pmap_remove() before
3239 		 * removing the pages from the VM object is better at
3240 		 * reducing unnecessary IPIs.  The pmap code is now optimized
3241 		 * to not blindly iterate the range when pt and pd pages
3242 		 * are missing.
3243 		 */
3244 		if (entry->wired_count != 0)
3245 			vm_map_entry_unwire(map, entry);
3246 
3247 		offidxend = offidxstart + count;
3248 
3249 		if (object == &kernel_object) {
3250 			pmap_remove(map->pmap, s, e);
3251 			vm_object_hold(object);
3252 			vm_object_page_remove(object, offidxstart,
3253 					      offidxend, FALSE);
3254 			vm_object_drop(object);
3255 		} else if (object && object->type != OBJT_DEFAULT &&
3256 			   object->type != OBJT_SWAP) {
3257 			/*
3258 			 * vnode object routines cannot be chain-locked,
3259 			 * but since we aren't removing pages from the
3260 			 * object here we can use a shared hold.
3261 			 */
3262 			vm_object_hold_shared(object);
3263 			pmap_remove(map->pmap, s, e);
3264 			vm_object_drop(object);
3265 		} else if (object) {
3266 			vm_object_hold(object);
3267 			pmap_remove(map->pmap, s, e);
3268 
3269 			if (object != NULL &&
3270 			    object->ref_count != 1 &&
3271 			    (object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) ==
3272 			     OBJ_ONEMAPPING &&
3273 			    (object->type == OBJT_DEFAULT ||
3274 			     object->type == OBJT_SWAP)) {
3275 				/*
3276 				 * When ONEMAPPING is set we can destroy the
3277 				 * pages underlying the entry's range.
3278 				 */
3279 				vm_object_page_remove(object, offidxstart,
3280 						      offidxend, FALSE);
3281 				if (object->type == OBJT_SWAP) {
3282 					swap_pager_freespace(object,
3283 							     offidxstart,
3284 							     count);
3285 				}
3286 				if (offidxend >= object->size &&
3287 				    offidxstart < object->size) {
3288 					object->size = offidxstart;
3289 				}
3290 			}
3291 			vm_object_drop(object);
3292 		} else if (entry->maptype == VM_MAPTYPE_UKSMAP) {
3293 			pmap_remove(map->pmap, s, e);
3294 		}
3295 
3296 		/*
3297 		 * Delete the entry (which may delete the object) only after
3298 		 * removing all pmap entries pointing to its pages.
3299 		 * (Otherwise, its page frames may be reallocated, and any
3300 		 * modify bits will be set in the wrong object!)
3301 		 */
3302 		vm_map_entry_delete(map, entry, countp);
3303 		entry = next;
3304 	}
3305 
3306 	/*
3307 	 * We either reached the end and use vm_map_max as the end
3308 	 * address, or we didn't and we use the next entry as the
3309 	 * end address.
3310 	 */
3311 	if (entry == NULL) {
3312 		vm_map_freehint_hole(map, hole_start,
3313 				     vm_map_max(map) - hole_start);
3314 	} else {
3315 		vm_map_freehint_hole(map, hole_start,
3316 				     entry->ba.start - hole_start);
3317 	}
3318 
3319 	lwkt_reltoken(&map->token);
3320 
3321 	return (KERN_SUCCESS);
3322 }
3323 
3324 /*
3325  * Remove the given address range from the target map.
3326  * This is the exported form of vm_map_delete.
3327  *
3328  * No requirements.
3329  */
3330 int
3331 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
3332 {
3333 	int result;
3334 	int count;
3335 
3336 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3337 	vm_map_lock(map);
3338 	VM_MAP_RANGE_CHECK(map, start, end);
3339 	result = vm_map_delete(map, start, end, &count);
3340 	vm_map_unlock(map);
3341 	vm_map_entry_release(count);
3342 
3343 	return (result);
3344 }
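
/*
 * Illustrative sketch (hypothetical helper, not compiled): munmap()-style
 * removal of a page-aligned range through the exported wrapper, which
 * handles the locking and the map entry reservation itself.
 */
#if 0
static void
example_unmap(vm_map_t map, vm_offset_t addr, vm_size_t size)
{
	(void)vm_map_remove(map, addr, addr + size);
}
#endif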
3345 
3346 /*
3347  * Assert that the target map allows the specified privilege on the
3348  * entire address region given.  The entire region must be allocated.
3349  *
3350  * The caller must specify whether the vm_map is already locked or not.
3351  */
3352 boolean_t
3353 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
3354 			vm_prot_t protection, boolean_t have_lock)
3355 {
3356 	vm_map_entry_t entry;
3357 	vm_map_entry_t tmp_entry;
3358 	boolean_t result;
3359 
3360 	if (have_lock == FALSE)
3361 		vm_map_lock_read(map);
3362 
3363 	if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
3364 		if (have_lock == FALSE)
3365 			vm_map_unlock_read(map);
3366 		return (FALSE);
3367 	}
3368 	entry = tmp_entry;
3369 
3370 	result = TRUE;
3371 	while (start < end) {
3372 		if (entry == NULL) {
3373 			result = FALSE;
3374 			break;
3375 		}
3376 
3377 		/*
3378 		 * No holes allowed!
3379 		 */
3380 
3381 		if (start < entry->ba.start) {
3382 			result = FALSE;
3383 			break;
3384 		}
3385 		/*
3386 		 * Check protection associated with entry.
3387 		 */
3388 
3389 		if ((entry->protection & protection) != protection) {
3390 			result = FALSE;
3391 			break;
3392 		}
3393 		/* go to next entry */
3394 		start = entry->ba.end;
3395 		entry = vm_map_rb_tree_RB_NEXT(entry);
3396 	}
3397 	if (have_lock == FALSE)
3398 		vm_map_unlock_read(map);
3399 	return (result);
3400 }
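
/*
 * Illustrative sketch (hypothetical helper, not compiled): verify that
 * an entire range is currently readable before operating on it.
 * have_lock == FALSE lets the routine take the read lock itself.
 */
#if 0
static boolean_t
example_range_is_readable(vm_map_t map, vm_offset_t start, vm_size_t size)
{
	return (vm_map_check_protection(map, start, start + size,
					VM_PROT_READ, FALSE));
}
#endif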
3401 
3402 /*
3403  * vm_map_backing structures are not shared across forks and must be
3404  * replicated.
3405  *
3406  * Generally speaking we must reallocate the backing_ba sequence and
3407  * also adjust it for any changes made to the base entry->ba.start and
3408  * entry->ba.end.  The first ba in the chain is of course &entry->ba,
3409  * so we only need to adjust subsequent ba's start, end, and offset.
3410  *
3411  * MAP_BACK_CLIPPED	- Called as part of a clipping replication.
3412  *			  Do not clear OBJ_ONEMAPPING.
3413  *
3414  * MAP_BACK_BASEOBJREFD - Called from vm_map_insert().  The base object
3415  *			  has already been referenced.
3416  */
3417 static
3418 void
3419 vm_map_backing_replicated(vm_map_t map, vm_map_entry_t entry, int flags)
3420 {
3421 	vm_map_backing_t ba;
3422 	vm_map_backing_t nba;
3423 	vm_object_t object;
3424 
3425 	ba = &entry->ba;
3426 	for (;;) {
3427 		ba->pmap = map->pmap;
3428 
3429 		if (ba->map_object) {
3430 			switch(entry->maptype) {
3431 			case VM_MAPTYPE_NORMAL:
3432 				object = ba->object;
3433 				if (ba != &entry->ba ||
3434 				    (flags & MAP_BACK_BASEOBJREFD) == 0) {
3435 					vm_object_reference_quick(object);
3436 				}
3437 				vm_map_backing_attach(entry, ba);
3438 				if ((flags & MAP_BACK_CLIPPED) == 0 &&
3439 				    object->ref_count > 1) {
3440 					vm_object_clear_flag(object,
3441 							     OBJ_ONEMAPPING);
3442 				}
3443 				break;
3444 			case VM_MAPTYPE_UKSMAP:
3445 				vm_map_backing_attach(entry, ba);
3446 				break;
3447 			default:
3448 				break;
3449 			}
3450 		}
3451 		if (ba->backing_ba == NULL)
3452 			break;
3453 
3454 		/*
3455 		 * NOTE: The aux_info field is retained.
3456 		 */
3457 		nba = kmalloc(sizeof(*nba), M_MAP_BACKING, M_INTWAIT);
3458 		*nba = *ba->backing_ba;
3459 		nba->offset += (ba->start - nba->start);  /* += (new - old) */
3460 		nba->start = ba->start;
3461 		nba->end = ba->end;
3462 		ba->backing_ba = nba;
3463 		ba = nba;
3464 		/* pmap is replaced at the top of the loop */
3465 	}
3466 }
3467 
3468 static
3469 void
3470 vm_map_backing_adjust_start(vm_map_entry_t entry, vm_ooffset_t start)
3471 {
3472 	vm_map_backing_t ba;
3473 
3474 	if (entry->maptype == VM_MAPTYPE_NORMAL) {
3475 		for (ba = &entry->ba; ba; ba = ba->backing_ba) {
3476 			if (ba->object) {
3477 				lockmgr(&ba->object->backing_lk, LK_EXCLUSIVE);
3478 				ba->offset += (start - ba->start);
3479 				ba->start = start;
3480 				lockmgr(&ba->object->backing_lk, LK_RELEASE);
3481 			} else {
3482 				ba->offset += (start - ba->start);
3483 				ba->start = start;
3484 			}
3485 		}
3486 	} else {
3487 		/* not an object and can't be shadowed */
3488 	}
3489 }
3490 
3491 static
3492 void
3493 vm_map_backing_adjust_end(vm_map_entry_t entry, vm_ooffset_t end)
3494 {
3495 	vm_map_backing_t ba;
3496 
3497 	if (entry->maptype == VM_MAPTYPE_NORMAL) {
3498 		for (ba = &entry->ba; ba; ba = ba->backing_ba) {
3499 			if (ba->object) {
3500 				lockmgr(&ba->object->backing_lk, LK_EXCLUSIVE);
3501 				ba->end = end;
3502 				lockmgr(&ba->object->backing_lk, LK_RELEASE);
3503 			} else {
3504 				ba->end = end;
3505 			}
3506 		}
3507 	} /* else not an object and/or can't be shadowed */
3508 }
3509 
3510 /*
3511  * Handles the dirty work of making src_entry and dst_entry copy-on-write
3512  * after src_entry has been cloned to dst_entry.  For normal entries only.
3513  *
3514  * The vm_maps must be exclusively locked.
3515  * The vm_map's token must be held.
3516  *
3517  * Because the maps are locked no faults can be in progress during the
3518  * operation.
3519  */
3520 static void
3521 vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map,
3522 		  vm_map_entry_t src_entry, vm_map_entry_t dst_entry)
3523 {
3524 	vm_object_t obj;
3525 
3526 	KKASSERT(dst_entry->maptype == VM_MAPTYPE_NORMAL);
3527 
3528 	if (src_entry->wired_count) {
3529 		/*
3530 		 * Of course, wired down pages can't be set copy-on-write.
3531 		 * Cause wired pages to be copied into the new map by
3532 		 * simulating faults (the new pages are pageable).
3533 		 *
3534 		 * Scrap ba.object (its ref-count has not yet been adjusted
3535 		 * so we can just NULL out the field).  Remove the backing
3536 		 * store.
3537 		 *
3538 		 * Then call vm_fault_copy_entry() to create a new object
3539 		 * in dst_entry and copy the wired pages from src to dst.
3540 		 *
3541 		 * The fault-copy code doesn't work with virtual page
3542 		 * tables.
3543 		 *
3544 		 * NOTE: obj is not actually an object for all MAPTYPEs,
3545 		 *	 just test against NULL.
3546 		 */
3547 		if (dst_entry->ba.map_object != NULL) {
3548 			vm_map_backing_detach(dst_entry, &dst_entry->ba);
3549 			dst_entry->ba.map_object = NULL;
3550 			vm_map_entry_dispose_ba(dst_entry,
3551 						dst_entry->ba.backing_ba);
3552 			dst_entry->ba.backing_ba = NULL;
3553 			dst_entry->ba.backing_count = 0;
3554 		}
3555 		vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry);
3556 	} else {
3557 		if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
3558 			/*
3559 			 * If the source entry is not already marked NEEDS_COPY
3560 			 * we need to write-protect the PTEs.
3561 			 */
3562 			pmap_protect(src_map->pmap,
3563 				     src_entry->ba.start,
3564 				     src_entry->ba.end,
3565 				     src_entry->protection & ~VM_PROT_WRITE);
3566 		}
3567 
3568 		/*
3569 		 * dst_entry->ba.object might be stale.  Update it (its
3570 		 * ref-count has not yet been updated so just overwrite
3571 		 * the field).
3572 		 *
3573 		 * If there is no object then we are golden.  Also, in
3574 		 * this situation if there are no backing_ba linkages then
3575 		 * we can set ba.offset to whatever we want.  For now we
3576 		 * set the offset to 0 to make debugging object sizes
3577 		 * easier.
3578 		 */
3579 		obj = src_entry->ba.object;
3580 
3581 		if (obj) {
3582 			src_entry->eflags |= (MAP_ENTRY_COW |
3583 					      MAP_ENTRY_NEEDS_COPY);
3584 			dst_entry->eflags |= (MAP_ENTRY_COW |
3585 					      MAP_ENTRY_NEEDS_COPY);
3586 			KKASSERT(dst_entry->ba.offset == src_entry->ba.offset);
3587 		} else {
3588 			dst_entry->ba.offset = 0;
3589 		}
3590 
3591 		/*
3592 		 * Normal case: allow the backing_ba link depth to
3593 		 * increase.
3594 		 */
3595 		pmap_copy(dst_map->pmap, src_map->pmap,
3596 			  dst_entry->ba.start,
3597 			  dst_entry->ba.end - dst_entry->ba.start,
3598 			  src_entry->ba.start);
3599 	}
3600 }
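
/*
 * Illustrative sketch (hypothetical entry names, kept under #if 0 and not
 * compiled): the end state vm_map_copy_entry() produces for an unwired,
 * object-backed entry.  Both entries end up flagged for copy-on-write and
 * the source PTEs are write-protected; the first write fault through
 * either map then shadows the entry (see the MAP_ENTRY_NEEDS_COPY
 * handling in vm_map_lookup() later in this file).
 */
#if 0
	/* after vm_map_copy_entry(old_map, new_map, src, dst): */
	KKASSERT((src->eflags & MAP_ENTRY_NEEDS_COPY) &&
		 (dst->eflags & MAP_ENTRY_NEEDS_COPY));
	KKASSERT(dst->ba.offset == src->ba.offset);
#endif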
3601 
3602 /*
3603  * Create a vmspace for a new process and its related vm_map based on an
3604  * existing vmspace.  The new map inherits information from the old map
3605  * according to inheritance settings.
3606  *
3607  * The source map must not be locked.
3608  * No requirements.
3609  */
3610 static void vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map,
3611 			  vm_map_entry_t old_entry, int *countp);
3612 static void vmspace_fork_uksmap_entry(struct proc *p2, struct lwp *lp2,
3613 			  vm_map_t old_map, vm_map_t new_map,
3614 			  vm_map_entry_t old_entry, int *countp);
3615 
3616 struct vmspace *
3617 vmspace_fork(struct vmspace *vm1, struct proc *p2, struct lwp *lp2)
3618 {
3619 	struct vmspace *vm2;
3620 	vm_map_t old_map = &vm1->vm_map;
3621 	vm_map_t new_map;
3622 	vm_map_entry_t old_entry;
3623 	int count;
3624 
3625 	lwkt_gettoken(&vm1->vm_map.token);
3626 	vm_map_lock(old_map);
3627 
3628 	vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map));
3629 	lwkt_gettoken(&vm2->vm_map.token);
3630 
3631 	/*
3632 	 * We must bump the timestamp to force any concurrent fault
3633 	 * to retry.
3634 	 */
3635 	bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy,
3636 	      (caddr_t)&vm1->vm_endcopy - (caddr_t)&vm1->vm_startcopy);
3637 	new_map = &vm2->vm_map;	/* XXX */
3638 	new_map->timestamp = 1;
3639 
3640 	vm_map_lock(new_map);
3641 
3642 	count = old_map->nentries;
3643 	count = vm_map_entry_reserve(count + MAP_RESERVE_COUNT);
3644 
3645 	RB_FOREACH(old_entry, vm_map_rb_tree, &old_map->rb_root) {
3646 		switch(old_entry->maptype) {
3647 		case VM_MAPTYPE_SUBMAP:
3648 			panic("vm_map_fork: encountered a submap");
3649 			break;
3650 		case VM_MAPTYPE_UKSMAP:
3651 			vmspace_fork_uksmap_entry(p2, lp2,
3652 						  old_map, new_map,
3653 						  old_entry, &count);
3654 			break;
3655 		case VM_MAPTYPE_NORMAL:
3656 			vmspace_fork_normal_entry(old_map, new_map,
3657 						  old_entry, &count);
3658 			break;
3659 		default:
3660 			/* nothing to do */
3661 			break;
3662 		}
3663 	}
3664 
3665 	new_map->size = old_map->size;
3666 	vm_map_unlock(new_map);
3667 	vm_map_unlock(old_map);
3668 	vm_map_entry_release(count);
3669 
3670 	lwkt_reltoken(&vm2->vm_map.token);
3671 	lwkt_reltoken(&vm1->vm_map.token);
3672 
3673 	return (vm2);
3674 }
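
/*
 * Hypothetical caller sketch (illustrative names, not the actual fork
 * path, kept under #if 0): vmspace_fork() is what gives a child its own
 * copy of the parent's address space during fork.  A caller roughly does:
 */
#if 0
	struct vmspace *newvm;

	newvm = vmspace_fork(p1->p_vmspace, p2, lp2);
	pmap_pinit2(vmspace_pmap(newvm));	/* finish pmap initialization */
	p2->p_vmspace = newvm;			/* install in the child */
#endif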
3675 
3676 static
3677 void
3678 vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map,
3679 			  vm_map_entry_t old_entry, int *countp)
3680 {
3681 	vm_map_entry_t new_entry;
3682 	vm_map_backing_t ba;
3683 	vm_object_t object;
3684 
3685 	/*
3686 	 * If the backing_ba link list gets too long then fault it
3687 	 * all into the head object and dispose of the list.  We do
3688 	 * this in old_entry prior to cloning in order to benefit both
3689 	 * parent and child.
3690 	 *
3691 	 * We can test our fronting object's size against its
3692 	 * resident_page_count for a really cheap (but probably not perfect)
3693 	 * all-shadowed test, allowing us to disconnect the backing_ba
3694 	 * link list early.
3695 	 */
3696 	object = old_entry->ba.object;
3697 	if (old_entry->ba.backing_ba &&
3698 	    (old_entry->ba.backing_count >= vm_map_backing_limit ||
3699 	     (vm_map_backing_shadow_test && object &&
3700 	      object->size == object->resident_page_count))) {
3701 		/*
3702 		 * If there are too many backing_ba linkages we
3703 		 * collapse everything into the head.
3704 		 *
3705 		 * This will also remove all the pte's.
3706 		 */
3707 		if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY)
3708 			vm_map_entry_shadow(old_entry);
3709 		if (object == NULL)
3710 			vm_map_entry_allocate_object(old_entry);
3711 		if (vm_fault_collapse(old_map, old_entry) == KERN_SUCCESS) {
3712 			ba = old_entry->ba.backing_ba;
3713 			old_entry->ba.backing_ba = NULL;
3714 			old_entry->ba.backing_count = 0;
3715 			vm_map_entry_dispose_ba(old_entry, ba);
3716 		}
3717 	}
3718 	object = NULL;	/* object variable is now invalid */
3719 
3720 	/*
3721 	 * Fork the entry
3722 	 */
3723 	switch (old_entry->inheritance) {
3724 	case VM_INHERIT_NONE:
3725 		break;
3726 	case VM_INHERIT_SHARE:
3727 		/*
3728 		 * Clone the entry as a shared entry.  This will look like
3729 		 * shared memory across the old and the new process.  We must
3730 		 * ensure that the object is allocated.
3731 		 */
3732 		if (old_entry->ba.object == NULL)
3733 			vm_map_entry_allocate_object(old_entry);
3734 
3735 		if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3736 			/*
3737 			 * Create the fronting vm_map_backing for
3738 			 * an entry which needs a copy, plus an extra
3739 			 * ref because we are going to duplicate it
3740 			 * in the fork.
3741 			 *
3742 			 * The call to vm_map_entry_shadow() will also clear
3743 			 * OBJ_ONEMAPPING.
3744 			 *
3745 			 * XXX no more collapse.  Still need extra ref
3746 			 * for the fork.
3747 			 */
3748 			vm_map_entry_shadow(old_entry);
3749 		} else if (old_entry->ba.object) {
3750 			object = old_entry->ba.object;
3751 		}
3752 
3753 		/*
3754 		 * Clone the entry.  We've already bumped the ref on
3755 		 * the vm_object for our new entry.
3756 		 */
3757 		new_entry = vm_map_entry_create(countp);
3758 		*new_entry = *old_entry;
3759 
3760 		new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3761 		new_entry->wired_count = 0;
3762 
3763 		/*
3764 		 * Replicate and index the vm_map_backing.  Don't share
3765 		 * the vm_map_backing across vm_map's (only across clips).
3766 		 *
3767 		 * Insert the entry into the new map -- we know we're
3768 		 * inserting at the end of the new map.
3769 		 */
3770 		vm_map_backing_replicated(new_map, new_entry, 0);
3771 		vm_map_entry_link(new_map, new_entry);
3772 
3773 		/*
3774 		 * Update the physical map
3775 		 */
3776 		pmap_copy(new_map->pmap, old_map->pmap,
3777 			  new_entry->ba.start,
3778 			  (old_entry->ba.end - old_entry->ba.start),
3779 			  old_entry->ba.start);
3780 		break;
3781 	case VM_INHERIT_COPY:
3782 		/*
3783 		 * Clone the entry and link the copy into the new map.
3784 		 *
3785 		 * Note that ref-counting adjustment for old_entry->ba.object
3786 		 * (if it isn't a special map that is) is handled by
3787 		 * vm_map_copy_entry().
3788 		 */
3789 		new_entry = vm_map_entry_create(countp);
3790 		*new_entry = *old_entry;
3791 
3792 		new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3793 		new_entry->wired_count = 0;
3794 
3795 		vm_map_backing_replicated(new_map, new_entry, 0);
3796 		vm_map_entry_link(new_map, new_entry);
3797 
3798 		/*
3799 		 * This does the actual dirty work of making both entries
3800 		 * copy-on-write, and will also handle the fronting object.
3801 		 */
3802 		vm_map_copy_entry(old_map, new_map, old_entry, new_entry);
3803 		break;
3804 	}
3805 }
3806 
3807 /*
3808  * When forking user-kernel shared maps, the map might change in the
3809  * child so do not try to copy the underlying pmap entries.
3810  */
3811 static
3812 void
3813 vmspace_fork_uksmap_entry(struct proc *p2, struct lwp *lp2,
3814 			  vm_map_t old_map, vm_map_t new_map,
3815 			  vm_map_entry_t old_entry, int *countp)
3816 {
3817 	vm_map_entry_t new_entry;
3818 
3819 	/*
3820 	 * Do not fork lpmap entries whose TIDs do not match lp2's tid.
3821 	 *
3822 	 * XXX if p2 is NULL and lp2 is non-NULL, we retain the lpmap entry
3823 	 * (this is for, e.g., resident'ing vmspaces) but set the field
3824 	 * to NULL; it should be re-established on restore. XXX NOT IMPL YET
3825 	 */
3826 	if (old_entry->aux.dev) {
3827 		switch(minor(old_entry->aux.dev)) {
3828 		case 5:
3829 			break;
3830 		case 6:
3831 			break;
3832 		case 7:
3833 			if (lp2 == NULL)
3834 				return;
3835 			if (old_entry->ba.aux_info == NULL)
3836 				return;
3837 			if (((struct lwp *)old_entry->ba.aux_info)->lwp_tid !=
3838 			    lp2->lwp_tid)
3839 				return;
3840 			break;
3841 		}
3842 	}
3843 
3844 	new_entry = vm_map_entry_create(countp);
3845 	*new_entry = *old_entry;
3846 
3847 	new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3848 	new_entry->wired_count = 0;
3849 	KKASSERT(new_entry->ba.backing_ba == NULL);
3850 
3851 	if (new_entry->aux.dev) {
3852 		switch(minor(new_entry->aux.dev)) {
3853 		case 5:
3854 			/*
3855 			 * upmap
3856 			 */
3857 			new_entry->ba.aux_info = p2;
3858 			break;
3859 		case 6:
3860 			/*
3861 			 * kpmap
3862 			 */
3863 			new_entry->ba.aux_info = NULL;
3864 			break;
3865 		case 7:
3866 			/*
3867 			 * lpmap
3868 			 */
3869 			new_entry->ba.aux_info = lp2;
3870 			break;
3871 		}
3872 	} else {
3873 		new_entry->ba.aux_info = NULL;
3874 	}
3875 
3876 	vm_map_backing_replicated(new_map, new_entry, 0);
3877 
3878 	vm_map_entry_link(new_map, new_entry);
3879 }
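
/*
 * Quick reference derived from the switch cases above: minor 5 is the
 * per-process upmap (aux_info points at the new proc), minor 6 is the
 * global kpmap (no per-process aux_info), and minor 7 is the per-lwp
 * lpmap (aux_info points at the new lwp); only the lpmap entry whose
 * saved lwp tid matches lp2 is replicated into the child.
 */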
3880 
3881 /*
3882  * Create an auto-grow stack entry
3883  *
3884  * No requirements.
3885  */
3886 int
3887 vm_map_stack (vm_map_t map, vm_offset_t *addrbos, vm_size_t max_ssize,
3888 	      int flags, vm_prot_t prot, vm_prot_t max, int cow)
3889 {
3890 	vm_map_entry_t	prev_entry;
3891 	vm_map_entry_t	next;
3892 	vm_size_t	init_ssize;
3893 	int		rv;
3894 	int		count;
3895 	vm_offset_t	tmpaddr;
3896 
3897 	cow |= MAP_IS_STACK;
3898 
3899 	if (max_ssize < sgrowsiz)
3900 		init_ssize = max_ssize;
3901 	else
3902 		init_ssize = sgrowsiz;
3903 
3904 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3905 	vm_map_lock(map);
3906 
3907 	/*
3908 	 * Find space for the mapping
3909 	 */
3910 	if ((flags & (MAP_FIXED | MAP_TRYFIXED)) == 0) {
3911 		if (vm_map_findspace(map, *addrbos, max_ssize, 1,
3912 				     flags, &tmpaddr)) {
3913 			vm_map_unlock(map);
3914 			vm_map_entry_release(count);
3915 			return (KERN_NO_SPACE);
3916 		}
3917 		*addrbos = tmpaddr;
3918 	}
3919 
3920 	/* If addr is already mapped, no go */
3921 	if (vm_map_lookup_entry(map, *addrbos, &prev_entry)) {
3922 		vm_map_unlock(map);
3923 		vm_map_entry_release(count);
3924 		return (KERN_NO_SPACE);
3925 	}
3926 
3927 #if 0
3928 	/* XXX already handled by kern_mmap() */
3929 	/* If we would blow our VMEM resource limit, no go */
3930 	if (map->size + init_ssize >
3931 	    curproc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
3932 		vm_map_unlock(map);
3933 		vm_map_entry_release(count);
3934 		return (KERN_NO_SPACE);
3935 	}
3936 #endif
3937 
3938 	/*
3939 	 * If we can't accommodate max_ssize in the current mapping,
3940 	 * no go.  However, we need to be aware that subsequent user
3941 	 * mappings might map into the space we have reserved for
3942 	 * stack, and currently this space is not protected.
3943 	 *
3944 	 * Hopefully we will at least detect this condition
3945 	 * when we try to grow the stack.
3946 	 */
3947 	if (prev_entry)
3948 		next = vm_map_rb_tree_RB_NEXT(prev_entry);
3949 	else
3950 		next = RB_MIN(vm_map_rb_tree, &map->rb_root);
3951 
3952 	if (next && next->ba.start < *addrbos + max_ssize) {
3953 		vm_map_unlock(map);
3954 		vm_map_entry_release(count);
3955 		return (KERN_NO_SPACE);
3956 	}
3957 
3958 	/*
3959 	 * We initially map a stack of only init_ssize.  We will
3960 	 * grow as needed later.  Since this is to be a grow
3961 	 * down stack, we map at the top of the range.
3962 	 *
3963 	 * Note: we would normally expect prot and max to be
3964 	 * VM_PROT_ALL, and cow to be 0.  Possibly we should
3965 	 * eliminate these as input parameters, and just
3966 	 * pass these values here in the insert call.
3967 	 */
3968 	rv = vm_map_insert(map, &count,
3969 			   NULL, NULL,
3970 			   0, NULL,
3971 			   *addrbos + max_ssize - init_ssize,
3972 	                   *addrbos + max_ssize,
3973 			   VM_MAPTYPE_NORMAL,
3974 			   VM_SUBSYS_STACK, prot, max, cow);
3975 
3976 	/* Now set the avail_ssize amount */
3977 	if (rv == KERN_SUCCESS) {
3978 		if (prev_entry)
3979 			next = vm_map_rb_tree_RB_NEXT(prev_entry);
3980 		else
3981 			next = RB_MIN(vm_map_rb_tree, &map->rb_root);
3982 		if (prev_entry != NULL) {
3983 			vm_map_clip_end(map,
3984 					prev_entry,
3985 					*addrbos + max_ssize - init_ssize,
3986 					&count);
3987 		}
3988 		if (next->ba.end   != *addrbos + max_ssize ||
3989 		    next->ba.start != *addrbos + max_ssize - init_ssize){
3990 			panic ("Bad entry start/end for new stack entry");
3991 		} else {
3992 			next->aux.avail_ssize = max_ssize - init_ssize;
3993 		}
3994 	}
3995 
3996 	vm_map_unlock(map);
3997 	vm_map_entry_release(count);
3998 	return (rv);
3999 }
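
/*
 * Hypothetical usage sketch (illustrative names such as stack_top and
 * vmspace; the real callers are the exec and mmap(MAP_STACK) paths; kept
 * under #if 0): reserve max_ssize of address space but initially map only
 * the top portion, leaving aux.avail_ssize for vm_map_growstack().
 */
#if 0
	vm_offset_t addrbos = stack_top - max_ssize;	/* bottom of space */
	int error;

	error = vm_map_stack(&vmspace->vm_map, &addrbos, max_ssize,
			     MAP_FIXED, VM_PROT_ALL, VM_PROT_ALL, 0);
	if (error != KERN_SUCCESS)
		return (ENOMEM);
#endif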
4000 
4001 /*
4002  * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if the
4003  * desired address is already mapped, or if we successfully grow
4004  * the stack.  Also returns KERN_SUCCESS if addr is outside the
4005  * stack range (this is strange, but preserves compatibility with
4006  * the grow function in vm_machdep.c).
4007  *
4008  * No requirements.
4009  */
4010 int
4011 vm_map_growstack (vm_map_t map, vm_offset_t addr)
4012 {
4013 	vm_map_entry_t prev_entry;
4014 	vm_map_entry_t stack_entry;
4015 	vm_map_entry_t next;
4016 	struct vmspace *vm;
4017 	struct lwp *lp;
4018 	struct proc *p;
4019 	vm_offset_t    end;
4020 	int grow_amount;
4021 	int rv = KERN_SUCCESS;
4022 	int is_procstack;
4023 	int use_read_lock = 1;
4024 	int count;
4025 
4026 	/*
4027 	 * Find the vm
4028 	 */
4029 	lp = curthread->td_lwp;
4030 	p = curthread->td_proc;
4031 	KKASSERT(lp != NULL);
4032 	vm = lp->lwp_vmspace;
4033 
4034 	/*
4035 	 * Growstack is only allowed on the current process.  We disallow
4036 	 * other use cases, e.g. trying to access memory via procfs that
4037 	 * the stack hasn't grown into.
4038 	 */
4039 	if (map != &vm->vm_map) {
4040 		return KERN_FAILURE;
4041 	}
4042 
4043 	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
4044 Retry:
4045 	if (use_read_lock)
4046 		vm_map_lock_read(map);
4047 	else
4048 		vm_map_lock(map);
4049 
4050 	/*
4051 	 * If addr is already in the entry range, no need to grow.
4052 	 * prev_entry returns NULL if addr is at the head.
4053 	 */
4054 	if (vm_map_lookup_entry(map, addr, &prev_entry))
4055 		goto done;
4056 	if (prev_entry)
4057 		stack_entry = vm_map_rb_tree_RB_NEXT(prev_entry);
4058 	else
4059 		stack_entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
4060 
4061 	if (stack_entry == NULL)
4062 		goto done;
4063 	if (prev_entry == NULL)
4064 		end = stack_entry->ba.start - stack_entry->aux.avail_ssize;
4065 	else
4066 		end = prev_entry->ba.end;
4067 
4068 	/*
4069 	 * This next test mimics the old grow function in vm_machdep.c.
4070 	 * It really doesn't quite make sense, but we do it anyway
4071 	 * for compatibility.
4072 	 *
4073 	 * If the stack is not growable, return success.  This signals
4074 	 * the caller to proceed as it normally would with normal vm.
4075 	 */
4076 	if (stack_entry->aux.avail_ssize < 1 ||
4077 	    addr >= stack_entry->ba.start ||
4078 	    addr <  stack_entry->ba.start - stack_entry->aux.avail_ssize) {
4079 		goto done;
4080 	}
4081 
4082 	/* Find the minimum grow amount */
4083 	grow_amount = roundup (stack_entry->ba.start - addr, PAGE_SIZE);
4084 	if (grow_amount > stack_entry->aux.avail_ssize) {
4085 		rv = KERN_NO_SPACE;
4086 		goto done;
4087 	}
4088 
4089 	/*
4090 	 * If there is no longer enough space between the entries, it is
4091 	 * a no-go; adjust the available space and fail.  Note: this
4092 	 * should only happen if the user has mapped into the
4093 	 * stack area after the stack was created, and is
4094 	 * probably an error.
4095 	 *
4096 	 * This also effectively destroys any guard page the user
4097 	 * might have intended by limiting the stack size.
4098 	 */
4099 	if (grow_amount > stack_entry->ba.start - end) {
4100 		if (use_read_lock && vm_map_lock_upgrade(map)) {
4101 			/* lost lock */
4102 			use_read_lock = 0;
4103 			goto Retry;
4104 		}
4105 		use_read_lock = 0;
4106 		stack_entry->aux.avail_ssize = stack_entry->ba.start - end;
4107 		rv = KERN_NO_SPACE;
4108 		goto done;
4109 	}
4110 
4111 	is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr;
4112 
4113 	/* If this is the main process stack, see if we're over the
4114 	 * stack limit.
4115 	 */
4116 	if (is_procstack && (vm->vm_ssize + grow_amount >
4117 			     p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
4118 		rv = KERN_NO_SPACE;
4119 		goto done;
4120 	}
4121 
4122 	/* Round up the grow amount to a multiple of sgrowsiz */
4123 	grow_amount = roundup (grow_amount, sgrowsiz);
4124 	if (grow_amount > stack_entry->aux.avail_ssize) {
4125 		grow_amount = stack_entry->aux.avail_ssize;
4126 	}
4127 	if (is_procstack && (vm->vm_ssize + grow_amount >
4128 	                     p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
4129 		grow_amount = p->p_rlimit[RLIMIT_STACK].rlim_cur - vm->vm_ssize;
4130 	}
4131 
4132 	/* If we would blow our VMEM resource limit, no go */
4133 	if (map->size + grow_amount > p->p_rlimit[RLIMIT_VMEM].rlim_cur) {
4134 		rv = KERN_NO_SPACE;
4135 		goto done;
4136 	}
4137 
4138 	if (use_read_lock && vm_map_lock_upgrade(map)) {
4139 		/* lost lock */
4140 		use_read_lock = 0;
4141 		goto Retry;
4142 	}
4143 	use_read_lock = 0;
4144 
4145 	/* Get the preliminary new entry start value */
4146 	addr = stack_entry->ba.start - grow_amount;
4147 
4148 	/* If this puts us into the previous entry, cut back our growth
4149 	 * to the available space.  Also, see the note above.
4150 	 */
4151 	if (addr < end) {
4152 		stack_entry->aux.avail_ssize = stack_entry->ba.start - end;
4153 		addr = end;
4154 	}
4155 
4156 	rv = vm_map_insert(map, &count,
4157 			   NULL, NULL,
4158 			   0, NULL,
4159 			   addr, stack_entry->ba.start,
4160 			   VM_MAPTYPE_NORMAL,
4161 			   VM_SUBSYS_STACK, VM_PROT_ALL, VM_PROT_ALL, 0);
4162 
4163 	/* Adjust the available stack space by the amount we grew. */
4164 	if (rv == KERN_SUCCESS) {
4165 		if (prev_entry) {
4166 			vm_map_clip_end(map, prev_entry, addr, &count);
4167 			next = vm_map_rb_tree_RB_NEXT(prev_entry);
4168 		} else {
4169 			next = RB_MIN(vm_map_rb_tree, &map->rb_root);
4170 		}
4171 		if (next->ba.end != stack_entry->ba.start  ||
4172 		    next->ba.start != addr) {
4173 			panic ("Bad stack grow start/end in new stack entry");
4174 		} else {
4175 			next->aux.avail_ssize =
4176 				stack_entry->aux.avail_ssize -
4177 				(next->ba.end - next->ba.start);
4178 			if (is_procstack) {
4179 				vm->vm_ssize += next->ba.end -
4180 						next->ba.start;
4181 			}
4182 		}
4183 
4184 		if (map->flags & MAP_WIREFUTURE)
4185 			vm_map_unwire(map, next->ba.start, next->ba.end, FALSE);
4186 	}
4187 
4188 done:
4189 	if (use_read_lock)
4190 		vm_map_unlock_read(map);
4191 	else
4192 		vm_map_unlock(map);
4193 	vm_map_entry_release(count);
4194 	return (rv);
4195 }
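
/*
 * Hypothetical usage sketch (modeled loosely on the page-fault path;
 * 'map' and 'fault_addr' are illustrative; kept under #if 0): before
 * faulting a user address in, give the stack a chance to grow down over
 * it.  A KERN_SUCCESS return only means "proceed with the normal fault",
 * not that any growth actually occurred.
 */
#if 0
	if (vm_map_growstack(map, fault_addr) != KERN_SUCCESS)
		return (KERN_FAILURE);
	/* fall through to normal vm_fault() processing */
#endif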
4196 
4197 /*
4198  * Unshare the specified VM space for exec.  A new vmspace is created
4199  * (forked from vmcopy if non-NULL, else empty); the old one is released.
4200  *
4201  * No requirements.
4202  */
4203 void
4204 vmspace_exec(struct proc *p, struct vmspace *vmcopy)
4205 {
4206 	struct vmspace *oldvmspace = p->p_vmspace;
4207 	struct vmspace *newvmspace;
4208 	vm_map_t map = &p->p_vmspace->vm_map;
4209 
4210 	/*
4211 	 * If we are execing a resident vmspace we fork it, otherwise
4212 	 * we create a new vmspace.  Note that exitingcnt is not
4213 	 * copied to the new vmspace.
4214 	 */
4215 	lwkt_gettoken(&oldvmspace->vm_map.token);
4216 	if (vmcopy)  {
4217 		newvmspace = vmspace_fork(vmcopy, NULL, NULL);
4218 		lwkt_gettoken(&newvmspace->vm_map.token);
4219 	} else {
4220 		newvmspace = vmspace_alloc(vm_map_min(map), vm_map_max(map));
4221 		lwkt_gettoken(&newvmspace->vm_map.token);
4222 		bcopy(&oldvmspace->vm_startcopy, &newvmspace->vm_startcopy,
4223 		      (caddr_t)&oldvmspace->vm_endcopy -
4224 		       (caddr_t)&oldvmspace->vm_startcopy);
4225 	}
4226 
4227 	/*
4228 	 * Finish initializing the vmspace before assigning it
4229 	 * to the process.  The vmspace will become the current vmspace
4230 	 * if p == curproc.
4231 	 */
4232 	pmap_pinit2(vmspace_pmap(newvmspace));
4233 	pmap_replacevm(p, newvmspace, 0);
4234 	lwkt_reltoken(&newvmspace->vm_map.token);
4235 	lwkt_reltoken(&oldvmspace->vm_map.token);
4236 	vmspace_rel(oldvmspace);
4237 }
4238 
4239 /*
4240  * Unshare the specified VM space for forcing COW.  This
4241  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
4242  */
4243 void
4244 vmspace_unshare(struct proc *p)
4245 {
4246 	struct vmspace *oldvmspace = p->p_vmspace;
4247 	struct vmspace *newvmspace;
4248 
4249 	lwkt_gettoken(&oldvmspace->vm_map.token);
4250 	if (vmspace_getrefs(oldvmspace) == 1) {
4251 		lwkt_reltoken(&oldvmspace->vm_map.token);
4252 		return;
4253 	}
4254 	newvmspace = vmspace_fork(oldvmspace, NULL, NULL);
4255 	lwkt_gettoken(&newvmspace->vm_map.token);
4256 	pmap_pinit2(vmspace_pmap(newvmspace));
4257 	pmap_replacevm(p, newvmspace, 0);
4258 	lwkt_reltoken(&newvmspace->vm_map.token);
4259 	lwkt_reltoken(&oldvmspace->vm_map.token);
4260 	vmspace_rel(oldvmspace);
4261 }
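
/*
 * Hypothetical usage sketch (illustrative only, kept under #if 0): the
 * rfork path calls vmspace_unshare() for the (RFMEM|RFPROC) == 0 case so
 * that a previously shared address space is privatized (forced into COW
 * copies) before the caller continues to modify it.
 */
#if 0
	if ((flags & (RFMEM | RFPROC)) == 0)
		vmspace_unshare(p);
#endif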
4262 
4263 /*
4264  * vm_map_hint: return the beginning of the best area suitable for
4265  * creating a new mapping with "prot" protection.
4266  *
4267  * No requirements.
4268  */
4269 vm_offset_t
4270 vm_map_hint(struct proc *p, vm_offset_t addr, vm_prot_t prot)
4271 {
4272 	struct vmspace *vms = p->p_vmspace;
4273 	struct rlimit limit;
4274 	rlim_t dsiz;
4275 
4276 	/*
4277 	 * Acquire the datasize limit for the mmap() operation; it is
4278 	 * used to place the hint beyond the heap area below.
4279 	 */
4280 	if (kern_getrlimit(RLIMIT_DATA, &limit))
4281 		limit.rlim_cur = maxdsiz;
4282 	dsiz = limit.rlim_cur;
4283 
4284 	if (!randomize_mmap || addr != 0) {
4285 		/*
4286 		 * Set a reasonable start point for the hint if it was
4287 		 * not specified or if it falls within the heap space.
4288 		 * Hinted mmap()s do not allocate out of the heap space.
4289 		 */
4290 		if (addr == 0 ||
4291 		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
4292 		     addr < round_page((vm_offset_t)vms->vm_daddr + dsiz))) {
4293 			addr = round_page((vm_offset_t)vms->vm_daddr + dsiz);
4294 		}
4295 
4296 		return addr;
4297 	}
4298 
4299 	/*
4300 	 * randomize_mmap && addr == 0.  For now randomize the
4301 	 * address within a dsiz range beyond the data limit.
4302 	 */
4303 	addr = (vm_offset_t)vms->vm_daddr + dsiz;
4304 	if (dsiz)
4305 		addr += (karc4random64() & 0x7FFFFFFFFFFFFFFFLU) % dsiz;
4306 	return (round_page(addr));
4307 }
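
/*
 * Hypothetical usage sketch (illustrative; the mmap() path is the real
 * consumer; kept under #if 0): when the user did not demand a fixed
 * address, ask vm_map_hint() for a starting point beyond the heap and
 * let the normal find-space logic pick the final range from there.
 */
#if 0
	if ((flags & MAP_FIXED) == 0)
		addr = vm_map_hint(p, addr, prot);
#endif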
4308 
4309 /*
4310  * Finds the VM object, offset, and protection for a given virtual address
4311  * in the specified map, assuming a page fault of the type specified.
4312  *
4313  * Leaves the map in question locked for read; return values are guaranteed
4314  * until a vm_map_lookup_done call is performed.  Note that the map argument
4315  * is in/out; the returned map must be used in the call to vm_map_lookup_done.
4316  *
4317  * A handle (out_entry) is returned for use in vm_map_lookup_done, to make
4318  * that fast.
4319  *
4320  * If a lookup is requested with "write protection" specified, the map may
4321  * be changed to perform virtual copying operations, although the data
4322  * referenced will remain the same.
4323  *
4324  * No requirements.
4325  */
4326 int
4327 vm_map_lookup(vm_map_t *var_map,		/* IN/OUT */
4328 	      vm_offset_t vaddr,
4329 	      vm_prot_t fault_typea,
4330 	      vm_map_entry_t *out_entry,	/* OUT */
4331 	      struct vm_map_backing **bap,	/* OUT */
4332 	      vm_pindex_t *pindex,		/* OUT */
4333 	      vm_pindex_t *pcount,		/* OUT */
4334 	      vm_prot_t *out_prot,		/* OUT */
4335 	      int *wflags)			/* OUT */
4336 {
4337 	vm_map_entry_t entry;
4338 	vm_map_t map = *var_map;
4339 	vm_prot_t prot;
4340 	vm_prot_t fault_type = fault_typea;
4341 	int use_read_lock = 1;
4342 	int rv = KERN_SUCCESS;
4343 	int count;
4344 	thread_t td = curthread;
4345 
4346 	/*
4347 	 * vm_map_entry_reserve() implements an important mitigation
4348 	 * against mmap() spam running the kernel out of vm_map_entry
4349 	 * structures, but it can also cause an infinite call recursion.
4350 	 * Use td_nest_count to prevent an infinite recursion (allows
4351 	 * the vm_map code to dig into the pcpu vm_map_entry reserve).
4352 	 */
4353 	count = 0;
4354 	if (td->td_nest_count == 0) {
4355 		++td->td_nest_count;
4356 		count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
4357 		--td->td_nest_count;
4358 	}
4359 RetryLookup:
4360 	if (use_read_lock)
4361 		vm_map_lock_read(map);
4362 	else
4363 		vm_map_lock(map);
4364 
4365 	/*
4366 	 * Always do a full lookup.  The hint doesn't get us much anymore
4367 	 * now that the map is RB'd.
4368 	 */
4369 	cpu_ccfence();
4370 	*out_entry = NULL;
4371 	*bap = NULL;
4372 
4373 	{
4374 		vm_map_entry_t tmp_entry;
4375 
4376 		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
4377 			rv = KERN_INVALID_ADDRESS;
4378 			goto done;
4379 		}
4380 		entry = tmp_entry;
4381 		*out_entry = entry;
4382 	}
4383 
4384 	/*
4385 	 * Handle submaps.
4386 	 */
4387 	if (entry->maptype == VM_MAPTYPE_SUBMAP) {
4388 		vm_map_t old_map = map;
4389 
4390 		*var_map = map = entry->ba.sub_map;
4391 		if (use_read_lock)
4392 			vm_map_unlock_read(old_map);
4393 		else
4394 			vm_map_unlock(old_map);
4395 		use_read_lock = 1;
4396 		goto RetryLookup;
4397 	}
4398 
4399 	/*
4400 	 * Check whether this task is allowed to have this page.
4401 	 * Note the special case for MAP_ENTRY_COW pages with an override.
4402 	 * This is to implement a forced COW for debuggers.
4403 	 */
4404 	if (fault_type & VM_PROT_OVERRIDE_WRITE)
4405 		prot = entry->max_protection;
4406 	else
4407 		prot = entry->protection;
4408 
4409 	fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
4410 	if ((fault_type & prot) != fault_type) {
4411 		rv = KERN_PROTECTION_FAILURE;
4412 		goto done;
4413 	}
4414 
4415 	if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
4416 	    (entry->eflags & MAP_ENTRY_COW) &&
4417 	    (fault_type & VM_PROT_WRITE) &&
4418 	    (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) {
4419 		rv = KERN_PROTECTION_FAILURE;
4420 		goto done;
4421 	}
4422 
4423 	/*
4424 	 * If this page is not pageable, we have to get it for all possible
4425 	 * accesses.
4426 	 */
4427 	*wflags = 0;
4428 	if (entry->wired_count) {
4429 		*wflags |= FW_WIRED;
4430 		prot = fault_type = entry->protection;
4431 	}
4432 
4433 	if (curthread->td_lwp && curthread->td_lwp->lwp_vmspace &&
4434 	    pmap_emulate_ad_bits(&curthread->td_lwp->lwp_vmspace->vm_pmap)) {
4435 		if ((prot & VM_PROT_WRITE) == 0)
4436 			fault_type |= VM_PROT_WRITE;
4437 	}
4438 
4439 	/*
4440 	 * Only NORMAL maps are object-based.  UKSMAPs are not.
4441 	 */
4442 	if (entry->maptype != VM_MAPTYPE_NORMAL) {
4443 		*bap = NULL;
4444 		goto skip;
4445 	}
4446 
4447 	/*
4448 	 * If the entry was copy-on-write, we either ...
4449 	 */
4450 	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4451 		/*
4452 		 * If we want to write the page, we may as well handle that
4453 		 * now since we've got the map locked.
4454 		 *
4455 		 * If we don't need to write the page, we just demote the
4456 		 * permissions allowed.
4457 		 */
4458 		if (fault_type & VM_PROT_WRITE) {
4459 			/*
4460 			 * Not allowed if TDF_NOFAULT is set as the shadowing
4461 			 * operation can deadlock against the faulting
4462 			 * function due to the copy-on-write.
4463 			 */
4464 			if (curthread->td_flags & TDF_NOFAULT) {
4465 				rv = KERN_FAILURE_NOFAULT;
4466 				goto done;
4467 			}
4468 
4469 			/*
4470 			 * Make a new vm_map_backing + object, and place it
4471 			 * in the object chain.  Note that no new references
4472 			 * have appeared -- one just moved from the map to
4473 			 * the new object.
4474 			 */
4475 			if (use_read_lock && vm_map_lock_upgrade(map)) {
4476 				/* lost lock */
4477 				use_read_lock = 0;
4478 				goto RetryLookup;
4479 			}
4480 			use_read_lock = 0;
4481 			vm_map_entry_shadow(entry);
4482 			*wflags |= FW_DIDCOW;
4483 		} else {
4484 			/*
4485 			 * We're attempting to read a copy-on-write page --
4486 			 * don't allow writes.
4487 			 */
4488 			prot &= ~VM_PROT_WRITE;
4489 		}
4490 	}
4491 
4492 	/*
4493 	 * Create an object if necessary.  This code also handles
4494 	 * partitioning large entries to improve vm_fault performance.
4495 	 */
4496 	if (entry->ba.object == NULL && !map->system_map) {
4497 		if (use_read_lock && vm_map_lock_upgrade(map))  {
4498 			/* lost lock */
4499 			use_read_lock = 0;
4500 			goto RetryLookup;
4501 		}
4502 		use_read_lock = 0;
4503 
4504 		/*
4505 		 * Partition large entries, giving each its own VM object,
4506 		 * to improve concurrent fault performance.  This is only
4507 		 * applicable to userspace.
4508 		 */
4509 		if (map != &kernel_map &&
4510 		    entry->maptype == VM_MAPTYPE_NORMAL &&
4511 		    ((entry->ba.start ^ entry->ba.end) &
4512 		     ~MAP_ENTRY_PARTITION_MASK) &&
4513 		    vm_map_partition_enable) {
4514 			if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
4515 				entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
4516 				++mycpu->gd_cnt.v_intrans_coll;
4517 				++mycpu->gd_cnt.v_intrans_wait;
4518 				vm_map_transition_wait(map, 0);
4519 				goto RetryLookup;
4520 			}
4521 			vm_map_entry_partition(map, entry, vaddr, &count);
4522 		}
4523 		vm_map_entry_allocate_object(entry);
4524 	}
4525 
4526 	/*
4527 	 * Return the object/offset from this entry.  If the entry was
4528 	 * copy-on-write or empty, it has been fixed up.
4529 	 */
4530 	*bap = &entry->ba;
4531 
4532 skip:
4533 	*pindex = OFF_TO_IDX((vaddr - entry->ba.start) + entry->ba.offset);
4534 	*pcount = OFF_TO_IDX(entry->ba.end - trunc_page(vaddr));
4535 
4536 	/*
4537 	 * Return the computed protection for the access.  On
4538 	 * success we return with a read lock held on the map.  On failure
4539 	 * we return with the map unlocked.
4540 	 */
4541 	*out_prot = prot;
4542 done:
4543 	if (rv == KERN_SUCCESS) {
4544 		if (use_read_lock == 0)
4545 			vm_map_lock_downgrade(map);
4546 	} else if (use_read_lock) {
4547 		vm_map_unlock_read(map);
4548 	} else {
4549 		vm_map_unlock(map);
4550 	}
4551 	if (count > 0)
4552 		vm_map_entry_release(count);
4553 
4554 	return (rv);
4555 }
4556 
4557 /*
4558  * Releases locks acquired by a vm_map_lookup()
4559  * (according to the handle returned by that lookup).
4560  *
4561  * No other requirements.
4562  */
4563 void
4564 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry, int count)
4565 {
4566 	/*
4567 	 * Unlock the main-level map
4568 	 */
4569 	vm_map_unlock_read(map);
4570 	if (count)
4571 		vm_map_entry_release(count);
4572 }
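
/*
 * Minimal usage sketch for the lookup/lookup_done pairing (illustrative
 * only, 'map' and 'vaddr' assumed in scope, kept under #if 0; vm_fault
 * is the real consumer).  On KERN_SUCCESS the map comes back read-locked
 * and the backing/pindex values remain valid until vm_map_lookup_done()
 * is called.  Note that 'map' may be replaced by a submap, which is why
 * it is passed by reference.
 */
#if 0
	vm_map_entry_t entry;
	struct vm_map_backing *ba;
	vm_pindex_t pindex, pcount;
	vm_prot_t prot;
	int wflags;
	int rv;

	rv = vm_map_lookup(&map, vaddr, VM_PROT_READ,
			   &entry, &ba, &pindex, &pcount, &prot, &wflags);
	if (rv == KERN_SUCCESS) {
		/* ... access ba/pindex under the read lock ... */
		vm_map_lookup_done(map, entry, 0);
	}
#endif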
4573 
4574 static void
4575 vm_map_entry_partition(vm_map_t map, vm_map_entry_t entry,
4576 		       vm_offset_t vaddr, int *countp)
4577 {
4578 	vaddr &= ~MAP_ENTRY_PARTITION_MASK;
4579 	vm_map_clip_start(map, entry, vaddr, countp);
4580 	vaddr += MAP_ENTRY_PARTITION_SIZE;
4581 	vm_map_clip_end(map, entry, vaddr, countp);
4582 }
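
/*
 * Worked example (symbolic, comment only): a fault at vaddr inside a
 * large entry clips the entry down to the partition-aligned chunk
 *
 *	[vaddr & ~MAP_ENTRY_PARTITION_MASK,
 *	 (vaddr & ~MAP_ENTRY_PARTITION_MASK) + MAP_ENTRY_PARTITION_SIZE)
 *
 * so later faults in other chunks of the original range get their own
 * entry and, via vm_map_entry_allocate_object(), their own VM object,
 * reducing object contention between concurrent faults.
 */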
4583 
4584 /*
4585  * Quick hack, needs some help to make it more SMP friendly.
4586  */
4587 void
4588 vm_map_interlock(vm_map_t map, struct vm_map_ilock *ilock,
4589 		 vm_offset_t ran_beg, vm_offset_t ran_end)
4590 {
4591 	struct vm_map_ilock *scan;
4592 
4593 	ilock->ran_beg = ran_beg;
4594 	ilock->ran_end = ran_end;
4595 	ilock->flags = 0;
4596 
4597 	spin_lock(&map->ilock_spin);
4598 restart:
4599 	for (scan = map->ilock_base; scan; scan = scan->next) {
4600 		if (ran_end > scan->ran_beg && ran_beg < scan->ran_end) {
4601 			scan->flags |= ILOCK_WAITING;
4602 			ssleep(scan, &map->ilock_spin, 0, "ilock", 0);
4603 			goto restart;
4604 		}
4605 	}
4606 	ilock->next = map->ilock_base;
4607 	map->ilock_base = ilock;
4608 	spin_unlock(&map->ilock_spin);
4609 }
4610 
4611 void
4612 vm_map_deinterlock(vm_map_t map, struct vm_map_ilock *ilock)
4613 {
4614 	struct vm_map_ilock *scan;
4615 	struct vm_map_ilock **scanp;
4616 
4617 	spin_lock(&map->ilock_spin);
4618 	scanp = &map->ilock_base;
4619 	while ((scan = *scanp) != NULL) {
4620 		if (scan == ilock) {
4621 			*scanp = ilock->next;
4622 			spin_unlock(&map->ilock_spin);
4623 			if (ilock->flags & ILOCK_WAITING)
4624 				wakeup(ilock);
4625 			return;
4626 		}
4627 		scanp = &scan->next;
4628 	}
4629 	spin_unlock(&map->ilock_spin);
4630 	panic("vm_map_deinterlock: missing ilock!");
4631 }
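
/*
 * Minimal usage sketch for the ilock range interlock (illustrative only,
 * 'map', 'ran_beg' and 'ran_end' assumed in scope, kept under #if 0):
 * block other interlockers of an overlapping range, do the work, then
 * release and wake any waiters.  The ilock structure is typically
 * allocated on the caller's stack.
 */
#if 0
	struct vm_map_ilock ilock;

	vm_map_interlock(map, &ilock, ran_beg, ran_end);
	/* ... operate on [ran_beg, ran_end) ... */
	vm_map_deinterlock(map, &ilock);
#endif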
4632 
4633 #include "opt_ddb.h"
4634 #ifdef DDB
4635 #include <ddb/ddb.h>
4636 
4637 /*
4638  * Debugging only
4639  */
4640 DB_SHOW_COMMAND(map, vm_map_print)
4641 {
4642 	static int nlines;
4643 	/* XXX convert args. */
4644 	vm_map_t map = (vm_map_t)addr;
4645 	boolean_t full = have_addr;
4646 
4647 	vm_map_entry_t entry;
4648 
4649 	db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
4650 	    (void *)map,
4651 	    (void *)map->pmap, map->nentries, map->timestamp);
4652 	nlines++;
4653 
4654 	if (!full && db_indent)
4655 		return;
4656 
4657 	db_indent += 2;
4658 	RB_FOREACH(entry, vm_map_rb_tree, &map->rb_root) {
4659 		db_iprintf("map entry %p: start=%p, end=%p\n",
4660 		    (void *)entry,
4661 		    (void *)entry->ba.start, (void *)entry->ba.end);
4662 		nlines++;
4663 		{
4664 			static char *inheritance_name[4] =
4665 			{"share", "copy", "none", "donate_copy"};
4666 
4667 			db_iprintf(" prot=%x/%x/%s",
4668 			    entry->protection,
4669 			    entry->max_protection,
4670 			    inheritance_name[(int)(unsigned char)
4671 						entry->inheritance]);
4672 			if (entry->wired_count != 0)
4673 				db_printf(", wired");
4674 		}
4675 		switch(entry->maptype) {
4676 		case VM_MAPTYPE_SUBMAP:
4677 			/* XXX no %qd in kernel.  Truncate entry->ba.offset. */
4678 			db_printf(", share=%p, offset=0x%lx\n",
4679 			    (void *)entry->ba.sub_map,
4680 			    (long)entry->ba.offset);
4681 			nlines++;
4682 
4683 			db_indent += 2;
4684 			vm_map_print((db_expr_t)(intptr_t)entry->ba.sub_map,
4685 				     full, 0, NULL);
4686 			db_indent -= 2;
4687 			break;
4688 		case VM_MAPTYPE_NORMAL:
4689 			/* XXX no %qd in kernel.  Truncate entry->ba.offset. */
4690 			db_printf(", object=%p, offset=0x%lx",
4691 			    (void *)entry->ba.object,
4692 			    (long)entry->ba.offset);
4693 			if (entry->eflags & MAP_ENTRY_COW)
4694 				db_printf(", copy (%s)",
4695 				    ((entry->eflags & MAP_ENTRY_NEEDS_COPY) ?
4696 				     "needed" : "done"));
4697 			db_printf("\n");
4698 			nlines++;
4699 
4700 			if (entry->ba.object) {
4701 				db_indent += 2;
4702 				vm_object_print((db_expr_t)(intptr_t)
4703 						entry->ba.object,
4704 						full, 0, NULL);
4705 				nlines += 4;
4706 				db_indent -= 2;
4707 			}
4708 			break;
4709 		case VM_MAPTYPE_UKSMAP:
4710 			db_printf(", uksmap=%p, offset=0x%lx",
4711 			    (void *)entry->ba.uksmap,
4712 			    (long)entry->ba.offset);
4713 			if (entry->eflags & MAP_ENTRY_COW)
4714 				db_printf(", copy (%s)",
4715 				    (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
4716 			db_printf("\n");
4717 			nlines++;
4718 			break;
4719 		default:
4720 			break;
4721 		}
4722 	}
4723 	db_indent -= 2;
4724 	if (db_indent == 0)
4725 		nlines = 0;
4726 }
4727 
4728 /*
4729  * Debugging only
4730  */
4731 DB_SHOW_COMMAND(procvm, procvm)
4732 {
4733 	struct proc *p;
4734 
4735 	if (have_addr) {
4736 		p = (struct proc *) addr;
4737 	} else {
4738 		p = curproc;
4739 	}
4740 
4741 	db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
4742 	    (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
4743 	    (void *)vmspace_pmap(p->p_vmspace));
4744 
4745 	vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
4746 }
4747 
4748 #endif /* DDB */
4749