xref: /dragonfly/sys/vm/vm_object.c (revision 65cc0652)
1 /*
2  * Copyright (c) 1991, 1993, 2013
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * The Mach Operating System project at Carnegie-Mellon University.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	from: @(#)vm_object.c	8.5 (Berkeley) 3/22/94
33  *
34  *
35  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
36  * All rights reserved.
37  *
38  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
39  *
40  * Permission to use, copy, modify and distribute this software and
41  * its documentation is hereby granted, provided that both the copyright
42  * notice and this permission notice appear in all copies of the
43  * software, derivative works or modified versions, and any portions
44  * thereof, and that both notices appear in supporting documentation.
45  *
46  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
49  *
50  * Carnegie Mellon requests users of this software to return to
51  *
52  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
53  *  School of Computer Science
54  *  Carnegie Mellon University
55  *  Pittsburgh PA 15213-3890
56  *
57  * any improvements or extensions that they make and grant Carnegie the
58  * rights to redistribute these changes.
59  *
60  * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $
61  */
62 
63 /*
64  *	Virtual memory object module.
65  */
66 
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/proc.h>		/* for curproc, pageproc */
70 #include <sys/thread.h>
71 #include <sys/vnode.h>
72 #include <sys/vmmeter.h>
73 #include <sys/mman.h>
74 #include <sys/mount.h>
75 #include <sys/kernel.h>
76 #include <sys/sysctl.h>
77 #include <sys/refcount.h>
78 
79 #include <vm/vm.h>
80 #include <vm/vm_param.h>
81 #include <vm/pmap.h>
82 #include <vm/vm_map.h>
83 #include <vm/vm_object.h>
84 #include <vm/vm_page.h>
85 #include <vm/vm_pageout.h>
86 #include <vm/vm_pager.h>
87 #include <vm/swap_pager.h>
88 #include <vm/vm_kern.h>
89 #include <vm/vm_extern.h>
90 #include <vm/vm_zone.h>
91 
92 #include <vm/vm_page2.h>
93 
94 #include <machine/specialreg.h>
95 
96 #define EASY_SCAN_FACTOR	8
97 
98 static void	vm_object_qcollapse(vm_object_t object,
99 				    vm_object_t backing_object);
100 static void	vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
101 					     int pagerflags);
102 static void	vm_object_lock_init(vm_object_t);
103 
104 
105 /*
106  *	Virtual memory objects maintain the actual data
107  *	associated with allocated virtual memory.  A given
108  *	page of memory exists within exactly one object.
109  *
110  *	An object is only deallocated when all "references"
111  *	are given up.  Only one "reference" to a given
112  *	region of an object should be writeable.
113  *
114  *	Associated with each object is a list of all resident
115  *	memory pages belonging to that object; this list is
116  *	maintained by the "vm_page" module, and locked by the object's
117  *	lock.
118  *
119  *	Each object also records a "pager" routine which is
120  *	used to retrieve (and store) pages to the proper backing
121  *	storage.  In addition, objects may be backed by other
122  *	objects from which they were virtual-copied.
123  *
124  *	The only items within the object structure which are
125  *	modified after time of creation are:
126  *		reference count		locked by object's lock
127  *		pager routine		locked by object's lock
128  *
129  */
130 
131 struct vm_object kernel_object;
132 
133 static long object_collapses;
134 static long object_bypasses;
135 
136 struct vm_object_hash vm_object_hash[VMOBJ_HSIZE];
137 
138 MALLOC_DEFINE(M_VM_OBJECT, "vm_object", "vm_object structures");
139 
140 #if defined(DEBUG_LOCKS)
141 
142 #define vm_object_vndeallocate(obj, vpp)	\
143                 debugvm_object_vndeallocate(obj, vpp, __FILE__, __LINE__)
144 
145 /*
146  * Debug helper to track hold/drop/ref/deallocate calls.
147  */
148 static void
149 debugvm_object_add(vm_object_t obj, char *file, int line, int addrem)
150 {
151 	int i;
152 
153 	i = atomic_fetchadd_int(&obj->debug_index, 1);
154 	i = i & (VMOBJ_DEBUG_ARRAY_SIZE - 1);
155 	ksnprintf(obj->debug_hold_thrs[i],
156 		  sizeof(obj->debug_hold_thrs[i]),
157 		  "%c%d:(%d):%s",
158 		  (addrem == -1 ? '-' : (addrem == 1 ? '+' : '=')),
159 		  (curthread->td_proc ? curthread->td_proc->p_pid : -1),
160 		  obj->ref_count,
161 		  curthread->td_comm);
162 	obj->debug_hold_file[i] = file;
163 	obj->debug_hold_line[i] = line;
164 #if 0
165 	/* Uncomment for debugging obj refs/derefs in reproducible cases */
166 	if (strcmp(curthread->td_comm, "sshd") == 0) {
167 		kprintf("%d %p refs=%d ar=%d file: %s/%d\n",
168 			(curthread->td_proc ? curthread->td_proc->p_pid : -1),
169 			obj, obj->ref_count, addrem, file, line);
170 	}
171 #endif
172 }
173 
174 #endif
175 
176 /*
177  * Misc low level routines
178  */
179 static void
180 vm_object_lock_init(vm_object_t obj)
181 {
182 #if defined(DEBUG_LOCKS)
183 	int i;
184 
185 	obj->debug_index = 0;
186 	for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) {
187 		obj->debug_hold_thrs[i][0] = 0;
188 		obj->debug_hold_file[i] = NULL;
189 		obj->debug_hold_line[i] = 0;
190 	}
191 #endif
192 }
193 
194 void
195 vm_object_lock_swap(void)
196 {
197 	lwkt_token_swap();
198 }
199 
200 void
201 vm_object_lock(vm_object_t obj)
202 {
203 	lwkt_gettoken(&obj->token);
204 }
205 
206 /*
207  * Returns TRUE on success
208  */
209 static int
210 vm_object_lock_try(vm_object_t obj)
211 {
212 	return(lwkt_trytoken(&obj->token));
213 }
214 
215 void
216 vm_object_lock_shared(vm_object_t obj)
217 {
218 	lwkt_gettoken_shared(&obj->token);
219 }
220 
221 void
222 vm_object_unlock(vm_object_t obj)
223 {
224 	lwkt_reltoken(&obj->token);
225 }
226 
227 void
228 vm_object_upgrade(vm_object_t obj)
229 {
230 	lwkt_reltoken(&obj->token);
231 	lwkt_gettoken(&obj->token);
232 }
233 
234 void
235 vm_object_downgrade(vm_object_t obj)
236 {
237 	lwkt_reltoken(&obj->token);
238 	lwkt_gettoken_shared(&obj->token);
239 }
240 
241 static __inline void
242 vm_object_assert_held(vm_object_t obj)
243 {
244 	ASSERT_LWKT_TOKEN_HELD(&obj->token);
245 }
246 
247 static __inline int
248 vm_quickcolor(void)
249 {
250 	globaldata_t gd = mycpu;
251 	int pg_color;
252 
253 	pg_color = (int)(intptr_t)gd->gd_curthread >> 10;
254 	pg_color += gd->gd_quick_color;
255 	gd->gd_quick_color += PQ_PRIME2;
256 
257 	return pg_color;
258 }
259 
260 void
261 VMOBJDEBUG(vm_object_hold)(vm_object_t obj VMOBJDBARGS)
262 {
263 	KKASSERT(obj != NULL);
264 
265 	/*
266 	 * Object must be held (object allocation is stable due to the caller's
267 	 * context, typically already holding the token on a parent object)
268 	 * prior to potentially blocking on the lock, otherwise the object
269 	 * can get ripped away from us.
270 	 */
271 	refcount_acquire(&obj->hold_count);
272 	vm_object_lock(obj);
273 
274 #if defined(DEBUG_LOCKS)
275 	debugvm_object_add(obj, file, line, 1);
276 #endif
277 }
278 
279 int
280 VMOBJDEBUG(vm_object_hold_try)(vm_object_t obj VMOBJDBARGS)
281 {
282 	KKASSERT(obj != NULL);
283 
284 	/*
285 	 * Object must be held (object allocation is stable due to the caller's
286 	 * context, typically already holding the token on a parent object)
287 	 * prior to potentially blocking on the lock, otherwise the object
288 	 * can get ripped away from us.
289 	 */
290 	refcount_acquire(&obj->hold_count);
291 	if (vm_object_lock_try(obj) == 0) {
292 		if (refcount_release(&obj->hold_count)) {
293 			if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD))
294 				kfree(obj, M_VM_OBJECT);
295 		}
296 		return(0);
297 	}
298 
299 #if defined(DEBUG_LOCKS)
300 	debugvm_object_add(obj, file, line, 1);
301 #endif
302 	return(1);
303 }
304 
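/*
 * Illustrative sketch (not compiled): using the non-blocking variant and
 * simply bailing out when the token cannot be acquired without blocking.
 * example_try_hold() is a hypothetical caller, not part of this file's API.
 */
#if 0
static int
example_try_hold(vm_object_t obj)
{
	if (vm_object_hold_try(obj) == 0)
		return (0);	/* would have blocked; caller retries later */
	/* ... non-blocking work on obj ... */
	vm_object_drop(obj);
	return (1);
}
#endif
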
305 void
306 VMOBJDEBUG(vm_object_hold_shared)(vm_object_t obj VMOBJDBARGS)
307 {
308 	KKASSERT(obj != NULL);
309 
310 	/*
311 	 * Object must be held (object allocation is stable due to the caller's
312 	 * context, typically already holding the token on a parent object)
313 	 * prior to potentially blocking on the lock, otherwise the object
314 	 * can get ripped away from us.
315 	 */
316 	refcount_acquire(&obj->hold_count);
317 	vm_object_lock_shared(obj);
318 
319 #if defined(DEBUG_LOCKS)
320 	debugvm_object_add(obj, file, line, 1);
321 #endif
322 }
323 
324 /*
325  * Drop the token and hold_count on the object.
326  *
327  * WARNING! Token might be shared.
328  */
329 void
330 VMOBJDEBUG(vm_object_drop)(vm_object_t obj VMOBJDBARGS)
331 {
332 	if (obj == NULL)
333 		return;
334 
335 	/*
336 	 * No new holders should be possible once we drop hold_count 1->0 as
337 	 * there is no longer any way to reference the object.
338 	 */
339 	KKASSERT(obj->hold_count > 0);
340 	if (refcount_release(&obj->hold_count)) {
341 #if defined(DEBUG_LOCKS)
342 		debugvm_object_add(obj, file, line, -1);
343 #endif
344 
345 		if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) {
346 			vm_object_unlock(obj);
347 			kfree(obj, M_VM_OBJECT);
348 		} else {
349 			vm_object_unlock(obj);
350 		}
351 	} else {
352 #if defined(DEBUG_LOCKS)
353 		debugvm_object_add(obj, file, line, -1);
354 #endif
355 		vm_object_unlock(obj);
356 	}
357 }
358 
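/*
 * Illustrative sketch (not compiled): the basic hold/drop pairing used by
 * callers of the routines above.  example_hold_drop() is hypothetical; the
 * object pointer is assumed to be kept stable by the caller's context
 * (e.g. a held parent object) before vm_object_hold() is called.
 */
#if 0
static void
example_hold_drop(vm_object_t obj)
{
	vm_object_hold(obj);	/* bump hold_count, acquire the token */
	/* ... examine or modify the object while it cannot be freed ... */
	vm_object_drop(obj);	/* release the token, drop hold_count */
}
#endif
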
359 /*
360  * Initialize a freshly allocated object, returning a held object.
361  *
362  * Used only by vm_object_allocate(), zinitna() and vm_object_init().
363  *
364  * No requirements.
365  */
366 void
367 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
368 {
369 	struct vm_object_hash *hash;
370 
371 	RB_INIT(&object->rb_memq);
372 	LIST_INIT(&object->shadow_head);
373 	lwkt_token_init(&object->token, "vmobj");
374 
375 	object->type = type;
376 	object->size = size;
377 	object->ref_count = 1;
378 	object->memattr = VM_MEMATTR_DEFAULT;
379 	object->hold_count = 0;
380 	object->flags = 0;
381 	if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
382 		vm_object_set_flag(object, OBJ_ONEMAPPING);
383 	object->paging_in_progress = 0;
384 	object->resident_page_count = 0;
385 	object->shadow_count = 0;
386 	/* cpu localization twist */
387 	object->pg_color = vm_quickcolor();
388 	object->handle = NULL;
389 	object->backing_object = NULL;
390 	object->backing_object_offset = (vm_ooffset_t)0;
391 
392 	atomic_add_int(&object->generation, 1);
393 	object->swblock_count = 0;
394 	RB_INIT(&object->swblock_root);
395 	vm_object_lock_init(object);
396 	pmap_object_init(object);
397 
398 	vm_object_hold(object);
399 
400 	hash = VMOBJ_HASH(object);
401 	lwkt_gettoken(&hash->token);
402 	TAILQ_INSERT_TAIL(&hash->list, object, object_list);
403 	lwkt_reltoken(&hash->token);
404 }
405 
406 /*
407  * Initialize a VM object.
408  */
409 void
410 vm_object_init(vm_object_t object, vm_pindex_t size)
411 {
412 	_vm_object_allocate(OBJT_DEFAULT, size, object);
413 	vm_object_drop(object);
414 }
415 
416 /*
417  * Initialize the VM objects module.
418  *
419  * Called from the low level boot code only.  Note that this occurs before
420  * kmalloc is initialized so we cannot allocate any VM objects.
421  */
422 void
423 vm_object_init1(void)
424 {
425 	int i;
426 
427 	for (i = 0; i < VMOBJ_HSIZE; ++i) {
428 		TAILQ_INIT(&vm_object_hash[i].list);
429 		lwkt_token_init(&vm_object_hash[i].token, "vmobjlst");
430 	}
431 
432 	_vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd),
433 			    &kernel_object);
434 	vm_object_drop(&kernel_object);
435 }
436 
437 void
438 vm_object_init2(void)
439 {
440 	kmalloc_set_unlimited(M_VM_OBJECT);
441 }
442 
443 /*
444  * Allocate and return a new object of the specified type and size.
445  *
446  * No requirements.
447  */
448 vm_object_t
449 vm_object_allocate(objtype_t type, vm_pindex_t size)
450 {
451 	vm_object_t obj;
452 
453 	obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
454 	_vm_object_allocate(type, size, obj);
455 	vm_object_drop(obj);
456 
457 	return (obj);
458 }
459 
460 /*
461  * This version returns a held object, allowing further atomic initialization
462  * of the object.
463  */
464 vm_object_t
465 vm_object_allocate_hold(objtype_t type, vm_pindex_t size)
466 {
467 	vm_object_t obj;
468 
469 	obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
470 	_vm_object_allocate(type, size, obj);
471 
472 	return (obj);
473 }
474 
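/*
 * Illustrative sketch (not compiled): vm_object_allocate_hold() returns the
 * new object held, letting a hypothetical caller finish initializing it
 * before any other thread can observe it, then drop the hold.
 */
#if 0
static vm_object_t
example_allocate_anon(vm_pindex_t size)
{
	vm_object_t obj;

	obj = vm_object_allocate_hold(OBJT_DEFAULT, size);
	/* ... additional initialization while still held ... */
	vm_object_drop(obj);

	return (obj);
}
#endif
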
475 /*
476  * Add an additional reference to a vm_object.  The object must already be
477  * held.  The original non-lock version is no longer supported.  The object
478  * must NOT be chain locked by anyone at the time the reference is added.
479  *
480  * Referencing a chain-locked object can blow up the fairly sensitive
481  * ref_count and shadow_count tests in the deallocator.  Most callers
482  * will call vm_object_chain_wait() prior to calling
483  * vm_object_reference_locked() to avoid the case.  The held token
484  * vm_object_reference_locked() to avoid this case.  The held token
485  *
486  * The object must be held, but may be held shared if desired (hence why
487  * we use an atomic op).
488  */
489 void
490 VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS)
491 {
492 	KKASSERT(object != NULL);
493 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
494 	KKASSERT((object->chainlk & (CHAINLK_EXCL | CHAINLK_MASK)) == 0);
495 	atomic_add_int(&object->ref_count, 1);
496 	if (object->type == OBJT_VNODE) {
497 		vref(object->handle);
498 		/* XXX what if the vnode is being destroyed? */
499 	}
500 #if defined(DEBUG_LOCKS)
501 	debugvm_object_add(object, file, line, 1);
502 #endif
503 }
504 
505 /*
506  * This version explicitly allows the chain to be held (i.e. by the
507  * caller).  The token must also be held.
508  */
509 void
510 VMOBJDEBUG(vm_object_reference_locked_chain_held)(vm_object_t object
511 	   VMOBJDBARGS)
512 {
513 	KKASSERT(object != NULL);
514 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
515 	atomic_add_int(&object->ref_count, 1);
516 	if (object->type == OBJT_VNODE) {
517 		vref(object->handle);
518 		/* XXX what if the vnode is being destroyed? */
519 	}
520 #if defined(DEBUG_LOCKS)
521 	debugvm_object_add(object, file, line, 1);
522 #endif
523 }
524 
525 /*
526  * This version is only allowed for vnode objects.
527  */
528 void
529 VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS)
530 {
531 	KKASSERT(object->type == OBJT_VNODE);
532 	atomic_add_int(&object->ref_count, 1);
533 	vref(object->handle);
534 #if defined(DEBUG_LOCKS)
535 	debugvm_object_add(object, file, line, 1);
536 #endif
537 }
538 
539 /*
540  * Object OBJ_CHAINLOCK lock handling.
541  *
542  * The caller can chain-lock backing objects recursively and then
543  * use vm_object_chain_release_all() to undo the whole chain.
544  *
545  * Chain locks are used to prevent collapses and are only applicable
546  * to OBJT_DEFAULT and OBJT_SWAP objects.  Chain locking operations
547  * on other object types are ignored.  This is also important because
548  * it allows e.g. the vnode underlying a memory mapping to take concurrent
549  * faults.
550  *
551  * The object must usually be held on entry, though intermediate
552  * objects need not be held on release.  The object must be held exclusively,
553  * NOT shared.  Note that the prefault path checks the shared state and
554  * avoids using the chain functions.
555  */
556 void
557 vm_object_chain_wait(vm_object_t object, int shared)
558 {
559 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
560 	for (;;) {
561 		uint32_t chainlk = object->chainlk;
562 
563 		cpu_ccfence();
564 		if (shared) {
565 			if (chainlk & (CHAINLK_EXCL | CHAINLK_EXCLREQ)) {
566 				tsleep_interlock(object, 0);
567 				if (atomic_cmpset_int(&object->chainlk,
568 						      chainlk,
569 						      chainlk | CHAINLK_WAIT)) {
570 					tsleep(object, PINTERLOCKED,
571 					       "objchns", 0);
572 				}
573 				/* retry */
574 			} else {
575 				break;
576 			}
577 			/* retry */
578 		} else {
579 			if (chainlk & (CHAINLK_MASK | CHAINLK_EXCL)) {
580 				tsleep_interlock(object, 0);
581 				if (atomic_cmpset_int(&object->chainlk,
582 						      chainlk,
583 						      chainlk | CHAINLK_WAIT))
584 				{
585 					tsleep(object, PINTERLOCKED,
586 					       "objchnx", 0);
587 				}
588 				/* retry */
589 			} else {
590 				if (atomic_cmpset_int(&object->chainlk,
591 						      chainlk,
592 						      chainlk & ~CHAINLK_WAIT))
593 				{
594 					if (chainlk & CHAINLK_WAIT)
595 						wakeup(object);
596 					break;
597 				}
598 				/* retry */
599 			}
600 		}
601 		/* retry */
602 	}
603 }
604 
605 void
606 vm_object_chain_acquire(vm_object_t object, int shared)
607 {
608 	if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP)
609 		return;
610 	if (vm_shared_fault == 0)
611 		shared = 0;
612 
613 	for (;;) {
614 		uint32_t chainlk = object->chainlk;
615 
616 		cpu_ccfence();
617 		if (shared) {
618 			if (chainlk & (CHAINLK_EXCL | CHAINLK_EXCLREQ)) {
619 				tsleep_interlock(object, 0);
620 				if (atomic_cmpset_int(&object->chainlk,
621 						      chainlk,
622 						      chainlk | CHAINLK_WAIT)) {
623 					tsleep(object, PINTERLOCKED,
624 					       "objchns", 0);
625 				}
626 				/* retry */
627 			} else if (atomic_cmpset_int(&object->chainlk,
628 					      chainlk, chainlk + 1)) {
629 				break;
630 			}
631 			/* retry */
632 		} else {
633 			if (chainlk & (CHAINLK_MASK | CHAINLK_EXCL)) {
634 				tsleep_interlock(object, 0);
635 				if (atomic_cmpset_int(&object->chainlk,
636 						      chainlk,
637 						      chainlk |
638 						       CHAINLK_WAIT |
639 						       CHAINLK_EXCLREQ)) {
640 					tsleep(object, PINTERLOCKED,
641 					       "objchnx", 0);
642 				}
643 				/* retry */
644 			} else {
645 				if (atomic_cmpset_int(&object->chainlk,
646 						      chainlk,
647 						      (chainlk | CHAINLK_EXCL) &
648 						      ~(CHAINLK_EXCLREQ |
649 							CHAINLK_WAIT))) {
650 					if (chainlk & CHAINLK_WAIT)
651 						wakeup(object);
652 					break;
653 				}
654 				/* retry */
655 			}
656 		}
657 		/* retry */
658 	}
659 }
660 
661 void
662 vm_object_chain_release(vm_object_t object)
663 {
664 	/*ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));*/
665 	if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP)
666 		return;
667 	KKASSERT(object->chainlk & (CHAINLK_MASK | CHAINLK_EXCL));
668 	for (;;) {
669 		uint32_t chainlk = object->chainlk;
670 
671 		cpu_ccfence();
672 		if (chainlk & CHAINLK_MASK) {
673 			if ((chainlk & CHAINLK_MASK) == 1 &&
674 			    atomic_cmpset_int(&object->chainlk,
675 					      chainlk,
676 					      (chainlk - 1) & ~CHAINLK_WAIT)) {
677 				if (chainlk & CHAINLK_WAIT)
678 					wakeup(object);
679 				break;
680 			}
681 			if ((chainlk & CHAINLK_MASK) > 1 &&
682 			    atomic_cmpset_int(&object->chainlk,
683 					      chainlk, chainlk - 1)) {
684 				break;
685 			}
686 			/* retry */
687 		} else {
688 			KKASSERT(chainlk & CHAINLK_EXCL);
689 			if (atomic_cmpset_int(&object->chainlk,
690 					      chainlk,
691 					      chainlk & ~(CHAINLK_EXCL |
692 							  CHAINLK_WAIT))) {
693 				if (chainlk & CHAINLK_WAIT)
694 					wakeup(object);
695 				break;
696 			}
697 		}
698 	}
699 }
700 
701 /*
702  * Release the chain from first_object through and including stopobj.
703  * The caller is typically holding the first and last object locked
704  * (shared or exclusive) to prevent destruction races.
705  *
706  * We release stopobj first as an optimization, since this object is most
707  * likely to be shared across multiple processes.
708  */
709 void
710 vm_object_chain_release_all(vm_object_t first_object, vm_object_t stopobj)
711 {
712 	vm_object_t backing_object;
713 	vm_object_t object;
714 
715 	vm_object_chain_release(stopobj);
716 	object = first_object;
717 
718 	while (object != stopobj) {
719 		KKASSERT(object);
720 		backing_object = object->backing_object;
721 		vm_object_chain_release(object);
722 		object = backing_object;
723 	}
724 }
725 
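/*
 * Illustrative sketch (not compiled): pairing vm_object_chain_acquire()
 * with vm_object_chain_release_all() across a two-object backing chain.
 * It assumes the caller already holds both objects and that stopobj is
 * object's immediate backing object.
 */
#if 0
static void
example_chain_lock(vm_object_t object)
{
	vm_object_t stopobj = object->backing_object;

	vm_object_chain_acquire(object, 0);	/* exclusive */
	vm_object_chain_acquire(stopobj, 0);
	/* ... work that must not race a collapse ... */
	vm_object_chain_release_all(object, stopobj);
}
#endif
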
726 /*
727  * Dereference an object and its underlying vnode.  The object may be
728  * held shared.  On return the object will remain held.
729  *
730  * This function may return a vnode in *vpp which the caller must release
731  * after the caller drops its own lock.  If vpp is NULL, we assume that
732  * the caller was holding an exclusive lock on the object and we vrele()
733  * the vp ourselves.
734  */
735 static void
736 VMOBJDEBUG(vm_object_vndeallocate)(vm_object_t object, struct vnode **vpp
737 				   VMOBJDBARGS)
738 {
739 	struct vnode *vp = (struct vnode *) object->handle;
740 
741 	KASSERT(object->type == OBJT_VNODE,
742 	    ("vm_object_vndeallocate: not a vnode object"));
743 	KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
744 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
745 #ifdef INVARIANTS
746 	if (object->ref_count == 0) {
747 		vprint("vm_object_vndeallocate", vp);
748 		panic("vm_object_vndeallocate: bad object reference count");
749 	}
750 #endif
751 	for (;;) {
752 		int count = object->ref_count;
753 		cpu_ccfence();
754 		if (count == 1) {
755 			vm_object_upgrade(object);
756 			if (atomic_cmpset_int(&object->ref_count, count, 0)) {
757 				vclrflags(vp, VTEXT);
758 				break;
759 			}
760 		} else {
761 			if (atomic_cmpset_int(&object->ref_count,
762 					      count, count - 1)) {
763 				break;
764 			}
765 		}
766 		/* retry */
767 	}
768 #if defined(DEBUG_LOCKS)
769 	debugvm_object_add(object, file, line, -1);
770 #endif
771 
772 	/*
773 	 * Either vrele() the vp here, or return it for the caller to vrele().
774 	 * We can only safely vrele(vp) if the object was locked exclusively,
775 	 * but there are two races here.
776 	 *
777 	 * We had to upgrade the object above to safely clear VTEXT
778 	 * but the alternative path where the shared lock is retained
779 	 * can STILL race to 0 in other paths and cause our own vrele()
780 	 * to terminate the vnode.  We can't allow that if the VM object
781 	 * is still locked shared.
782 	 */
783 	if (vpp)
784 		*vpp = vp;
785 	else
786 		vrele(vp);
787 }
788 
789 /*
790  * Release a reference to the specified object, gained either through a
791  * vm_object_allocate or a vm_object_reference call.  When all references
792  * are gone, storage associated with this object may be relinquished.
793  *
794  * The caller does not have to hold the object locked but must have control
795  * over the reference in question in order to guarantee that the object
796  * does not get ripped out from under us.
797  *
798  * XXX Currently all deallocations require an exclusive lock.
799  */
800 void
801 VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS)
802 {
803 	struct vnode *vp;
804 	int count;
805 
806 	if (object == NULL)
807 		return;
808 
809 	for (;;) {
810 		count = object->ref_count;
811 		cpu_ccfence();
812 
813 		/*
814 		 * If decrementing the count enters into special handling
815 		 * territory (0, 1, or 2) we have to do it the hard way.
816 		 * Fortunately, objects with only a few refs like this
817 		 * are not likely to be heavily contended anyway.
818 		 *
819 		 * For vnode objects we only care about 1->0 transitions.
820 		 */
821 		if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) {
822 #if defined(DEBUG_LOCKS)
823 			debugvm_object_add(object, file, line, 0);
824 #endif
825 			vm_object_hold(object);
826 			vm_object_deallocate_locked(object);
827 			vm_object_drop(object);
828 			break;
829 		}
830 
831 		/*
832 		 * Try to decrement ref_count without acquiring a hold on
833 		 * the object.  This is particularly important for the exec*()
834 		 * and exit*() code paths because the program binary may
835 		 * have a great deal of sharing and an exclusive lock will
836 		 * crowbar performance in those circumstances.
837 		 */
838 		if (object->type == OBJT_VNODE) {
839 			vp = (struct vnode *)object->handle;
840 			if (atomic_cmpset_int(&object->ref_count,
841 					      count, count - 1)) {
842 #if defined(DEBUG_LOCKS)
843 				debugvm_object_add(object, file, line, -1);
844 #endif
845 
846 				vrele(vp);
847 				break;
848 			}
849 			/* retry */
850 		} else {
851 			if (atomic_cmpset_int(&object->ref_count,
852 					      count, count - 1)) {
853 #if defined(DEBUG_LOCKS)
854 				debugvm_object_add(object, file, line, -1);
855 #endif
856 				break;
857 			}
858 			/* retry */
859 		}
860 		/* retry */
861 	}
862 }
863 
864 void
865 VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS)
866 {
867 	struct vm_object_dealloc_list *dlist = NULL;
868 	struct vm_object_dealloc_list *dtmp;
869 	vm_object_t temp;
870 	int must_drop = 0;
871 
872 	/*
873 	 * We may chain deallocate object, but additional objects may
874 	 * collect on the dlist which also have to be deallocated.  We
875 	 * must avoid recursion; vm_object chains can get deep.
876 	 */
877 
878 again:
879 	while (object != NULL) {
880 		/*
881 		 * vnode case, caller either locked the object exclusively
882 		 * or this is a recursion with must_drop != 0 and the vnode
883 		 * object will be locked shared.
884 		 *
885 		 * If locked shared we have to drop the object before we can
886 		 * call vrele() or risk a shared/exclusive livelock.
887 		 */
888 		if (object->type == OBJT_VNODE) {
889 			ASSERT_LWKT_TOKEN_HELD(&object->token);
890 			if (must_drop) {
891 				struct vnode *tmp_vp;
892 
893 				vm_object_vndeallocate(object, &tmp_vp);
894 				vm_object_drop(object);
895 				must_drop = 0;
896 				object = NULL;
897 				vrele(tmp_vp);
898 			} else {
899 				vm_object_vndeallocate(object, NULL);
900 			}
901 			break;
902 		}
903 		ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token);
904 
905 		/*
906 		 * Normal case (object is locked exclusively)
907 		 */
908 		if (object->ref_count == 0) {
909 			panic("vm_object_deallocate: object deallocated "
910 			      "too many times: %d", object->type);
911 		}
912 		if (object->ref_count > 2) {
913 			atomic_add_int(&object->ref_count, -1);
914 #if defined(DEBUG_LOCKS)
915 			debugvm_object_add(object, file, line, -1);
916 #endif
917 			break;
918 		}
919 
920 		/*
921 		 * Here on ref_count of one or two, which are special cases for
922 		 * objects.
923 		 *
924 		 * Nominal ref_count > 1 case if the second ref is not from
925 		 * a shadow.
926 		 *
927 		 * (ONEMAPPING only applies to DEFAULT AND SWAP objects)
928 		 */
929 		if (object->ref_count == 2 && object->shadow_count == 0) {
930 			if (object->type == OBJT_DEFAULT ||
931 			    object->type == OBJT_SWAP) {
932 				vm_object_set_flag(object, OBJ_ONEMAPPING);
933 			}
934 			atomic_add_int(&object->ref_count, -1);
935 #if defined(DEBUG_LOCKS)
936 			debugvm_object_add(object, file, line, -1);
937 #endif
938 			break;
939 		}
940 
941 		/*
942 		 * If the second ref is from a shadow we chain along it
943 		 * upwards if the object's handle is exhausted (NULL).
944 		 *
945 		 * We have to decrement object->ref_count before potentially
946 		 * collapsing the first shadow object or the collapse code
947 		 * will not be able to handle the degenerate case to remove
948 		 * object.  However, if we do it too early the object can
949 		 * get ripped out from under us.
950 		 */
951 		if (object->ref_count == 2 && object->shadow_count == 1 &&
952 		    object->handle == NULL && (object->type == OBJT_DEFAULT ||
953 					       object->type == OBJT_SWAP)) {
954 			temp = LIST_FIRST(&object->shadow_head);
955 			KKASSERT(temp != NULL);
956 			vm_object_hold(temp);
957 
958 			/*
959 			 * Wait for any paging to complete so the collapse
960 			 * doesn't (or isn't likely to) qcollapse.  pip
961 			 * waiting must occur before we acquire the
962 			 * chainlock.
963 			 */
964 			while (
965 				temp->paging_in_progress ||
966 				object->paging_in_progress
967 			) {
968 				vm_object_pip_wait(temp, "objde1");
969 				vm_object_pip_wait(object, "objde2");
970 			}
971 
972 			/*
973 			 * If the parent is locked we have to give up, as
974 			 * otherwise we would be acquiring locks in the
975 			 * wrong order and potentially deadlock.
976 			 */
977 			if (temp->chainlk & (CHAINLK_EXCL | CHAINLK_MASK)) {
978 				vm_object_drop(temp);
979 				goto skip;
980 			}
981 			vm_object_chain_acquire(temp, 0);
982 
983 			/*
984 			 * Recheck/retry after the hold and the paging
985 			 * wait, both of which can block us.
986 			 */
987 			if (object->ref_count != 2 ||
988 			    object->shadow_count != 1 ||
989 			    object->handle ||
990 			    LIST_FIRST(&object->shadow_head) != temp ||
991 			    (object->type != OBJT_DEFAULT &&
992 			     object->type != OBJT_SWAP)) {
993 				vm_object_chain_release(temp);
994 				vm_object_drop(temp);
995 				continue;
996 			}
997 
998 			/*
999 			 * We can safely drop object's ref_count now.
1000 			 */
1001 			KKASSERT(object->ref_count == 2);
1002 			atomic_add_int(&object->ref_count, -1);
1003 #if defined(DEBUG_LOCKS)
1004 			debugvm_object_add(object, file, line, -1);
1005 #endif
1006 
1007 			/*
1008 			 * If our single parent is not collapsible just
1009 			 * decrement ref_count (2->1) and stop.
1010 			 */
1011 			if (temp->handle || (temp->type != OBJT_DEFAULT &&
1012 					     temp->type != OBJT_SWAP)) {
1013 				vm_object_chain_release(temp);
1014 				vm_object_drop(temp);
1015 				break;
1016 			}
1017 
1018 			/*
1019 			 * At this point we have already dropped object's
1020 			 * ref_count so it is possible for a race to
1021 			 * deallocate obj out from under us.  Any collapse
1022 			 * will re-check the situation.  We must not block
1023 			 * until we are able to collapse.
1024 			 *
1025 			 * Bump temp's ref_count to avoid an unwanted
1026 			 * degenerate recursion (can't call
1027 			 * vm_object_reference_locked() because it asserts
1028 			 * that CHAINLOCK is not set).
1029 			 */
1030 			atomic_add_int(&temp->ref_count, 1);
1031 			KKASSERT(temp->ref_count > 1);
1032 
1033 			/*
1034 			 * Collapse temp, then deallocate the extra ref
1035 			 * formally.
1036 			 */
1037 			vm_object_collapse(temp, &dlist);
1038 			vm_object_chain_release(temp);
1039 			if (must_drop) {
1040 				vm_object_lock_swap();
1041 				vm_object_drop(object);
1042 			}
1043 			object = temp;
1044 			must_drop = 1;
1045 			continue;
1046 		}
1047 
1048 		/*
1049 		 * Drop the ref and handle termination on the 1->0 transition.
1050 		 * We may have blocked above so we have to recheck.
1051 		 */
1052 skip:
1053 		KKASSERT(object->ref_count != 0);
1054 		if (object->ref_count >= 2) {
1055 			atomic_add_int(&object->ref_count, -1);
1056 #if defined(DEBUG_LOCKS)
1057 			debugvm_object_add(object, file, line, -1);
1058 #endif
1059 			break;
1060 		}
1061 		KKASSERT(object->ref_count == 1);
1062 
1063 		/*
1064 		 * 1->0 transition.  Chain through the backing_object.
1065 		 * Maintain the ref until we've located the backing object,
1066 		 * then re-check.
1067 		 */
1068 		while ((temp = object->backing_object) != NULL) {
1069 			if (temp->type == OBJT_VNODE)
1070 				vm_object_hold_shared(temp);
1071 			else
1072 				vm_object_hold(temp);
1073 			if (temp == object->backing_object)
1074 				break;
1075 			vm_object_drop(temp);
1076 		}
1077 
1078 		/*
1079 		 * 1->0 transition verified, retry if ref_count is no longer
1080 		 * 1.  Otherwise disconnect the backing_object (temp) and
1081 		 * clean up.
1082 		 */
1083 		if (object->ref_count != 1) {
1084 			vm_object_drop(temp);
1085 			continue;
1086 		}
1087 
1088 		/*
1089 		 * It shouldn't be possible for the object to be chain locked
1090 		 * if we're removing the last ref on it.
1091 		 *
1092 		 * Removing object from temp's shadow list requires dropping
1093 		 * temp, which we will do on loop.
1094 		 *
1095 		 * NOTE! vnodes do not use the shadow list, but still have
1096 		 *	 the backing_object reference.
1097 		 */
1098 		KKASSERT((object->chainlk & (CHAINLK_EXCL|CHAINLK_MASK)) == 0);
1099 
1100 		if (temp) {
1101 			if (object->flags & OBJ_ONSHADOW) {
1102 				LIST_REMOVE(object, shadow_list);
1103 				temp->shadow_count--;
1104 				atomic_add_int(&temp->generation, 1);
1105 				vm_object_clear_flag(object, OBJ_ONSHADOW);
1106 			}
1107 			object->backing_object = NULL;
1108 		}
1109 
1110 		atomic_add_int(&object->ref_count, -1);
1111 		if ((object->flags & OBJ_DEAD) == 0)
1112 			vm_object_terminate(object);
1113 		if (must_drop && temp)
1114 			vm_object_lock_swap();
1115 		if (must_drop)
1116 			vm_object_drop(object);
1117 		object = temp;
1118 		must_drop = 1;
1119 	}
1120 
1121 	if (must_drop && object)
1122 		vm_object_drop(object);
1123 
1124 	/*
1125 	 * Handle additional objects collected on the dlist iteratively to avoid
1126 	 * recursion.  Objects on the dlist have a hold count but are not locked.
1127 	 */
1128 	if ((dtmp = dlist) != NULL) {
1129 		dlist = dtmp->next;
1130 		object = dtmp->object;
1131 		kfree(dtmp, M_TEMP);
1132 
1133 		vm_object_lock(object);	/* already held, add lock */
1134 		must_drop = 1;		/* and we're responsible for it */
1135 		goto again;
1136 	}
1137 }
1138 
1139 /*
1140  * Destroy the specified object, freeing up related resources.
1141  *
1142  * The object must have zero references.
1143  *
1144  * The object must be held.  The caller is responsible for dropping the object
1145  * after terminate returns.  Terminate does NOT drop the object.
1146  */
1147 static int vm_object_terminate_callback(vm_page_t p, void *data);
1148 
1149 void
1150 vm_object_terminate(vm_object_t object)
1151 {
1152 	struct rb_vm_page_scan_info info;
1153 	struct vm_object_hash *hash;
1154 
1155 	/*
1156 	 * Make sure no one uses us.  Once we set OBJ_DEAD we should be
1157 	 * able to safely block.
1158 	 */
1159 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1160 	KKASSERT((object->flags & OBJ_DEAD) == 0);
1161 	vm_object_set_flag(object, OBJ_DEAD);
1162 
1163 	/*
1164 	 * Wait for the pageout daemon to be done with the object
1165 	 */
1166 	vm_object_pip_wait(object, "objtrm1");
1167 
1168 	KASSERT(!object->paging_in_progress,
1169 		("vm_object_terminate: pageout in progress"));
1170 
1171 	/*
1172 	 * Clean and free the pages, as appropriate. All references to the
1173 	 * object are gone, so we don't need to lock it.
1174 	 */
1175 	if (object->type == OBJT_VNODE) {
1176 		struct vnode *vp;
1177 
1178 		/*
1179 		 * Clean pages and flush buffers.
1180 		 *
1181 		 * NOTE!  TMPFS buffer flushes do not typically flush the
1182 		 *	  actual page to swap as this would be highly
1183 		 *	  inefficient, and normal filesystems usually wrap
1184 		 *	  page flushes with buffer cache buffers.
1185 		 *
1186 		 *	  To deal with this we have to call vinvalbuf() both
1187 		 *	  before and after the vm_object_page_clean().
1188 		 */
1189 		vp = (struct vnode *) object->handle;
1190 		vinvalbuf(vp, V_SAVE, 0, 0);
1191 		vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
1192 		vinvalbuf(vp, V_SAVE, 0, 0);
1193 	}
1194 
1195 	/*
1196 	 * Wait for any I/O to complete, after which there had better not
1197 	 * be any references left on the object.
1198 	 */
1199 	vm_object_pip_wait(object, "objtrm2");
1200 
1201 	if (object->ref_count != 0) {
1202 		panic("vm_object_terminate: object with references, "
1203 		      "ref_count=%d", object->ref_count);
1204 	}
1205 
1206 	/*
1207 	 * Cleanup any shared pmaps associated with this object.
1208 	 */
1209 	pmap_object_free(object);
1210 
1211 	/*
1212 	 * Now free any remaining pages. For internal objects, this also
1213 	 * removes them from paging queues. Don't free wired pages, just
1214 	 * remove them from the object.
1215 	 */
1216 	info.count = 0;
1217 	info.object = object;
1218 	do {
1219 		info.error = 0;
1220 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
1221 					vm_object_terminate_callback, &info);
1222 	} while (info.error);
1223 
1224 	/*
1225 	 * Let the pager know object is dead.
1226 	 */
1227 	vm_pager_deallocate(object);
1228 
1229 	/*
1230 	 * Wait for the object hold count to hit 1, clean out pages as
1231 	 * we go.  The hash token interlocks any race conditions that might
1232 	 * pick the object up from the object hash list after we have cleared
1233 	 * rb_memq.
1234 	 */
1235 	for (;;) {
1236 		if (RB_ROOT(&object->rb_memq) == NULL)
1237 			break;
1238 		kprintf("vm_object_terminate: Warning, object %p "
1239 			"still has %ld pages\n",
1240 			object, object->resident_page_count);
1241 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
1242 					vm_object_terminate_callback, &info);
1243 	}
1244 
1245 	/*
1246 	 * There had better not be any pages left
1247 	 */
1248 	KKASSERT(object->resident_page_count == 0);
1249 
1250 	/*
1251 	 * Remove the object from the global object list.
1252 	 */
1253 	hash = VMOBJ_HASH(object);
1254 	lwkt_gettoken(&hash->token);
1255 	TAILQ_REMOVE(&hash->list, object, object_list);
1256 	lwkt_reltoken(&hash->token);
1257 
1258 	if (object->ref_count != 0) {
1259 		panic("vm_object_terminate2: object with references, "
1260 		      "ref_count=%d", object->ref_count);
1261 	}
1262 
1263 	/*
1264 	 * NOTE: The object hold_count is at least 1, so we cannot kfree()
1265 	 *	 the object here.  See vm_object_drop().
1266 	 */
1267 }
1268 
1269 /*
1270  * The caller must hold the object.
1271  */
1272 static int
1273 vm_object_terminate_callback(vm_page_t p, void *data)
1274 {
1275 	struct rb_vm_page_scan_info *info = data;
1276 	vm_object_t object;
1277 
1278 	object = p->object;
1279 	KKASSERT(object == info->object);
1280 	if (vm_page_busy_try(p, TRUE)) {
1281 		vm_page_sleep_busy(p, TRUE, "vmotrm");
1282 		info->error = 1;
1283 		return 0;
1284 	}
1285 	if (object != p->object) {
1286 		/* XXX remove once we determine it can't happen */
1287 		kprintf("vm_object_terminate: Warning: Encountered "
1288 			"busied page %p on queue %d\n", p, p->queue);
1289 		vm_page_wakeup(p);
1290 		info->error = 1;
1291 	} else if (p->wire_count == 0) {
1292 		/*
1293 		 * NOTE: p->dirty and PG_NEED_COMMIT are ignored.
1294 		 */
1295 		vm_page_free(p);
1296 		mycpu->gd_cnt.v_pfree++;
1297 	} else {
1298 		if (p->queue != PQ_NONE)
1299 			kprintf("vm_object_terminate: Warning: Encountered "
1300 				"wired page %p on queue %d\n", p, p->queue);
1301 		vm_page_remove(p);
1302 		vm_page_wakeup(p);
1303 	}
1304 
1305 	/*
1306 	 * Must be at end to avoid SMP races, caller holds object token
1307 	 */
1308 	if ((++info->count & 63) == 0)
1309 		lwkt_user_yield();
1310 	return(0);
1311 }
1312 
1313 /*
1314  * Clean all dirty pages in the specified range of object.  Leaves page
1315  * on whatever queue it is currently on.   If NOSYNC is set then do not
1316  * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
1317  * leaving the object dirty.
1318  *
1319  * When stuffing pages asynchronously, allow clustering.  XXX we need a
1320  * synchronous clustering mode implementation.
1321  *
1322  * Odd semantics: if end == 0, the entire object is cleaned.
1323  *
1324  * The object must be locked? XXX
1325  */
1326 static int vm_object_page_clean_pass1(struct vm_page *p, void *data);
1327 static int vm_object_page_clean_pass2(struct vm_page *p, void *data);
1328 
1329 void
1330 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
1331 		     int flags)
1332 {
1333 	struct rb_vm_page_scan_info info;
1334 	struct vnode *vp;
1335 	int wholescan;
1336 	int pagerflags;
1337 	int generation;
1338 
1339 	vm_object_hold(object);
1340 	if (object->type != OBJT_VNODE ||
1341 	    (object->flags & OBJ_MIGHTBEDIRTY) == 0) {
1342 		vm_object_drop(object);
1343 		return;
1344 	}
1345 
1346 	pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ?
1347 			VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
1348 	pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0;
1349 
1350 	vp = object->handle;
1351 
1352 	/*
1353 	 * Interlock other major object operations.  This allows us to
1354 	 * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
1355 	 */
1356 	vm_object_set_flag(object, OBJ_CLEANING);
1357 
1358 	/*
1359 	 * Handle 'entire object' case
1360 	 */
1361 	info.start_pindex = start;
1362 	if (end == 0) {
1363 		info.end_pindex = object->size - 1;
1364 	} else {
1365 		info.end_pindex = end - 1;
1366 	}
1367 	wholescan = (start == 0 && info.end_pindex == object->size - 1);
1368 	info.limit = flags;
1369 	info.pagerflags = pagerflags;
1370 	info.object = object;
1371 
1372 	/*
1373 	 * If cleaning the entire object do a pass to mark the pages read-only.
1374 	 * If everything worked out ok, clear OBJ_WRITEABLE and
1375 	 * OBJ_MIGHTBEDIRTY.
1376 	 */
1377 	if (wholescan) {
1378 		info.error = 0;
1379 		info.count = 0;
1380 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1381 					vm_object_page_clean_pass1, &info);
1382 		if (info.error == 0) {
1383 			vm_object_clear_flag(object,
1384 					     OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
1385 			if (object->type == OBJT_VNODE &&
1386 			    (vp = (struct vnode *)object->handle) != NULL) {
1387 				/*
1388 				 * Use new-style interface to clear VISDIRTY
1389 				 * because the vnode is not necessarily removed
1390 				 * from the syncer list(s) as often as it was
1391 				 * under the old interface, which can leave
1392 				 * the vnode on the syncer list after reclaim.
1393 				 */
1394 				vclrobjdirty(vp);
1395 			}
1396 		}
1397 	}
1398 
1399 	/*
1400 	 * Do a pass to clean all the dirty pages we find.
1401 	 */
1402 	do {
1403 		info.error = 0;
1404 		info.count = 0;
1405 		generation = object->generation;
1406 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1407 					vm_object_page_clean_pass2, &info);
1408 	} while (info.error || generation != object->generation);
1409 
1410 	vm_object_clear_flag(object, OBJ_CLEANING);
1411 	vm_object_drop(object);
1412 }
1413 
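/*
 * Illustrative sketch (not compiled): synchronously flushing all dirty
 * pages of a vnode object, as vm_object_terminate() does above.  Passing
 * end == 0 selects the entire object.
 */
#if 0
	vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
#endif
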
1414 /*
1415  * The caller must hold the object.
1416  */
1417 static
1418 int
1419 vm_object_page_clean_pass1(struct vm_page *p, void *data)
1420 {
1421 	struct rb_vm_page_scan_info *info = data;
1422 
1423 	KKASSERT(p->object == info->object);
1424 
1425 	vm_page_flag_set(p, PG_CLEANCHK);
1426 	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1427 		info->error = 1;
1428 	} else if (vm_page_busy_try(p, FALSE)) {
1429 		info->error = 1;
1430 	} else {
1431 		KKASSERT(p->object == info->object);
1432 		vm_page_protect(p, VM_PROT_READ);
1433 		vm_page_wakeup(p);
1434 	}
1435 
1436 	/*
1437 	 * Must be at end to avoid SMP races, caller holds object token
1438 	 */
1439 	if ((++info->count & 63) == 0)
1440 		lwkt_user_yield();
1441 	return(0);
1442 }
1443 
1444 /*
1445  * The caller must hold the object
1446  */
1447 static
1448 int
1449 vm_object_page_clean_pass2(struct vm_page *p, void *data)
1450 {
1451 	struct rb_vm_page_scan_info *info = data;
1452 	int generation;
1453 
1454 	KKASSERT(p->object == info->object);
1455 
1456 	/*
1457 	 * Do not mess with pages that were inserted after we started
1458 	 * the cleaning pass.
1459 	 */
1460 	if ((p->flags & PG_CLEANCHK) == 0)
1461 		goto done;
1462 
1463 	generation = info->object->generation;
1464 
1465 	if (vm_page_busy_try(p, TRUE)) {
1466 		vm_page_sleep_busy(p, TRUE, "vpcwai");
1467 		info->error = 1;
1468 		goto done;
1469 	}
1470 
1471 	KKASSERT(p->object == info->object &&
1472 		 info->object->generation == generation);
1473 
1474 	/*
1475 	 * Before wasting time traversing the pmaps, check for trivial
1476 	 * cases where the page cannot be dirty.
1477 	 */
1478 	if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) {
1479 		KKASSERT((p->dirty & p->valid) == 0 &&
1480 			 (p->flags & PG_NEED_COMMIT) == 0);
1481 		vm_page_wakeup(p);
1482 		goto done;
1483 	}
1484 
1485 	/*
1486 	 * Check whether the page is dirty or not.  The page has been set
1487 	 * to be read-only so the check will not race a user dirtying the
1488 	 * page.
1489 	 */
1490 	vm_page_test_dirty(p);
1491 	if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) {
1492 		vm_page_flag_clear(p, PG_CLEANCHK);
1493 		vm_page_wakeup(p);
1494 		goto done;
1495 	}
1496 
1497 	/*
1498 	 * If we have been asked to skip nosync pages and this is a
1499 	 * nosync page, skip it.  Note that the object flags were
1500 	 * not cleared in this case (because pass1 will have returned an
1501 	 * error), so we do not have to set them.
1502 	 */
1503 	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1504 		vm_page_flag_clear(p, PG_CLEANCHK);
1505 		vm_page_wakeup(p);
1506 		goto done;
1507 	}
1508 
1509 	/*
1510 	 * Flush as many pages as we can.  PG_CLEANCHK will be cleared on
1511 	 * the pages that get successfully flushed.  Set info->error if
1512 	 * we raced an object modification.
1513 	 */
1514 	vm_object_page_collect_flush(info->object, p, info->pagerflags);
1515 	/* vm_wait_nominal(); this can deadlock the system in syncer/pageout */
1516 
1517 	/*
1518 	 * Must be at end to avoid SMP races, caller holds object token
1519 	 */
1520 done:
1521 	if ((++info->count & 63) == 0)
1522 		lwkt_user_yield();
1523 	return(0);
1524 }
1525 
1526 /*
1527  * Collect the specified page and nearby pages and flush them out.
1528  * The passed page is busied by the caller and we are responsible for
1529  * its disposition.
1530  *
1531  * The caller must hold the object.
1532  */
1533 static void
1534 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
1535 {
1536 	int error;
1537 	int is;
1538 	int ib;
1539 	int i;
1540 	int page_base;
1541 	vm_pindex_t pi;
1542 	vm_page_t ma[BLIST_MAX_ALLOC];
1543 
1544 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1545 
1546 	pi = p->pindex;
1547 	page_base = pi % BLIST_MAX_ALLOC;
1548 	ma[page_base] = p;
1549 	ib = page_base - 1;
1550 	is = page_base + 1;
1551 
1552 	while (ib >= 0) {
1553 		vm_page_t tp;
1554 
1555 		tp = vm_page_lookup_busy_try(object, pi - page_base + ib,
1556 					     TRUE, &error);
1557 		if (error)
1558 			break;
1559 		if (tp == NULL)
1560 			break;
1561 		if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
1562 		    (tp->flags & PG_CLEANCHK) == 0) {
1563 			vm_page_wakeup(tp);
1564 			break;
1565 		}
1566 		if ((tp->queue - tp->pc) == PQ_CACHE) {
1567 			vm_page_flag_clear(tp, PG_CLEANCHK);
1568 			vm_page_wakeup(tp);
1569 			break;
1570 		}
1571 		vm_page_test_dirty(tp);
1572 		if ((tp->dirty & tp->valid) == 0 &&
1573 		    (tp->flags & PG_NEED_COMMIT) == 0) {
1574 			vm_page_flag_clear(tp, PG_CLEANCHK);
1575 			vm_page_wakeup(tp);
1576 			break;
1577 		}
1578 		ma[ib] = tp;
1579 		--ib;
1580 	}
1581 	++ib;	/* fixup */
1582 
1583 	while (is < BLIST_MAX_ALLOC &&
1584 	       pi - page_base + is < object->size) {
1585 		vm_page_t tp;
1586 
1587 		tp = vm_page_lookup_busy_try(object, pi - page_base + is,
1588 					     TRUE, &error);
1589 		if (error)
1590 			break;
1591 		if (tp == NULL)
1592 			break;
1593 		if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
1594 		    (tp->flags & PG_CLEANCHK) == 0) {
1595 			vm_page_wakeup(tp);
1596 			break;
1597 		}
1598 		if ((tp->queue - tp->pc) == PQ_CACHE) {
1599 			vm_page_flag_clear(tp, PG_CLEANCHK);
1600 			vm_page_wakeup(tp);
1601 			break;
1602 		}
1603 		vm_page_test_dirty(tp);
1604 		if ((tp->dirty & tp->valid) == 0 &&
1605 		    (tp->flags & PG_NEED_COMMIT) == 0) {
1606 			vm_page_flag_clear(tp, PG_CLEANCHK);
1607 			vm_page_wakeup(tp);
1608 			break;
1609 		}
1610 		ma[is] = tp;
1611 		++is;
1612 	}
1613 
1614 	/*
1615 	 * All pages in the ma[] array are busied now
1616 	 */
1617 	for (i = ib; i < is; ++i) {
1618 		vm_page_flag_clear(ma[i], PG_CLEANCHK);
1619 		vm_page_hold(ma[i]);	/* XXX need this any more? */
1620 	}
1621 	vm_pageout_flush(&ma[ib], is - ib, pagerflags);
1622 	for (i = ib; i < is; ++i)	/* XXX need this any more? */
1623 		vm_page_unhold(ma[i]);
1624 }
1625 
1626 /*
1627  * Same as vm_object_pmap_copy, except range checking really
1628  * works, and is meant for small sections of an object.
1629  *
1630  * This code protects resident pages by making them read-only
1631  * and is typically called on a fork or split when a page
1632  * is converted to copy-on-write.
1633  *
1634  * NOTE: If the page is already at VM_PROT_NONE, calling
1635  * vm_page_protect will have no effect.
1636  */
1637 void
1638 vm_object_pmap_copy_1(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
1639 {
1640 	vm_pindex_t idx;
1641 	vm_page_t p;
1642 
1643 	if (object == NULL || (object->flags & OBJ_WRITEABLE) == 0)
1644 		return;
1645 
1646 	vm_object_hold(object);
1647 	for (idx = start; idx < end; idx++) {
1648 		p = vm_page_lookup(object, idx);
1649 		if (p == NULL)
1650 			continue;
1651 		vm_page_protect(p, VM_PROT_READ);
1652 	}
1653 	vm_object_drop(object);
1654 }
1655 
1656 /*
1657  * Removes all physical pages in the specified object range from all
1658  * physical maps.
1659  *
1660  * The object must *not* be locked.
1661  */
1662 
1663 static int vm_object_pmap_remove_callback(vm_page_t p, void *data);
1664 
1665 void
1666 vm_object_pmap_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
1667 {
1668 	struct rb_vm_page_scan_info info;
1669 
1670 	if (object == NULL)
1671 		return;
1672 	if (start == end)
1673 		return;
1674 	info.start_pindex = start;
1675 	info.end_pindex = end - 1;
1676 	info.count = 0;
1677 	info.object = object;
1678 
1679 	vm_object_hold(object);
1680 	do {
1681 		info.error = 0;
1682 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1683 					vm_object_pmap_remove_callback, &info);
1684 	} while (info.error);
1685 	if (start == 0 && end == object->size)
1686 		vm_object_clear_flag(object, OBJ_WRITEABLE);
1687 	vm_object_drop(object);
1688 }
1689 
1690 /*
1691  * The caller must hold the object
1692  */
1693 static int
1694 vm_object_pmap_remove_callback(vm_page_t p, void *data)
1695 {
1696 	struct rb_vm_page_scan_info *info = data;
1697 
1698 	if (info->object != p->object ||
1699 	    p->pindex < info->start_pindex ||
1700 	    p->pindex > info->end_pindex) {
1701 		kprintf("vm_object_pmap_remove_callback: obj/pg race %p/%p\n",
1702 			info->object, p);
1703 		info->error = 1;
1704 		return(0);
1705 	}
1706 
1707 	vm_page_protect(p, VM_PROT_NONE);
1708 
1709 	/*
1710 	 * Must be at end to avoid SMP races, caller holds object token
1711 	 */
1712 	if ((++info->count & 63) == 0)
1713 		lwkt_user_yield();
1714 	return(0);
1715 }
1716 
1717 /*
1718  * Implements the madvise function at the object/page level.
1719  *
1720  * MADV_WILLNEED	(any object)
1721  *
1722  *	Activate the specified pages if they are resident.
1723  *
1724  * MADV_DONTNEED	(any object)
1725  *
1726  *	Deactivate the specified pages if they are resident.
1727  *
1728  * MADV_FREE	(OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only)
1729  *
1730  *	Deactivate and clean the specified pages if they are
1731  *	resident.  This permits the process to reuse the pages
1732  *	without faulting or the kernel to reclaim the pages
1733  *	without I/O.
1734  *
1735  * No requirements.
1736  */
1737 void
1738 vm_object_madvise(vm_object_t object, vm_pindex_t pindex,
1739 		  vm_pindex_t count, int advise)
1740 {
1741 	vm_pindex_t end, tpindex;
1742 	vm_object_t tobject;
1743 	vm_object_t xobj;
1744 	vm_page_t m;
1745 	int error;
1746 
1747 	if (object == NULL)
1748 		return;
1749 
1750 	end = pindex + count;
1751 
1752 	vm_object_hold(object);
1753 	tobject = object;
1754 
1755 	/*
1756 	 * Locate and adjust resident pages
1757 	 */
1758 	for (; pindex < end; pindex += 1) {
1759 relookup:
1760 		if (tobject != object)
1761 			vm_object_drop(tobject);
1762 		tobject = object;
1763 		tpindex = pindex;
1764 shadowlookup:
1765 		/*
1766 		 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
1767 		 * and those pages must be OBJ_ONEMAPPING.
1768 		 */
1769 		if (advise == MADV_FREE) {
1770 			if ((tobject->type != OBJT_DEFAULT &&
1771 			     tobject->type != OBJT_SWAP) ||
1772 			    (tobject->flags & OBJ_ONEMAPPING) == 0) {
1773 				continue;
1774 			}
1775 		}
1776 
1777 		m = vm_page_lookup_busy_try(tobject, tpindex, TRUE, &error);
1778 
1779 		if (error) {
1780 			vm_page_sleep_busy(m, TRUE, "madvpo");
1781 			goto relookup;
1782 		}
1783 		if (m == NULL) {
1784 			/*
1785 			 * There may be swap even if there is no backing page
1786 			 */
1787 			if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
1788 				swap_pager_freespace(tobject, tpindex, 1);
1789 
1790 			/*
1791 			 * next object
1792 			 */
1793 			while ((xobj = tobject->backing_object) != NULL) {
1794 				KKASSERT(xobj != object);
1795 				vm_object_hold(xobj);
1796 				if (xobj == tobject->backing_object)
1797 					break;
1798 				vm_object_drop(xobj);
1799 			}
1800 			if (xobj == NULL)
1801 				continue;
1802 			tpindex += OFF_TO_IDX(tobject->backing_object_offset);
1803 			if (tobject != object) {
1804 				vm_object_lock_swap();
1805 				vm_object_drop(tobject);
1806 			}
1807 			tobject = xobj;
1808 			goto shadowlookup;
1809 		}
1810 
1811 		/*
1812 		 * If the page is not in a normal active state, we skip it.
1813 		 * If the page is not managed there are no page queues to
1814 		 * mess with.  Things can break if we mess with pages in
1815 		 * any of the below states.
1816 		 */
1817 		if (m->wire_count ||
1818 		    (m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) ||
1819 		    m->valid != VM_PAGE_BITS_ALL
1820 		) {
1821 			vm_page_wakeup(m);
1822 			continue;
1823 		}
1824 
1825 		/*
1826 		 * Theoretically once a page is known not to be busy, an
1827 		 * interrupt cannot come along and rip it out from under us.
1828 		 */
1829 
1830 		if (advise == MADV_WILLNEED) {
1831 			vm_page_activate(m);
1832 		} else if (advise == MADV_DONTNEED) {
1833 			vm_page_dontneed(m);
1834 		} else if (advise == MADV_FREE) {
1835 			/*
1836 			 * Mark the page clean.  This will allow the page
1837 			 * to be freed up by the system.  However, such pages
1838 			 * are often reused quickly by malloc()/free()
1839 			 * so we do not do anything that would cause
1840 			 * a page fault if we can help it.
1841 			 *
1842 			 * Specifically, we do not try to actually free
1843 			 * the page now nor do we try to put it in the
1844 			 * cache (which would cause a page fault on reuse).
1845 			 *
1846 			 * But we do make the page as freeable as we
1847 			 * can without actually taking the step of unmapping
1848 			 * it.
1849 			 */
1850 			pmap_clear_modify(m);
1851 			m->dirty = 0;
1852 			m->act_count = 0;
1853 			vm_page_dontneed(m);
1854 			if (tobject->type == OBJT_SWAP)
1855 				swap_pager_freespace(tobject, tpindex, 1);
1856 		}
1857 		vm_page_wakeup(m);
1858 	}
1859 	if (tobject != object)
1860 		vm_object_drop(tobject);
1861 	vm_object_drop(object);
1862 }
1863 
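/*
 * Illustrative sketch (not compiled): how a hypothetical caller such as the
 * madvise() path might apply MADV_FREE to a byte range of an anonymous
 * object.  start_offset and len are hypothetical variables; the pindex and
 * count arguments are expressed in pages.
 */
#if 0
	vm_object_madvise(object, OFF_TO_IDX(start_offset),
			  OFF_TO_IDX(len + PAGE_MASK), MADV_FREE);
#endif
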
1864 /*
1865  * Create a new object which is backed by the specified existing object
1866  * range.  Replace the pointer and offset that was pointing at the existing
1867  * object with the pointer/offset for the new object.
1868  *
1869  * If addref is non-zero the returned object is given an additional reference.
1870  * This mechanic exists to avoid the situation where refs might be 1 and
1871  * race against a collapse when the caller intends to bump it.  So the
1872  * caller cannot add the ref after the fact.  Used when the caller is
1873  * duplicating a vm_map_entry.
1874  *
1875  * No other requirements.
1876  */
1877 void
1878 vm_object_shadow(vm_object_t *objectp, vm_ooffset_t *offset, vm_size_t length,
1879 		 int addref)
1880 {
1881 	vm_object_t source;
1882 	vm_object_t result;
1883 	int useshadowlist;
1884 
1885 	source = *objectp;
1886 
1887 	/*
1888 	 * Don't create the new object if the old object isn't shared.
1889 	 * We have to chain wait before adding the reference to avoid
1890 	 * racing a collapse or deallocation.
1891 	 *
1892 	 * Clear OBJ_ONEMAPPING flag when shadowing.
1893 	 *
1894 	 * The caller owns a ref on source via *objectp which we are going
1895 	 * to replace.  This ref is inherited by the new object's backing_object
1896 	 * assignment and does not need to be incremented here.
1897 	 *
1898 	 * However, we add a temporary extra reference to the original source
1899 	 * prior to holding nobject in case we block, to avoid races where
1900 	 * someone else might believe that the source can be collapsed.
1901 	 */
1902 	useshadowlist = 0;
1903 	if (source) {
1904 		if (source->type != OBJT_VNODE) {
1905 			useshadowlist = 1;
1906 			vm_object_hold(source);
1907 			vm_object_chain_wait(source, 0);
1908 			if (source->ref_count == 1 &&
1909 			    source->handle == NULL &&
1910 			    (source->type == OBJT_DEFAULT ||
1911 			     source->type == OBJT_SWAP)) {
1912 				if (addref) {
1913 					vm_object_reference_locked(source);
1914 					vm_object_clear_flag(source,
1915 							     OBJ_ONEMAPPING);
1916 				}
1917 				vm_object_drop(source);
1918 				return;
1919 			}
1920 			vm_object_reference_locked(source);
1921 			vm_object_clear_flag(source, OBJ_ONEMAPPING);
1922 		} else {
1923 			vm_object_reference_quick(source);
1924 			vm_object_clear_flag(source, OBJ_ONEMAPPING);
1925 		}
1926 	}
1927 
1928 	/*
1929 	 * Allocate a new object with the given length.  The new object is
1930 	 * returned referenced, but we may have to add a second reference
1931 	 * (typically because the caller is about to clone a vm_map_entry).
1932 	 * If we add that second reference we must also clear OBJ_ONEMAPPING.
1933 	 *
1934 	 * The source object currently has an extra reference to prevent
1935 	 * collapses into it while we mess with its shadow list, which
1936 	 * we will remove later in this routine.
1937 	 *
1938 	 * The target object may require a second reference if asked for one
1939 	 * by the caller.
1940 	 */
1941 	result = vm_object_allocate(OBJT_DEFAULT, length);
1942 	if (result == NULL)
1943 		panic("vm_object_shadow: no object for shadowing");
1944 	vm_object_hold(result);
1945 	if (addref) {
1946 		vm_object_reference_locked(result);
1947 		vm_object_clear_flag(result, OBJ_ONEMAPPING);
1948 	}
1949 
1950 	/*
1951 	 * The new object shadows the source object.  Chain wait before
1952 	 * adjusting shadow_count or the shadow list to avoid races.
1953 	 *
1954 	 * Try to optimize the result object's page color when shadowing
1955 	 * in order to maintain page coloring consistency in the combined
1956 	 * shadowed object.
1957 	 *
1958 	 * The backing_object reference to source requires adding a ref to
1959 	 * source.  We simply inherit the ref from the original *objectp
1960 	 * (which we are replacing) so no additional refs need to be added.
1961 	 * (we must still clean up the extra ref we had to prevent collapse
1962 	 * races).
1963 	 *
1964 	 * SHADOWING IS NOT APPLICABLE TO OBJT_VNODE OBJECTS
1965 	 */
1966 	KKASSERT(result->backing_object == NULL);
1967 	result->backing_object = source;
1968 	if (source) {
1969 		if (useshadowlist) {
1970 			vm_object_chain_wait(source, 0);
1971 			LIST_INSERT_HEAD(&source->shadow_head,
1972 					 result, shadow_list);
1973 			source->shadow_count++;
1974 			atomic_add_int(&source->generation, 1);
1975 			vm_object_set_flag(result, OBJ_ONSHADOW);
1976 		}
1977 		/* cpu localization twist */
1978 		result->pg_color = vm_quickcolor();
1979 	}
1980 
1981 	/*
1982 	 * Adjust the return storage.  Drop the ref on source before
1983 	 * returning.
1984 	 */
1985 	result->backing_object_offset = *offset;
1986 	vm_object_drop(result);
1987 	*offset = 0;
1988 	if (source) {
1989 		if (useshadowlist) {
1990 			vm_object_deallocate_locked(source);
1991 			vm_object_drop(source);
1992 		} else {
1993 			vm_object_deallocate(source);
1994 		}
1995 	}
1996 
1997 	/*
1998 	 * Return the new object via *objectp.
1999 	 */
2000 	*objectp = result;
2001 }
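
/*
 * Illustrative sketch (hypothetical caller, not from an actual call
 * site): the in/out convention of vm_object_shadow().  The caller's
 * object pointer and offset are replaced in place, and with addref != 0
 * the returned object already carries the extra reference the caller
 * would otherwise have to add (racily) after the fact:
 *
 *	vm_object_t obj = old_object;		caller already owns this ref
 *	vm_ooffset_t off = old_offset;
 *
 *	vm_object_shadow(&obj, &off, length, 1);
 *
 * If a new shadow object was created, obj now points at it, the old
 * offset has been folded into obj->backing_object_offset, and off is 0.
 * If the old object could simply be reused, obj and off are unchanged.
 */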
2002 
2003 #define	OBSC_TEST_ALL_SHADOWED	0x0001
2004 #define	OBSC_COLLAPSE_NOWAIT	0x0002
2005 #define	OBSC_COLLAPSE_WAIT	0x0004
2006 
2007 static int vm_object_backing_scan_callback(vm_page_t p, void *data);
2008 
2009 /*
2010  * The caller must hold both object and backing_object.
2011  */
2012 static __inline int
2013 vm_object_backing_scan(vm_object_t object, vm_object_t backing_object, int op)
2014 {
2015 	struct rb_vm_page_scan_info info;
2016 	struct vm_object_hash *hash;
2017 
2018 	vm_object_assert_held(object);
2019 	vm_object_assert_held(backing_object);
2020 
2021 	KKASSERT(backing_object == object->backing_object);
2022 	info.backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
2023 
2024 	/*
2025 	 * Initial conditions
2026 	 */
2027 	if (op & OBSC_TEST_ALL_SHADOWED) {
2028 		/*
2029 		 * We do not want to have to test for the existence of
2030 		 * swap pages in the backing object.  XXX but with the
2031 		 * new swapper this would be pretty easy to do.
2032 		 *
2033 		 * XXX what about anonymous MAP_SHARED memory that hasn't
2034 		 * been ZFOD faulted yet?  If we do not test for this, the
2035 		 * shadow test may succeed! XXX
2036 		 */
2037 		if (backing_object->type != OBJT_DEFAULT)
2038 			return(0);
2039 	}
2040 	if (op & OBSC_COLLAPSE_WAIT) {
2041 		KKASSERT((backing_object->flags & OBJ_DEAD) == 0);
2042 		vm_object_set_flag(backing_object, OBJ_DEAD);
2043 
2044 		hash = VMOBJ_HASH(backing_object);
2045 		lwkt_gettoken(&hash->token);
2046 		TAILQ_REMOVE(&hash->list, backing_object, object_list);
2047 		lwkt_reltoken(&hash->token);
2048 	}
2049 
2050 	/*
2051 	 * Our scan.  We have to retry if a negative error code is returned,
2052 	 * otherwise 0 or 1 will be returned in info.error.  A value of 0
2053 	 * indicates that the scan had to be stopped because the parent does
2054 	 * not completely shadow the child.
2055 	 */
2056 	info.object = object;
2057 	info.backing_object = backing_object;
2058 	info.limit = op;
2059 	info.count = 0;
2060 	do {
2061 		info.error = 1;
2062 		vm_page_rb_tree_RB_SCAN(&backing_object->rb_memq, NULL,
2063 					vm_object_backing_scan_callback,
2064 					&info);
2065 	} while (info.error < 0);
2066 
2067 	return(info.error);
2068 }
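
/*
 * Illustrative sketch: how the return value of the scan above is used.
 * This mirrors the OBSC_TEST_ALL_SHADOWED call site in
 * vm_object_collapse() below; it is not additional functionality:
 *
 *	if (vm_object_backing_scan(object, backing_object,
 *				   OBSC_TEST_ALL_SHADOWED) == 0) {
 *		The parent does not completely shadow the backing
 *		object, so the bypass optimization must be given up.
 *	}
 */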
2069 
2070 /*
2071  * The caller must hold the object.
2072  */
2073 static int
2074 vm_object_backing_scan_callback(vm_page_t p, void *data)
2075 {
2076 	struct rb_vm_page_scan_info *info = data;
2077 	vm_object_t backing_object;
2078 	vm_object_t object;
2079 	vm_pindex_t pindex;
2080 	vm_pindex_t new_pindex;
2081 	vm_pindex_t backing_offset_index;
2082 	int op;
2083 
2084 	pindex = p->pindex;
2085 	new_pindex = pindex - info->backing_offset_index;
2086 	op = info->limit;
2087 	object = info->object;
2088 	backing_object = info->backing_object;
2089 	backing_offset_index = info->backing_offset_index;
2090 
2091 	if (op & OBSC_TEST_ALL_SHADOWED) {
2092 		vm_page_t pp;
2093 
2094 		/*
2095 		 * Ignore pages outside the parent object's range
2096 		 * and outside the parent object's mapping of the
2097 		 * backing object.
2098 		 *
2099 		 * note that we do not busy the backing object's
2100 		 * page.
2101 		 */
2102 		if (pindex < backing_offset_index ||
2103 		    new_pindex >= object->size
2104 		) {
2105 			return(0);
2106 		}
2107 
2108 		/*
2109 		 * See if the parent has the page or if the parent's
2110 		 * object pager has the page.  If the parent has the
2111 		 * page but the page is not valid, the parent's
2112 		 * object pager must have the page.
2113 		 *
2114 		 * If this fails, the parent does not completely shadow
2115 		 * the object and we might as well give up now.
2116 		 */
2117 		pp = vm_page_lookup(object, new_pindex);
2118 		if ((pp == NULL || pp->valid == 0) &&
2119 		    !vm_pager_has_page(object, new_pindex)
2120 		) {
2121 			info->error = 0;	/* problemo */
2122 			return(-1);		/* stop the scan */
2123 		}
2124 	}
2125 
2126 	/*
2127 	 * Check for busy page.  Note that we may have lost (p) if we
2128 	 * blocked above.
2129 	 */
2130 	if (op & (OBSC_COLLAPSE_WAIT | OBSC_COLLAPSE_NOWAIT)) {
2131 		vm_page_t pp;
2132 
2133 		if (vm_page_busy_try(p, TRUE)) {
2134 			if (op & OBSC_COLLAPSE_NOWAIT) {
2135 				return(0);
2136 			} else {
2137 				/*
2138 				 * If we slept, anything could have
2139 				 * happened.   Ask that the scan be restarted.
2140 				 *
2141 				 * Since the object is marked dead, the
2142 				 * backing offset should not have changed.
2143 				 */
2144 				vm_page_sleep_busy(p, TRUE, "vmocol");
2145 				info->error = -1;
2146 				return(-1);
2147 			}
2148 		}
2149 
2150 		/*
2151 		 * If (p) is no longer valid restart the scan.
2152 		 */
2153 		if (p->object != backing_object || p->pindex != pindex) {
2154 			kprintf("vm_object_backing_scan: Warning: page "
2155 				"%p ripped out from under us\n", p);
2156 			vm_page_wakeup(p);
2157 			info->error = -1;
2158 			return(-1);
2159 		}
2160 
2161 		if (op & OBSC_COLLAPSE_NOWAIT) {
2162 			if (p->valid == 0 ||
2163 			    p->wire_count ||
2164 			    (p->flags & PG_NEED_COMMIT)) {
2165 				vm_page_wakeup(p);
2166 				return(0);
2167 			}
2168 		} else {
2169 			/* XXX what if p->valid == 0, hold_count, etc? */
2170 		}
2171 
2172 		KASSERT(
2173 		    p->object == backing_object,
2174 		    ("vm_object_qcollapse(): object mismatch")
2175 		);
2176 
2177 		/*
2178 		 * Destroy any associated swap
2179 		 */
2180 		if (backing_object->type == OBJT_SWAP)
2181 			swap_pager_freespace(backing_object, p->pindex, 1);
2182 
2183 		if (
2184 		    p->pindex < backing_offset_index ||
2185 		    new_pindex >= object->size
2186 		) {
2187 			/*
2188 			 * The page is out of the parent object's range;
2189 			 * we can simply destroy it.
2190 			 */
2191 			vm_page_protect(p, VM_PROT_NONE);
2192 			vm_page_free(p);
2193 			return(0);
2194 		}
2195 
2196 		pp = vm_page_lookup(object, new_pindex);
2197 		if (pp != NULL || vm_pager_has_page(object, new_pindex)) {
2198 			 * The page already exists in the parent OR swap
2199 			 * exists for this location in the parent.  Destroy
2200 			 * the original page from the backing object.
2201 			 *
2202 			 * Leave the parent's page alone.
2203 			 * Leave the parent's page alone
2204 			 */
2205 			vm_page_protect(p, VM_PROT_NONE);
2206 			vm_page_free(p);
2207 			return(0);
2208 		}
2209 
2210 		/*
2211 		 * Page does not exist in parent, rename the
2212 		 * page from the backing object to the main object.
2213 		 *
2214 		 * If the page was mapped to a process, it can remain
2215 		 * mapped through the rename.
2216 		 */
2217 		if ((p->queue - p->pc) == PQ_CACHE)
2218 			vm_page_deactivate(p);
2219 
2220 		vm_page_rename(p, object, new_pindex);
2221 		vm_page_wakeup(p);
2222 		/* page automatically made dirty by rename */
2223 	}
2224 	return(0);
2225 }
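
/*
 * Summary of the callback's return convention (derived from the code
 * above): returning 0 continues the scan, returning -1 with info->error
 * set to -1 asks vm_object_backing_scan() to restart the scan after a
 * sleep or a race, and returning -1 with info->error set to 0 aborts
 * the scan because the parent does not completely shadow the backing
 * object.
 */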
2226 
2227 /*
2228  * This version of collapse allows the operation to occur earlier and
2229  * when paging_in_progress is true for an object...  This is not a complete
2230  * operation, but should plug 99.9% of the rest of the leaks.
2231  *
2232  * The caller must hold the object and backing_object and both must be
2233  * chainlocked.
2234  *
2235  * (only called from vm_object_collapse)
2236  */
2237 static void
2238 vm_object_qcollapse(vm_object_t object, vm_object_t backing_object)
2239 {
2240 	if (backing_object->ref_count == 1) {
2241 		atomic_add_int(&backing_object->ref_count, 2);
2242 #if defined(DEBUG_LOCKS)
2243 		debugvm_object_add(backing_object, "qcollapse", 1, 2);
2244 #endif
2245 		vm_object_backing_scan(object, backing_object,
2246 				       OBSC_COLLAPSE_NOWAIT);
2247 		atomic_add_int(&backing_object->ref_count, -2);
2248 #if defined(DEBUG_LOCKS)
2249 		debugvm_object_add(backing_object, "qcollapse", 2, -2);
2250 #endif
2251 	}
2252 }
2253 
2254 /*
2255  * Collapse an object with the object backing it.  Pages in the backing
2256  * object are moved into the parent, and the backing object is deallocated.
2257  * Any conflict is resolved in favor of the parent's existing pages.
2258  *
2259  * object must be held and chain-locked on call.
2260  *
2261  * The caller must have an extra ref on object to prevent a race from
2262  * destroying it during the collapse.
2263  */
2264 void
2265 vm_object_collapse(vm_object_t object, struct vm_object_dealloc_list **dlistp)
2266 {
2267 	struct vm_object_dealloc_list *dlist = NULL;
2268 	vm_object_t backing_object;
2269 
2270 	/*
2271 	 * Only one thread is attempting a collapse at any given moment.
2272 	 * Callers of this function check few restrictions on (object),
2273 	 * so reentrancy is likely.
2274 	 */
2275 	KKASSERT(object != NULL);
2276 	vm_object_assert_held(object);
2277 	KKASSERT(object->chainlk & (CHAINLK_MASK | CHAINLK_EXCL));
2278 
2279 	for (;;) {
2280 		vm_object_t bbobj;
2281 		int dodealloc;
2282 
2283 		/*
2284 		 * We can only collapse a DEFAULT/SWAP object with a
2285 		 * DEFAULT/SWAP object.
2286 		 */
2287 		if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP) {
2288 			backing_object = NULL;
2289 			break;
2290 		}
2291 
2292 		backing_object = object->backing_object;
2293 		if (backing_object == NULL)
2294 			break;
2295 		if (backing_object->type != OBJT_DEFAULT &&
2296 		    backing_object->type != OBJT_SWAP) {
2297 			backing_object = NULL;
2298 			break;
2299 		}
2300 
2301 		/*
2302 		 * Hold (token lock) the backing_object and retest conditions.
2303 		 */
2304 		vm_object_hold(backing_object);
2305 		if (backing_object != object->backing_object ||
2306 		    (backing_object->type != OBJT_DEFAULT &&
2307 		     backing_object->type != OBJT_SWAP)) {
2308 			vm_object_drop(backing_object);
2309 			continue;
2310 		}
2311 
2312 		/*
2313 		 * Chain-lock the backing object too because if we
2314 		 * successfully merge its pages into the top object we
2315 		 * will collapse backing_object->backing_object as the
2316 		 * new backing_object.  Re-check that it is still our
2317 		 * backing object.
2318 		 */
2319 		vm_object_chain_acquire(backing_object, 0);
2320 		if (backing_object != object->backing_object) {
2321 			vm_object_chain_release(backing_object);
2322 			vm_object_drop(backing_object);
2323 			continue;
2324 		}
2325 
2326 		/*
2327 		 * We check the backing object first, because it is most
2328 		 * likely not collapsible.
2329 		 */
2330 		if (backing_object->handle != NULL ||
2331 		    (backing_object->type != OBJT_DEFAULT &&
2332 		     backing_object->type != OBJT_SWAP) ||
2333 		    (backing_object->flags & OBJ_DEAD) ||
2334 		    object->handle != NULL ||
2335 		    (object->type != OBJT_DEFAULT &&
2336 		     object->type != OBJT_SWAP) ||
2337 		    (object->flags & OBJ_DEAD)) {
2338 			break;
2339 		}
2340 
2341 		/*
2342 		 * If paging is in progress we can't do a normal collapse.
2343 		 */
2344 		if (object->paging_in_progress != 0 ||
2345 		    backing_object->paging_in_progress != 0
2346 		) {
2347 			vm_object_qcollapse(object, backing_object);
2348 			break;
2349 		}
2350 
2351 		/*
2352 		 * We know that we can either collapse the backing object (if
2353 		 * the parent is the only reference to it) or (perhaps) have
2354 		 * the parent bypass the object if the parent happens to shadow
2355 		 * all the resident pages in the entire backing object.
2356 		 *
2357 		 * This is ignoring pager-backed pages such as swap pages.
2358 		 * vm_object_backing_scan fails the shadowing test in this
2359 		 * case.
2360 		 */
2361 		if (backing_object->ref_count == 1) {
2362 			/*
2363 			 * If there is exactly one reference to the backing
2364 			 * object, we can collapse it into the parent.
2365 			 */
2366 			KKASSERT(object->backing_object == backing_object);
2367 			vm_object_backing_scan(object, backing_object,
2368 					       OBSC_COLLAPSE_WAIT);
2369 
2370 			/*
2371 			 * Move the pager from backing_object to object.
2372 			 */
2373 			if (backing_object->type == OBJT_SWAP) {
2374 				vm_object_pip_add(backing_object, 1);
2375 
2376 				/*
2377 				 * scrap the paging_offset junk and do a
2378 				 * discrete copy.  This also removes major
2379 				 * assumptions about how the swap-pager
2380 				 * works from where it doesn't belong.  The
2381 				 * new swapper is able to optimize the
2382 				 * destroy-source case.
2383 				 */
2384 				vm_object_pip_add(object, 1);
2385 				swap_pager_copy(backing_object, object,
2386 				    OFF_TO_IDX(object->backing_object_offset),
2387 				    TRUE);
2388 				vm_object_pip_wakeup(object);
2389 				vm_object_pip_wakeup(backing_object);
2390 			}
2391 
2392 			/*
2393 			 * Object now shadows whatever backing_object did.
2394 			 * Remove object from backing_object's shadow_list.
2395 			 *
2396 			 * Removing object from backing_object's shadow list
2397 			 * requires releasing object, which we will do below.
2398 			 */
2399 			KKASSERT(object->backing_object == backing_object);
2400 			if (object->flags & OBJ_ONSHADOW) {
2401 				LIST_REMOVE(object, shadow_list);
2402 				backing_object->shadow_count--;
2403 				atomic_add_int(&backing_object->generation, 1);
2404 				vm_object_clear_flag(object, OBJ_ONSHADOW);
2405 			}
2406 
2407 			/*
2408 			 * backing_object->backing_object moves from within
2409 			 * backing_object to within object.
2410 			 *
2411 			 * OBJT_VNODE bbobjs should have empty shadow lists.
2412 			 */
2413 			while ((bbobj = backing_object->backing_object) != NULL) {
2414 				if (bbobj->type == OBJT_VNODE)
2415 					vm_object_hold_shared(bbobj);
2416 				else
2417 					vm_object_hold(bbobj);
2418 				if (bbobj == backing_object->backing_object)
2419 					break;
2420 				vm_object_drop(bbobj);
2421 			}
2422 
2423 			/*
2424 			 * We are removing backing_object from bbobj's
2425 			 * shadow list and adding object to bbobj's shadow
2426 			 * list, so the ref_count on bbobj is unchanged.
2427 			 */
2428 			if (bbobj) {
2429 				if (backing_object->flags & OBJ_ONSHADOW) {
2430 					/* not locked exclusively if vnode */
2431 					KKASSERT(bbobj->type != OBJT_VNODE);
2432 					LIST_REMOVE(backing_object,
2433 						    shadow_list);
2434 					bbobj->shadow_count--;
2435 					atomic_add_int(&bbobj->generation, 1);
2436 					vm_object_clear_flag(backing_object,
2437 							     OBJ_ONSHADOW);
2438 				}
2439 				backing_object->backing_object = NULL;
2440 			}
2441 			object->backing_object = bbobj;
2442 			if (bbobj) {
2443 				if (bbobj->type != OBJT_VNODE) {
2444 					LIST_INSERT_HEAD(&bbobj->shadow_head,
2445 							 object, shadow_list);
2446 					bbobj->shadow_count++;
2447 					atomic_add_int(&bbobj->generation, 1);
2448 					vm_object_set_flag(object,
2449 							   OBJ_ONSHADOW);
2450 				}
2451 			}
2452 
2453 			object->backing_object_offset +=
2454 				backing_object->backing_object_offset;
2455 
2456 			vm_object_drop(bbobj);
2457 
2458 			/*
2459 			 * Discard the old backing_object.  Nothing should be
2460 			 * able to ref it, other than a vm_map_split(),
2461 			 * and vm_map_split() will stall on our chain lock.
2462 			 * And we control the parent so it shouldn't be
2463 			 * possible for it to go away either.
2464 			 *
2465 			 * Since the backing object has no pages, no pager
2466 			 * left, and no object references within it, all
2467 			 * that is necessary is to dispose of it.
2468 			 */
2469 			KASSERT(backing_object->ref_count == 1,
2470 				("backing_object %p was somehow "
2471 				 "re-referenced during collapse!",
2472 				 backing_object));
2473 			KASSERT(RB_EMPTY(&backing_object->rb_memq),
2474 				("backing_object %p somehow has left "
2475 				 "over pages during collapse!",
2476 				 backing_object));
2477 
2478 			/*
2479 			 * The object can be destroyed.
2480 			 *
2481 			 * XXX just fall through and dodealloc instead
2482 			 *     of forcing destruction?
2483 			 */
2484 			atomic_add_int(&backing_object->ref_count, -1);
2485 #if defined(DEBUG_LOCKS)
2486 			debugvm_object_add(backing_object, "collapse", 1, -1);
2487 #endif
2488 			if ((backing_object->flags & OBJ_DEAD) == 0)
2489 				vm_object_terminate(backing_object);
2490 			object_collapses++;
2491 			dodealloc = 0;
2492 		} else {
2493 			/*
2494 			 * If we do not entirely shadow the backing object,
2495 			 * there is nothing we can do so we give up.
2496 			 */
2497 			if (vm_object_backing_scan(object, backing_object,
2498 						OBSC_TEST_ALL_SHADOWED) == 0) {
2499 				break;
2500 			}
2501 
2502 			/*
2503 			 * bbobj is backing_object->backing_object.  Since
2504 			 * object completely shadows backing_object we can
2505 			 * bypass it and become backed by bbobj instead.
2506 			 *
2507 			 * The shadow list for vnode backing objects is not
2508 			 * used and a shared hold is allowed.
2509 			 */
2510 			while ((bbobj = backing_object->backing_object) != NULL) {
2511 				if (bbobj->type == OBJT_VNODE)
2512 					vm_object_hold_shared(bbobj);
2513 				else
2514 					vm_object_hold(bbobj);
2515 				if (bbobj == backing_object->backing_object)
2516 					break;
2517 				vm_object_drop(bbobj);
2518 			}
2519 
2520 			/*
2521 			 * Make object shadow bbobj instead of backing_object.
2522 			 * Remove object from backing_object's shadow list.
2523 			 *
2524 			 * Deallocating backing_object will not remove
2525 			 * it, since its reference count is at least 2.
2526 			 *
2527 			 * Removing object from backing_object's shadow
2528 			 * list requires releasing a ref, which we do
2529 			 * below by setting dodealloc to 1.
2530 			 */
2531 			KKASSERT(object->backing_object == backing_object);
2532 			if (object->flags & OBJ_ONSHADOW) {
2533 				LIST_REMOVE(object, shadow_list);
2534 				backing_object->shadow_count--;
2535 				atomic_add_int(&backing_object->generation, 1);
2536 				vm_object_clear_flag(object, OBJ_ONSHADOW);
2537 			}
2538 
2539 			/*
2540 			 * Add a ref to bbobj, bbobj now shadows object.
2541 			 *
2542 			 * NOTE: backing_object->backing_object still points
2543 			 *	 to bbobj.  That relationship remains intact
2544 			 *	 because backing_object has > 1 ref, so
2545 			 *	 someone else is pointing to it (which is why
2546 			 *	 we can't collapse it into object and can
2547 			 *	 only handle the all-shadowed bypass case).
2548 			 */
2549 			if (bbobj) {
2550 				if (bbobj->type != OBJT_VNODE) {
2551 					vm_object_chain_wait(bbobj, 0);
2552 					vm_object_reference_locked(bbobj);
2553 					LIST_INSERT_HEAD(&bbobj->shadow_head,
2554 							 object, shadow_list);
2555 					bbobj->shadow_count++;
2556 					atomic_add_int(&bbobj->generation, 1);
2557 					vm_object_set_flag(object,
2558 							   OBJ_ONSHADOW);
2559 				} else {
2560 					vm_object_reference_quick(bbobj);
2561 				}
2562 				object->backing_object_offset +=
2563 					backing_object->backing_object_offset;
2564 				object->backing_object = bbobj;
2565 				vm_object_drop(bbobj);
2566 			} else {
2567 				object->backing_object = NULL;
2568 			}
2569 
2570 			/*
2571 			 * Drop the reference count on backing_object.  To
2572 			 * handle ref_count races properly we can't assume
2573 			 * that the ref_count is still at least 2 so we
2574 			 * have to actually call vm_object_deallocate()
2575 			 * (after clearing the chainlock).
2576 			 */
2577 			object_bypasses++;
2578 			dodealloc = 1;
2579 		}
2580 
2581 		/*
2582 		 * Ok, we want to loop on the new object->bbobj association,
2583 		 * possibly collapsing it further.  However if dodealloc is
2584 		 * non-zero we have to deallocate the backing_object which
2585 		 * itself can potentially undergo a collapse, creating a
2586 		 * recursion depth issue with the LWKT token subsystem.
2587 		 *
2588 		 * In the case where we must deallocate the backing_object
2589 		 * it is possible now that the backing_object has a single
2590 		 * shadow count on some other object (not represented here
2591 		 * as yet), since it no longer shadows us.  Thus when we
2592 		 * call vm_object_deallocate() it may attempt to collapse
2593 		 * itself into its remaining parent.
2594 		 */
2595 		if (dodealloc) {
2596 			struct vm_object_dealloc_list *dtmp;
2597 
2598 			vm_object_chain_release(backing_object);
2599 			vm_object_unlock(backing_object);
2600 			/* backing_object remains held */
2601 
2602 			/*
2603 			 * Auto-deallocation list for caller convenience.
2604 			 */
2605 			if (dlistp == NULL)
2606 				dlistp = &dlist;
2607 
2608 			dtmp = kmalloc(sizeof(*dtmp), M_TEMP, M_WAITOK);
2609 			dtmp->object = backing_object;
2610 			dtmp->next = *dlistp;
2611 			*dlistp = dtmp;
2612 		} else {
2613 			vm_object_chain_release(backing_object);
2614 			vm_object_drop(backing_object);
2615 		}
2616 		/* backing_object = NULL; not needed */
2617 		/* loop */
2618 	}
2619 
2620 	/*
2621 	 * Clean up any leftover backing_object.
2622 	 */
2623 	if (backing_object) {
2624 		vm_object_chain_release(backing_object);
2625 		vm_object_drop(backing_object);
2626 	}
2627 
2628 	/*
2629 	 * Clean up any auto-deallocation list.  This is a convenience
2630 	 * for top-level callers so they don't have to pass &dlist.
2631 	 * Do not clean up any caller-passed dlistp, the caller will
2632 	 * Do not clean up any caller-passed dlistp; the caller will
2633 	 */
2634 	if (dlist)
2635 		vm_object_deallocate_list(&dlist);
2636 
2637 }
2638 
2639 /*
2640  * vm_object_collapse() may collect additional objects in need of
2641  * deallocation.  This routine deallocates these objects.  The
2642  * deallocation itself can trigger additional collapses (which the
2643  * deallocate function takes care of).  This procedure is used to
2644  * reduce procedural recursion since these vm_object shadow chains
2645  * can become quite long.
2646  */
2647 void
2648 vm_object_deallocate_list(struct vm_object_dealloc_list **dlistp)
2649 {
2650 	struct vm_object_dealloc_list *dlist;
2651 
2652 	while ((dlist = *dlistp) != NULL) {
2653 		*dlistp = dlist->next;
2654 		vm_object_lock(dlist->object);
2655 		vm_object_deallocate_locked(dlist->object);
2656 		vm_object_drop(dlist->object);
2657 		kfree(dlist, M_TEMP);
2658 	}
2659 }
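
/*
 * Illustrative sketch (assumed top-level caller pattern, not taken from
 * an actual call site): collecting deferred deallocations while the
 * chain lock is held and flushing them afterwards.  The caller is
 * assumed to already own a reference on object, as required by
 * vm_object_collapse():
 *
 *	struct vm_object_dealloc_list *dlist = NULL;
 *
 *	vm_object_hold(object);
 *	vm_object_chain_acquire(object, 0);
 *	vm_object_collapse(object, &dlist);
 *	vm_object_chain_release(object);
 *	vm_object_drop(object);
 *	vm_object_deallocate_list(&dlist);
 */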
2660 
2661 /*
2662  * Removes all physical pages in the specified object range from the
2663  * object's list of pages.
2664  *
2665  * No requirements.
2666  */
2667 static int vm_object_page_remove_callback(vm_page_t p, void *data);
2668 
2669 void
2670 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
2671 		      boolean_t clean_only)
2672 {
2673 	struct rb_vm_page_scan_info info;
2674 	int all;
2675 
2676 	/*
2677 	 * Degenerate cases and assertions
2678 	 */
2679 	vm_object_hold(object);
2680 	if (object == NULL ||
2681 	    (object->resident_page_count == 0 && object->swblock_count == 0)) {
2682 		vm_object_drop(object);
2683 		return;
2684 	}
2685 	KASSERT(object->type != OBJT_PHYS,
2686 		("attempt to remove pages from a physical object"));
2687 
2688 	/*
2689 	 * Indicate that paging is occurring on the object.
2690 	 */
2691 	vm_object_pip_add(object, 1);
2692 
2693 	/*
2694 	 * Figure out the actual removal range and whether we are removing
2695 	 * the entire contents of the object or not.  If removing the entire
2696 	 * contents, be sure to get all pages, even those that might be
2697 	 * beyond the end of the object.
2698 	 */
2699 	info.object = object;
2700 	info.start_pindex = start;
2701 	if (end == 0)
2702 		info.end_pindex = (vm_pindex_t)-1;
2703 	else
2704 		info.end_pindex = end - 1;
2705 	info.limit = clean_only;
2706 	info.count = 0;
2707 	all = (start == 0 && info.end_pindex >= object->size - 1);
2708 
2709 	/*
2710 	 * Loop until we are sure we have gotten them all.
2711 	 */
2712 	do {
2713 		info.error = 0;
2714 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
2715 					vm_object_page_remove_callback, &info);
2716 	} while (info.error);
2717 
2718 	/*
2719 	 * Remove any related swap if throwing away pages, or for
2720 	 * non-swap objects (the swap is a clean copy in that case).
2721 	 */
2722 	if (object->type != OBJT_SWAP || clean_only == FALSE) {
2723 		if (all)
2724 			swap_pager_freespace_all(object);
2725 		else
2726 			swap_pager_freespace(object, info.start_pindex,
2727 			     info.end_pindex - info.start_pindex + 1);
2728 	}
2729 
2730 	/*
2731 	 * Cleanup
2732 	 */
2733 	vm_object_pip_wakeup(object);
2734 	vm_object_drop(object);
2735 }
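
/*
 * Illustrative sketch of the argument conventions above (hypothetical
 * calls, not from an actual call site): end == 0 means "through the end
 * of the object", and clean_only == TRUE preserves dirty and
 * PG_NEED_COMMIT pages (and any swap backing a swap object):
 *
 *	vm_object_page_remove(object, 0, 0, FALSE);
 *		throw away every page and any related swap
 *
 *	vm_object_page_remove(object, base, base + npages, TRUE);
 *		free only the clean pages in [base, base + npages)
 */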
2736 
2737 /*
2738  * The caller must hold the object.
2739  *
2740  * NOTE: User yields are allowed when removing more than one page, but not
2741  *	 allowed if only removing one page (the path for single page removals
2742  *	 might hold a spinlock).
2743  */
2744 static int
2745 vm_object_page_remove_callback(vm_page_t p, void *data)
2746 {
2747 	struct rb_vm_page_scan_info *info = data;
2748 
2749 	if (info->object != p->object ||
2750 	    p->pindex < info->start_pindex ||
2751 	    p->pindex > info->end_pindex) {
2752 		kprintf("vm_object_page_remove_callbackA: obj/pg race %p/%p\n",
2753 			info->object, p);
2754 		return(0);
2755 	}
2756 	if (vm_page_busy_try(p, TRUE)) {
2757 		vm_page_sleep_busy(p, TRUE, "vmopar");
2758 		info->error = 1;
2759 		return(0);
2760 	}
2761 	if (info->object != p->object) {
2762 		/* this should never happen */
2763 		kprintf("vm_object_page_remove_callbackB: obj/pg race %p/%p\n",
2764 			info->object, p);
2765 		vm_page_wakeup(p);
2766 		return(0);
2767 	}
2768 
2769 	/*
2770 	 * Wired pages cannot be destroyed, but they can be invalidated
2771 	 * and we do so if clean_only (limit) is not set.
2772 	 *
2773 	 * WARNING!  The page may be wired due to being part of a buffer
2774 	 *	     cache buffer, and the buffer might be marked B_CACHE.
2775 	 *	     This is fine as part of a truncation but VFSs must be
2776 	 *	     sure to fix the buffer up when re-extending the file.
2777 	 *
2778 	 * NOTE!     PG_NEED_COMMIT is ignored.
2779 	 */
2780 	if (p->wire_count != 0) {
2781 		vm_page_protect(p, VM_PROT_NONE);
2782 		if (info->limit == 0)
2783 			p->valid = 0;
2784 		vm_page_wakeup(p);
2785 		goto done;
2786 	}
2787 
2788 	/*
2789 	 * limit is our clean_only flag.  If set and the page is dirty or
2790 	 * requires a commit, do not free it.  If set and the page is being
2791 	 * held by someone, do not free it.
2792 	 */
2793 	if (info->limit && p->valid) {
2794 		vm_page_test_dirty(p);
2795 		if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
2796 			vm_page_wakeup(p);
2797 			goto done;
2798 		}
2799 	}
2800 
2801 	/*
2802 	 * Destroy the page
2803 	 */
2804 	vm_page_protect(p, VM_PROT_NONE);
2805 	vm_page_free(p);
2806 
2807 	/*
2808 	 * Must be at end to avoid SMP races, caller holds object token
2809 	 */
2810 done:
2811 	if ((++info->count & 63) == 0)
2812 		lwkt_user_yield();
2813 
2814 	return(0);
2815 }
2816 
2817 /*
2818  * Try to extend prev_object into an adjoining region of virtual
2819  * memory, return TRUE on success.
2820  *
2821  * The caller does not need to hold (prev_object) but must have a stable
2822  * pointer to it (typically by holding the vm_map locked).
2823  *
2824  * This function only works for anonymous memory objects which either
2825  * have (a) one reference or (b) we are extending the object's size.
2826  * Otherwise the related VM pages we want to use for the object might
2827  * be in use by another mapping.
2828  */
2829 boolean_t
2830 vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
2831 		   vm_size_t prev_size, vm_size_t next_size)
2832 {
2833 	vm_pindex_t next_pindex;
2834 
2835 	if (prev_object == NULL)
2836 		return (TRUE);
2837 
2838 	vm_object_hold(prev_object);
2839 
2840 	if (prev_object->type != OBJT_DEFAULT &&
2841 	    prev_object->type != OBJT_SWAP) {
2842 		vm_object_drop(prev_object);
2843 		return (FALSE);
2844 	}
2845 
2846 	/*
2847 	 * Try to collapse the object first
2848 	 */
2849 	vm_object_chain_acquire(prev_object, 0);
2850 	vm_object_collapse(prev_object, NULL);
2851 
2852 	/*
2853 	 * We can't coalesce if we shadow another object (figuring out the
2854 	 * relationships becomes too complex).
2855 	 */
2856 	if (prev_object->backing_object != NULL) {
2857 		vm_object_chain_release(prev_object);
2858 		vm_object_drop(prev_object);
2859 		return (FALSE);
2860 	}
2861 
2862 	prev_size >>= PAGE_SHIFT;
2863 	next_size >>= PAGE_SHIFT;
2864 	next_pindex = prev_pindex + prev_size;
2865 
2866 	/*
2867 	 * We can't coalesce if the object has more than one reference
2868 	 * unless we are extending it into newly minted space.
2869 	 */
2870 	if (prev_object->ref_count > 1 &&
2871 	    prev_object->size != next_pindex) {
2872 		vm_object_chain_release(prev_object);
2873 		vm_object_drop(prev_object);
2874 		return (FALSE);
2875 	}
2876 
2877 	/*
2878 	 * Remove any pages that may still be in the object from a previous
2879 	 * deallocation.
2880 	 */
2881 	if (next_pindex < prev_object->size) {
2882 		vm_object_page_remove(prev_object,
2883 				      next_pindex,
2884 				      next_pindex + next_size, FALSE);
2885 		if (prev_object->type == OBJT_SWAP)
2886 			swap_pager_freespace(prev_object,
2887 					     next_pindex, next_size);
2888 	}
2889 
2890 	/*
2891 	 * Extend the object if necessary.
2892 	 */
2893 	if (next_pindex + next_size > prev_object->size)
2894 		prev_object->size = next_pindex + next_size;
2895 	vm_object_chain_release(prev_object);
2896 	vm_object_drop(prev_object);
2897 
2898 	return (TRUE);
2899 }
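
/*
 * Illustrative sketch (hypothetical caller; sizes are in bytes, as the
 * PAGE_SHIFT conversions above imply): trying to grow an anonymous
 * mapping in place before falling back to allocating a new object:
 *
 *	if (vm_object_coalesce(prev_object, prev_pindex,
 *			       prev_size, grow_size)) {
 *		reuse prev_object; the new range begins at page index
 *		prev_pindex + OFF_TO_IDX(prev_size)
 *	} else {
 *		allocate a fresh OBJT_DEFAULT object for the new range
 *	}
 */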
2900 
2901 /*
2902  * Make the object writable and flag it as possibly being dirty.
2903  *
2904  * The object might not be held (or might be held but held shared),
2905  * the related vnode is probably not held either.  Object and vnode are
2906  * stable by virtue of the vm_page busied by the caller preventing
2907  * destruction.
2908  *
2909  * If the related mount is flagged MNTK_THR_SYNC we need to call
2910  * vsetobjdirty().  Filesystems using this option usually shortcut
2911  * synchronization by only scanning the syncer list.
2912  */
2913 void
2914 vm_object_set_writeable_dirty(vm_object_t object)
2915 {
2916 	struct vnode *vp;
2917 
2918 	/*vm_object_assert_held(object);*/
2919 	/*
2920 	 * Avoid contention in vm fault path by checking the state before
2921 	 * issuing an atomic op on it.
2922 	 */
2923 	if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) !=
2924 	    (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) {
2925 		vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
2926 	}
2927 	if (object->type == OBJT_VNODE &&
2928 	    (vp = (struct vnode *)object->handle) != NULL) {
2929 		if ((vp->v_flag & VOBJDIRTY) == 0) {
2930 			if (vp->v_mount &&
2931 			    (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) {
2932 				/*
2933 				 * New style THR_SYNC places vnodes on the
2934 				 * syncer list more deterministically.
2935 				 */
2936 				vsetobjdirty(vp);
2937 			} else {
2938 				/*
2939 				 * Old style scan would not necessarily place
2940 				 * a vnode on the syncer list when possibly
2941 				 * modified via mmap.
2942 				 */
2943 				vsetflags(vp, VOBJDIRTY);
2944 			}
2945 		}
2946 	}
2947 }
2948 
2949 #include "opt_ddb.h"
2950 #ifdef DDB
2951 #include <sys/cons.h>
2952 
2953 #include <ddb/ddb.h>
2954 
2955 static int	_vm_object_in_map (vm_map_t map, vm_object_t object,
2956 				       vm_map_entry_t entry);
2957 static int	vm_object_in_map (vm_object_t object);
2958 
2959 /*
2960  * The caller must hold the object.
2961  */
2962 static int
2963 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
2964 {
2965 	vm_map_t tmpm;
2966 	vm_map_entry_t tmpe;
2967 	vm_object_t obj, nobj;
2968 	int entcount;
2969 
2970 	if (map == 0)
2971 		return 0;
2972 	if (entry == 0) {
2973 		tmpe = map->header.next;
2974 		entcount = map->nentries;
2975 		while (entcount-- && (tmpe != &map->header)) {
2976 			if( _vm_object_in_map(map, object, tmpe)) {
2977 				return 1;
2978 			}
2979 			tmpe = tmpe->next;
2980 		}
2981 		return (0);
2982 	}
2983 	switch(entry->maptype) {
2984 	case VM_MAPTYPE_SUBMAP:
2985 		tmpm = entry->object.sub_map;
2986 		tmpe = tmpm->header.next;
2987 		entcount = tmpm->nentries;
2988 		while (entcount-- && tmpe != &tmpm->header) {
2989 			if( _vm_object_in_map(tmpm, object, tmpe)) {
2990 				return 1;
2991 			}
2992 			tmpe = tmpe->next;
2993 		}
2994 		break;
2995 	case VM_MAPTYPE_NORMAL:
2996 	case VM_MAPTYPE_VPAGETABLE:
2997 		obj = entry->object.vm_object;
2998 		while (obj) {
2999 			if (obj == object) {
3000 				if (obj != entry->object.vm_object)
3001 					vm_object_drop(obj);
3002 				return 1;
3003 			}
3004 			while ((nobj = obj->backing_object) != NULL) {
3005 				vm_object_hold(nobj);
3006 				if (nobj == obj->backing_object)
3007 					break;
3008 				vm_object_drop(nobj);
3009 			}
3010 			if (obj != entry->object.vm_object) {
3011 				if (nobj)
3012 					vm_object_lock_swap();
3013 				vm_object_drop(obj);
3014 			}
3015 			obj = nobj;
3016 		}
3017 		break;
3018 	default:
3019 		break;
3020 	}
3021 	return 0;
3022 }
3023 
3024 static int vm_object_in_map_callback(struct proc *p, void *data);
3025 
3026 struct vm_object_in_map_info {
3027 	vm_object_t object;
3028 	int rv;
3029 };
3030 
3031 /*
3032  * Debugging only
3033  */
3034 static int
3035 vm_object_in_map(vm_object_t object)
3036 {
3037 	struct vm_object_in_map_info info;
3038 
3039 	info.rv = 0;
3040 	info.object = object;
3041 
3042 	allproc_scan(vm_object_in_map_callback, &info, 0);
3043 	if (info.rv)
3044 		return 1;
3045 	if( _vm_object_in_map(&kernel_map, object, 0))
3046 		return 1;
3047 	if( _vm_object_in_map(&pager_map, object, 0))
3048 		return 1;
3049 	if( _vm_object_in_map(&buffer_map, object, 0))
3050 		return 1;
3051 	return 0;
3052 }
3053 
3054 /*
3055  * Debugging only
3056  */
3057 static int
3058 vm_object_in_map_callback(struct proc *p, void *data)
3059 {
3060 	struct vm_object_in_map_info *info = data;
3061 
3062 	if (p->p_vmspace) {
3063 		if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) {
3064 			info->rv = 1;
3065 			return -1;
3066 		}
3067 	}
3068 	return (0);
3069 }
3070 
3071 DB_SHOW_COMMAND(vmochk, vm_object_check)
3072 {
3073 	struct vm_object_hash *hash;
3074 	vm_object_t object;
3075 	int n;
3076 
3077 	/*
3078 	 * make sure that internal objs are in a map somewhere
3079 	 * and none have zero ref counts.
3080 	 */
3081 	for (n = 0; n < VMOBJ_HSIZE; ++n) {
3082 		hash = &vm_object_hash[n];
3083 		for (object = TAILQ_FIRST(&hash->list);
3084 				object != NULL;
3085 				object = TAILQ_NEXT(object, object_list)) {
3086 			if (object->type == OBJT_MARKER)
3087 				continue;
3088 			if (object->handle != NULL ||
3089 			    (object->type != OBJT_DEFAULT &&
3090 			     object->type != OBJT_SWAP)) {
3091 				continue;
3092 			}
3093 			if (object->ref_count == 0) {
3094 				db_printf("vmochk: internal obj has "
3095 					  "zero ref count: %ld\n",
3096 					  (long)object->size);
3097 			}
3098 			if (vm_object_in_map(object))
3099 				continue;
3100 			db_printf("vmochk: internal obj is not in a map: "
3101 				  "ref: %d, size: %lu: 0x%lx, "
3102 				  "backing_object: %p\n",
3103 				  object->ref_count, (u_long)object->size,
3104 				  (u_long)object->size,
3105 				  (void *)object->backing_object);
3106 		}
3107 	}
3108 }
3109 
3110 /*
3111  * Debugging only
3112  */
3113 DB_SHOW_COMMAND(object, vm_object_print_static)
3114 {
3115 	/* XXX convert args. */
3116 	vm_object_t object = (vm_object_t)addr;
3117 	boolean_t full = have_addr;
3118 
3119 	vm_page_t p;
3120 
3121 	/* XXX count is an (unused) arg.  Avoid shadowing it. */
3122 #define	count	was_count
3123 
3124 	int count;
3125 
3126 	if (object == NULL)
3127 		return;
3128 
3129 	db_iprintf(
3130 	    "Object %p: type=%d, size=0x%lx, res=%ld, ref=%d, flags=0x%x\n",
3131 	    object, (int)object->type, (u_long)object->size,
3132 	    object->resident_page_count, object->ref_count, object->flags);
3133 	/*
3134 	 * XXX no %qd in kernel.  Truncate object->backing_object_offset.
3135 	 */
3136 	db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%lx\n",
3137 	    object->shadow_count,
3138 	    object->backing_object ? object->backing_object->ref_count : 0,
3139 	    object->backing_object, (long)object->backing_object_offset);
3140 
3141 	if (!full)
3142 		return;
3143 
3144 	db_indent += 2;
3145 	count = 0;
3146 	RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) {
3147 		if (count == 0)
3148 			db_iprintf("memory:=");
3149 		else if (count == 6) {
3150 			db_printf("\n");
3151 			db_iprintf(" ...");
3152 			count = 0;
3153 		} else
3154 			db_printf(",");
3155 		count++;
3156 
3157 		db_printf("(off=0x%lx,page=0x%lx)",
3158 		    (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
3159 	}
3160 	if (count != 0)
3161 		db_printf("\n");
3162 	db_indent -= 2;
3163 }
3164 
3165 /* XXX. */
3166 #undef count
3167 
3168 /*
3169  * XXX need this non-static entry for calling from vm_map_print.
3170  *
3171  * Debugging only
3172  */
3173 void
3174 vm_object_print(/* db_expr_t */ long addr,
3175 		boolean_t have_addr,
3176 		/* db_expr_t */ long count,
3177 		char *modif)
3178 {
3179 	vm_object_print_static(addr, have_addr, count, modif);
3180 }
3181 
3182 /*
3183  * Debugging only
3184  */
3185 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
3186 {
3187 	struct vm_object_hash *hash;
3188 	vm_object_t object;
3189 	int nl = 0;
3190 	int c;
3191 	int n;
3192 
3193 	for (n = 0; n < VMOBJ_HSIZE; ++n) {
3194 		hash = &vm_object_hash[n];
3195 		for (object = TAILQ_FIRST(&hash->list);
3196 				object != NULL;
3197 				object = TAILQ_NEXT(object, object_list)) {
3198 			vm_pindex_t idx, fidx;
3199 			vm_pindex_t osize;
3200 			vm_paddr_t pa = -1, padiff;
3201 			int rcount;
3202 			vm_page_t m;
3203 
3204 			if (object->type == OBJT_MARKER)
3205 				continue;
3206 			db_printf("new object: %p\n", (void *)object);
3207 			if ( nl > 18) {
3208 				c = cngetc();
3209 				if (c != ' ')
3210 					return;
3211 				nl = 0;
3212 			}
3213 			nl++;
3214 			rcount = 0;
3215 			fidx = 0;
3216 			osize = object->size;
3217 			if (osize > 128)
3218 				osize = 128;
3219 			for (idx = 0; idx < osize; idx++) {
3220 				m = vm_page_lookup(object, idx);
3221 				if (m == NULL) {
3222 					if (rcount) {
3223 						db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
3224 							(long)fidx, rcount, (long)pa);
3225 						if ( nl > 18) {
3226 							c = cngetc();
3227 							if (c != ' ')
3228 								return;
3229 							nl = 0;
3230 						}
3231 						nl++;
3232 						rcount = 0;
3233 					}
3234 					continue;
3235 				}
3236 
3237 				if (rcount &&
3238 					(VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
3239 					++rcount;
3240 					continue;
3241 				}
3242 				if (rcount) {
3243 					padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m);
3244 					padiff >>= PAGE_SHIFT;
3245 					padiff &= PQ_L2_MASK;
3246 					if (padiff == 0) {
3247 						pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE;
3248 						++rcount;
3249 						continue;
3250 					}
3251 					db_printf(" index(%ld)run(%d)pa(0x%lx)",
3252 						(long)fidx, rcount, (long)pa);
3253 					db_printf("pd(%ld)\n", (long)padiff);
3254 					if ( nl > 18) {
3255 						c = cngetc();
3256 						if (c != ' ')
3257 							return;
3258 						nl = 0;
3259 					}
3260 					nl++;
3261 				}
3262 				fidx = idx;
3263 				pa = VM_PAGE_TO_PHYS(m);
3264 				rcount = 1;
3265 			}
3266 			if (rcount) {
3267 				db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
3268 					(long)fidx, rcount, (long)pa);
3269 				if ( nl > 18) {
3270 					c = cngetc();
3271 					if (c != ' ')
3272 						return;
3273 					nl = 0;
3274 				}
3275 				nl++;
3276 			}
3277 		}
3278 	}
3279 }
3280 #endif /* DDB */
3281