xref: /dragonfly/sys/vm/vm_object.c (revision 49837aef)
1 /*
2  * Copyright (c) 1991, 1993, 2013
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * The Mach Operating System project at Carnegie-Mellon University.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	from: @(#)vm_object.c	8.5 (Berkeley) 3/22/94
33  *
34  *
35  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
36  * All rights reserved.
37  *
38  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
39  *
40  * Permission to use, copy, modify and distribute this software and
41  * its documentation is hereby granted, provided that both the copyright
42  * notice and this permission notice appear in all copies of the
43  * software, derivative works or modified versions, and any portions
44  * thereof, and that both notices appear in supporting documentation.
45  *
46  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
49  *
50  * Carnegie Mellon requests users of this software to return to
51  *
52  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
53  *  School of Computer Science
54  *  Carnegie Mellon University
55  *  Pittsburgh PA 15213-3890
56  *
57  * any improvements or extensions that they make and grant Carnegie the
58  * rights to redistribute these changes.
59  *
60  * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $
61  */
62 
63 /*
64  *	Virtual memory object module.
65  */
66 
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/proc.h>		/* for curproc, pageproc */
70 #include <sys/thread.h>
71 #include <sys/vnode.h>
72 #include <sys/vmmeter.h>
73 #include <sys/mman.h>
74 #include <sys/mount.h>
75 #include <sys/kernel.h>
76 #include <sys/malloc.h>
77 #include <sys/sysctl.h>
78 #include <sys/refcount.h>
79 
80 #include <vm/vm.h>
81 #include <vm/vm_param.h>
82 #include <vm/pmap.h>
83 #include <vm/vm_map.h>
84 #include <vm/vm_object.h>
85 #include <vm/vm_page.h>
86 #include <vm/vm_pageout.h>
87 #include <vm/vm_pager.h>
88 #include <vm/swap_pager.h>
89 #include <vm/vm_kern.h>
90 #include <vm/vm_extern.h>
91 #include <vm/vm_zone.h>
92 
93 #include <vm/vm_page2.h>
94 
95 #include <machine/specialreg.h>
96 
97 #define EASY_SCAN_FACTOR	8
98 
99 static void	vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
100 					     int pagerflags);
101 static void	vm_object_lock_init(vm_object_t);
102 
103 /*
104  *	Virtual memory objects maintain the actual data
105  *	associated with allocated virtual memory.  A given
106  *	page of memory exists within exactly one object.
107  *
108  *	An object is only deallocated when all "references"
109  *	are given up.  Only one "reference" to a given
110  *	region of an object should be writeable.
111  *
112  *	Associated with each object is a list of all resident
113  *	memory pages belonging to that object; this list is
114  *	maintained by the "vm_page" module, and locked by the object's
115  *	lock.
116  *
117  *	Each object also records a "pager" routine which is
118  *	used to retrieve (and store) pages to the proper backing
119  *	storage.  In addition, objects may be backed by other
120  *	objects from which they were virtual-copied.
121  *
122  *	The only items within the object structure which are
123  *	modified after time of creation are:
124  *		reference count		locked by object's lock
125  *		pager routine		locked by object's lock
126  *
127  */
128 
129 struct vm_object kernel_object;
130 
131 struct vm_object_hash vm_object_hash[VMOBJ_HSIZE];
132 
133 MALLOC_DEFINE(M_VM_OBJECT, "vm_object", "vm_object structures");
134 
135 #define VMOBJ_HASH_PRIME1	66555444443333333ULL
136 #define VMOBJ_HASH_PRIME2	989042931893ULL
137 
138 int vm_object_debug;
139 SYSCTL_INT(_vm, OID_AUTO, object_debug, CTLFLAG_RW, &vm_object_debug, 0, "");
140 
141 static __inline
142 struct vm_object_hash *
143 vmobj_hash(vm_object_t obj)
144 {
145 	uintptr_t hash1;
146 	uintptr_t hash2;
147 
148 	hash1 = (uintptr_t)obj + ((uintptr_t)obj >> 18);
149 	hash1 %= VMOBJ_HASH_PRIME1;
150 	hash2 = ((uintptr_t)obj >> 8) + ((uintptr_t)obj >> 24);
151 	hash2 %= VMOBJ_HASH_PRIME2;
152 	return (&vm_object_hash[(hash1 ^ hash2) & VMOBJ_HMASK]);
153 }
154 
155 #if defined(DEBUG_LOCKS)
156 
157 #define vm_object_vndeallocate(obj, vpp)	\
158                 debugvm_object_vndeallocate(obj, vpp, __FILE__, __LINE__)
159 
160 /*
161  * Debug helper to track hold/drop/ref/deallocate calls.
162  */
163 static void
164 debugvm_object_add(vm_object_t obj, char *file, int line, int addrem)
165 {
166 	int i;
167 
168 	i = atomic_fetchadd_int(&obj->debug_index, 1);
169 	i = i & (VMOBJ_DEBUG_ARRAY_SIZE - 1);
170 	ksnprintf(obj->debug_hold_thrs[i],
171 		  sizeof(obj->debug_hold_thrs[i]),
172 		  "%c%d:(%d):%s",
173 		  (addrem == -1 ? '-' : (addrem == 1 ? '+' : '=')),
174 		  (curthread->td_proc ? curthread->td_proc->p_pid : -1),
175 		  obj->ref_count,
176 		  curthread->td_comm);
177 	obj->debug_hold_file[i] = file;
178 	obj->debug_hold_line[i] = line;
179 #if 0
180 	/* Uncomment for debugging obj refs/derefs in reproducible cases */
181 	if (strcmp(curthread->td_comm, "sshd") == 0) {
182 		kprintf("%d %p refs=%d ar=%d file: %s/%d\n",
183 			(curthread->td_proc ? curthread->td_proc->p_pid : -1),
184 			obj, obj->ref_count, addrem, file, line);
185 	}
186 #endif
187 }
188 
189 #endif
190 
191 /*
192  * Misc low level routines
193  */
194 static void
195 vm_object_lock_init(vm_object_t obj)
196 {
197 #if defined(DEBUG_LOCKS)
198 	int i;
199 
200 	obj->debug_index = 0;
201 	for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) {
202 		obj->debug_hold_thrs[i][0] = 0;
203 		obj->debug_hold_file[i] = NULL;
204 		obj->debug_hold_line[i] = 0;
205 	}
206 #endif
207 }
208 
209 void
210 vm_object_lock_swap(void)
211 {
212 	lwkt_token_swap();
213 }
214 
215 void
216 vm_object_lock(vm_object_t obj)
217 {
218 	lwkt_gettoken(&obj->token);
219 }
220 
221 /*
222  * Returns TRUE on success
223  */
224 static int
225 vm_object_lock_try(vm_object_t obj)
226 {
227 	return(lwkt_trytoken(&obj->token));
228 }
229 
230 void
231 vm_object_lock_shared(vm_object_t obj)
232 {
233 	lwkt_gettoken_shared(&obj->token);
234 }
235 
236 void
237 vm_object_unlock(vm_object_t obj)
238 {
239 	lwkt_reltoken(&obj->token);
240 }
241 
242 void
243 vm_object_upgrade(vm_object_t obj)
244 {
245 	lwkt_reltoken(&obj->token);
246 	lwkt_gettoken(&obj->token);
247 }
248 
249 void
250 vm_object_downgrade(vm_object_t obj)
251 {
252 	lwkt_reltoken(&obj->token);
253 	lwkt_gettoken_shared(&obj->token);
254 }
255 
256 static __inline void
257 vm_object_assert_held(vm_object_t obj)
258 {
259 	ASSERT_LWKT_TOKEN_HELD(&obj->token);
260 }
261 
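/*
 * Return a quick, cpu-localized page color hint.  The hint is derived
 * from the current thread pointer plus a per-cpu rotor (gd_quick_color)
 * which is advanced by PQ_PRIME2 on each call so successive allocations
 * tend to land on different colors.
 */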
262 int
263 vm_quickcolor(void)
264 {
265 	globaldata_t gd = mycpu;
266 	int pg_color;
267 
268 	pg_color = (int)(intptr_t)gd->gd_curthread >> 10;
269 	pg_color += gd->gd_quick_color;
270 	gd->gd_quick_color += PQ_PRIME2;
271 
272 	return pg_color;
273 }
274 
275 void
276 VMOBJDEBUG(vm_object_hold)(vm_object_t obj VMOBJDBARGS)
277 {
278 	KKASSERT(obj != NULL);
279 
280 	/*
281 	 * Object must be held (object allocation is stable due to the caller's
282 	 * context, typically already holding the token on a parent object)
283 	 * prior to potentially blocking on the lock, otherwise the object
284 	 * can get ripped away from us.
285 	 */
286 	refcount_acquire(&obj->hold_count);
287 	vm_object_lock(obj);
288 
289 #if defined(DEBUG_LOCKS)
290 	debugvm_object_add(obj, file, line, 1);
291 #endif
292 }
293 
294 int
295 VMOBJDEBUG(vm_object_hold_try)(vm_object_t obj VMOBJDBARGS)
296 {
297 	KKASSERT(obj != NULL);
298 
299 	/*
300 	 * Object must be held (object allocation is stable due to the caller's
301 	 * context, typically already holding the token on a parent object)
302 	 * prior to potentially blocking on the lock, otherwise the object
303 	 * can get ripped away from us.
304 	 */
305 	refcount_acquire(&obj->hold_count);
306 	if (vm_object_lock_try(obj) == 0) {
307 		if (refcount_release(&obj->hold_count)) {
308 			if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD))
309 				kfree(obj, M_VM_OBJECT);
310 		}
311 		return(0);
312 	}
313 
314 #if defined(DEBUG_LOCKS)
315 	debugvm_object_add(obj, file, line, 1);
316 #endif
317 	return(1);
318 }
319 
320 void
321 VMOBJDEBUG(vm_object_hold_shared)(vm_object_t obj VMOBJDBARGS)
322 {
323 	KKASSERT(obj != NULL);
324 
325 	/*
326 	 * Object must be held (object allocation is stable due to the caller's
327 	 * context, typically already holding the token on a parent object)
328 	 * prior to potentially blocking on the lock, otherwise the object
329 	 * can get ripped away from us.
330 	 */
331 	refcount_acquire(&obj->hold_count);
332 	vm_object_lock_shared(obj);
333 
334 #if defined(DEBUG_LOCKS)
335 	debugvm_object_add(obj, file, line, 1);
336 #endif
337 }
338 
339 /*
340  * Drop the token and hold_count on the object.
341  *
342  * WARNING! Token might be shared.
343  */
344 void
345 VMOBJDEBUG(vm_object_drop)(vm_object_t obj VMOBJDBARGS)
346 {
347 	if (obj == NULL)
348 		return;
349 
350 	/*
351 	 * No new holders should be possible once we drop hold_count 1->0 as
352 	 * there is no longer any way to reference the object.
353 	 */
354 	KKASSERT(obj->hold_count > 0);
355 	if (refcount_release(&obj->hold_count)) {
356 #if defined(DEBUG_LOCKS)
357 		debugvm_object_add(obj, file, line, -1);
358 #endif
359 
360 		if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) {
361 			vm_object_unlock(obj);
362 			kfree(obj, M_VM_OBJECT);
363 		} else {
364 			vm_object_unlock(obj);
365 		}
366 	} else {
367 #if defined(DEBUG_LOCKS)
368 		debugvm_object_add(obj, file, line, -1);
369 #endif
370 		vm_object_unlock(obj);
371 	}
372 }
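#if 0
/*
 * Illustrative sketch (not part of the original source): the typical
 * hold/drop pairing used by callers of the API above.  The hold keeps
 * the object from being kfree()'d while we potentially block acquiring
 * the token.  example_inspect_object() is a hypothetical caller, shown
 * only to clarify the discipline; it is intentionally compiled out.
 */
static void
example_inspect_object(vm_object_t obj)
{
	vm_object_hold(obj);	/* bump hold_count, acquire obj->token */
	/* ... may block and examine the object safely here ... */
	vm_object_drop(obj);	/* release token and drop hold_count */
}
#endif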
373 
374 /*
375  * Initialize a freshly allocated object, returning a held object.
376  *
377  * Used only by vm_object_allocate(), zinitna() and vm_object_init().
378  *
379  * No requirements.
380  */
381 void
382 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
383 {
384 	struct vm_object_hash *hash;
385 
386 	RB_INIT(&object->rb_memq);
387 	lwkt_token_init(&object->token, "vmobj");
388 
389 	TAILQ_INIT(&object->backing_list);
390 	lockinit(&object->backing_lk, "baclk", 0, 0);
391 
392 	object->type = type;
393 	object->size = size;
394 	object->ref_count = 1;
395 	object->memattr = VM_MEMATTR_DEFAULT;
396 	object->hold_count = 0;
397 	object->flags = 0;
398 	if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
399 		vm_object_set_flag(object, OBJ_ONEMAPPING);
400 	object->paging_in_progress = 0;
401 	object->resident_page_count = 0;
402 	/* cpu localization twist */
403 	object->pg_color = vm_quickcolor();
404 	object->handle = NULL;
405 
406 	atomic_add_int(&object->generation, 1);
407 	object->swblock_count = 0;
408 	RB_INIT(&object->swblock_root);
409 	vm_object_lock_init(object);
410 	pmap_object_init(object);
411 
412 	vm_object_hold(object);
413 
414 	hash = vmobj_hash(object);
415 	lwkt_gettoken(&hash->token);
416 	TAILQ_INSERT_TAIL(&hash->list, object, object_entry);
417 	lwkt_reltoken(&hash->token);
418 }
419 
420 /*
421  * Initialize a VM object.
422  */
423 void
424 vm_object_init(vm_object_t object, vm_pindex_t size)
425 {
426 	_vm_object_allocate(OBJT_DEFAULT, size, object);
427 	vm_object_drop(object);
428 }
429 
430 /*
431  * Initialize the VM objects module.
432  *
433  * Called from the low level boot code only.  Note that this occurs before
434  * kmalloc is initialized so we cannot allocate any VM objects.
435  */
436 void
437 vm_object_init1(void)
438 {
439 	int i;
440 
441 	for (i = 0; i < VMOBJ_HSIZE; ++i) {
442 		TAILQ_INIT(&vm_object_hash[i].list);
443 		lwkt_token_init(&vm_object_hash[i].token, "vmobjlst");
444 	}
445 
446 	_vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd),
447 			    &kernel_object);
448 	vm_object_drop(&kernel_object);
449 }
450 
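/*
 * Second-stage initialization of the VM object module: lift the kmalloc
 * allocation limit on the vm_object malloc type.
 */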
451 void
452 vm_object_init2(void)
453 {
454 	kmalloc_set_unlimited(M_VM_OBJECT);
455 }
456 
457 /*
458  * Allocate and return a new object of the specified type and size.
459  *
460  * No requirements.
461  */
462 vm_object_t
463 vm_object_allocate(objtype_t type, vm_pindex_t size)
464 {
465 	vm_object_t obj;
466 
467 	obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
468 	_vm_object_allocate(type, size, obj);
469 	vm_object_drop(obj);
470 
471 	return (obj);
472 }
473 
474 /*
475  * This version returns a held object, allowing further atomic initialization
476  * of the object.
477  */
478 vm_object_t
479 vm_object_allocate_hold(objtype_t type, vm_pindex_t size)
480 {
481 	vm_object_t obj;
482 
483 	obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
484 	_vm_object_allocate(type, size, obj);
485 
486 	return (obj);
487 }
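#if 0
/*
 * Illustrative sketch (not part of the original source): why a caller
 * might prefer vm_object_allocate_hold() over vm_object_allocate().
 * Because the object is returned held, fields can be adjusted before
 * the object is dropped for general use.  The OBJT_DEFAULT type and
 * the pg_color override are just example choices; the helper name is
 * hypothetical and the block is compiled out.
 */
static vm_object_t
example_allocate_colored(vm_pindex_t size, int color)
{
	vm_object_t obj;

	obj = vm_object_allocate_hold(OBJT_DEFAULT, size);
	obj->pg_color = color;		/* further atomic initialization */
	vm_object_drop(obj);

	return (obj);
}
#endif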
488 
489 /*
490  * Add an additional reference to a vm_object.  The object must already be
491  * held.  The original non-lock version is no longer supported.  The object
492  * must NOT be chain locked by anyone at the time the reference is added.
493  *
494  * The object must be held, but may be held shared if desired (hence why
495  * we use an atomic op).
496  */
497 void
498 VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS)
499 {
500 	KKASSERT(object != NULL);
501 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
502 	atomic_add_int(&object->ref_count, 1);
503 	if (object->type == OBJT_VNODE) {
504 		vref(object->handle);
505 		/* XXX what if the vnode is being destroyed? */
506 	}
507 #if defined(DEBUG_LOCKS)
508 	debugvm_object_add(object, file, line, 1);
509 #endif
510 }
511 
512 /*
513  * This version is only allowed in situations where the caller
514  * already knows that the object is deterministically referenced
515  * (usually because it is taken from a ref'd vnode, or during a map_entry
516  * replication).
517  */
518 void
519 VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS)
520 {
521 	KKASSERT(object->type == OBJT_VNODE || object->ref_count > 0);
522 	atomic_add_int(&object->ref_count, 1);
523 	if (object->type == OBJT_VNODE)
524 		vref(object->handle);
525 #if defined(DEBUG_LOCKS)
526 	debugvm_object_add(object, file, line, 1);
527 #endif
528 }
529 
530 /*
531  * Dereference an object and its underlying vnode.  The object may be
532  * held shared.  On return the object will remain held.
533  *
534  * This function may return a vnode in *vpp which the caller must release
535  * after the caller drops its own lock.  If vpp is NULL, we assume that
536  * the caller was holding an exclusive lock on the object and we vrele()
537  * the vp ourselves.
538  */
539 static void
540 VMOBJDEBUG(vm_object_vndeallocate)(vm_object_t object, struct vnode **vpp
541 				   VMOBJDBARGS)
542 {
543 	struct vnode *vp = (struct vnode *) object->handle;
544 
545 	KASSERT(object->type == OBJT_VNODE,
546 	    ("vm_object_vndeallocate: not a vnode object"));
547 	KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
548 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
549 #ifdef INVARIANTS
550 	if (object->ref_count == 0) {
551 		vprint("vm_object_vndeallocate", vp);
552 		panic("vm_object_vndeallocate: bad object reference count");
553 	}
554 #endif
555 	for (;;) {
556 		int count = object->ref_count;
557 		cpu_ccfence();
558 		if (count == 1) {
559 			vm_object_upgrade(object);
560 			if (atomic_cmpset_int(&object->ref_count, count, 0)) {
561 				vclrflags(vp, VTEXT);
562 				break;
563 			}
564 		} else {
565 			if (atomic_cmpset_int(&object->ref_count,
566 					      count, count - 1)) {
567 				break;
568 			}
569 		}
570 		/* retry */
571 	}
572 #if defined(DEBUG_LOCKS)
573 	debugvm_object_add(object, file, line, -1);
574 #endif
575 
576 	/*
577 	 * vrele or return the vp to vrele.  We can only safely vrele(vp)
578 	 * if the object was locked exclusively.  But there are two races
579 	 * here.
580 	 *
581 	 * We had to upgrade the object above to safely clear VTEXT
582 	 * but the alternative path where the shared lock is retained
583 	 * can STILL race to 0 in other paths and cause our own vrele()
584 	 * to terminate the vnode.  We can't allow that if the VM object
585 	 * is still locked shared.
586 	 */
587 	if (vpp)
588 		*vpp = vp;
589 	else
590 		vrele(vp);
591 }
592 
593 /*
594  * Release a reference to the specified object, gained either through a
595  * vm_object_allocate or a vm_object_reference call.  When all references
596  * are gone, storage associated with this object may be relinquished.
597  *
598  * The caller does not have to hold the object locked but must have control
599  * over the reference in question in order to guarantee that the object
600  * does not get ripped out from under us.
601  *
602  * XXX Currently all deallocations require an exclusive lock.
603  */
604 void
605 VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS)
606 {
607 	struct vnode *vp;
608 	int count;
609 
610 	if (object == NULL)
611 		return;
612 
613 	for (;;) {
614 		count = object->ref_count;
615 		cpu_ccfence();
616 
617 		/*
618 		 * If decrementing the count enters into special handling
619 		 * territory (0, 1, or 2) we have to do it the hard way.
620 		 * Fortunately, though, objects with only a few refs like this
621 		 * are not likely to be heavily contended anyway.
622 		 *
623 		 * For vnode objects we only care about 1->0 transitions.
624 		 */
625 		if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) {
626 #if defined(DEBUG_LOCKS)
627 			debugvm_object_add(object, file, line, 0);
628 #endif
629 			vm_object_hold(object);
630 			vm_object_deallocate_locked(object);
631 			vm_object_drop(object);
632 			break;
633 		}
634 
635 		/*
636 		 * Try to decrement ref_count without acquiring a hold on
637 		 * the object.  This is particularly important for the exec*()
638 		 * and exit*() code paths because the program binary may
639 		 * have a great deal of sharing and an exclusive lock will
640 		 * crowbar performance in those circumstances.
641 		 */
642 		if (object->type == OBJT_VNODE) {
643 			vp = (struct vnode *)object->handle;
644 			if (atomic_cmpset_int(&object->ref_count,
645 					      count, count - 1)) {
646 #if defined(DEBUG_LOCKS)
647 				debugvm_object_add(object, file, line, -1);
648 #endif
649 
650 				vrele(vp);
651 				break;
652 			}
653 			/* retry */
654 		} else {
655 			if (atomic_cmpset_int(&object->ref_count,
656 					      count, count - 1)) {
657 #if defined(DEBUG_LOCKS)
658 				debugvm_object_add(object, file, line, -1);
659 #endif
660 				break;
661 			}
662 			/* retry */
663 		}
664 		/* retry */
665 	}
666 }
667 
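/*
 * Deallocate a reference on an object which the caller already holds.
 * Handles the vnode special case via vm_object_vndeallocate() and, on
 * the final 1->0 transition, terminates the object.  Non-vnode objects
 * must be held exclusively.
 */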
668 void
669 VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS)
670 {
671 	/*
672 	 * Degenerate case
673 	 */
674 	if (object == NULL)
675 		return;
676 
677 	/*
678 	 * vnode case, caller either locked the object exclusively
679 	 * or this is a recursion with must_drop != 0 and the vnode
680 	 * object will be locked shared.
681 	 *
682 	 * If locked shared we have to drop the object before we can
683 	 * call vrele() or risk a shared/exclusive livelock.
684 	 */
685 	if (object->type == OBJT_VNODE) {
686 		ASSERT_LWKT_TOKEN_HELD(&object->token);
687 		vm_object_vndeallocate(object, NULL);
688 		return;
689 	}
690 	ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token);
691 
692 	/*
693 	 * Normal case (object is locked exclusively)
694 	 */
695 	if (object->ref_count == 0) {
696 		panic("vm_object_deallocate: object deallocated "
697 		      "too many times: %d", object->type);
698 	}
699 	if (object->ref_count > 2) {
700 		atomic_add_int(&object->ref_count, -1);
701 #if defined(DEBUG_LOCKS)
702 		debugvm_object_add(object, file, line, -1);
703 #endif
704 		return;
705 	}
706 
707 	/*
708 	 * Drop the ref and handle termination on the 1->0 transition.
709 	 * We may have blocked above so we have to recheck.
710 	 */
711 	KKASSERT(object->ref_count != 0);
712 	if (object->ref_count >= 2) {
713 		atomic_add_int(&object->ref_count, -1);
714 #if defined(DEBUG_LOCKS)
715 		debugvm_object_add(object, file, line, -1);
716 #endif
717 		return;
718 	}
719 
720 	atomic_add_int(&object->ref_count, -1);
721 	if ((object->flags & OBJ_DEAD) == 0)
722 		vm_object_terminate(object);
723 }
724 
725 /*
726  * Destroy the specified object, freeing up related resources.
727  *
728  * The object must have zero references.
729  *
730  * The object must be held.  The caller is responsible for dropping the object
731  * after terminate returns.  Terminate does NOT drop the object.
732  */
733 static int vm_object_terminate_callback(vm_page_t p, void *data);
734 
735 void
736 vm_object_terminate(vm_object_t object)
737 {
738 	struct rb_vm_page_scan_info info;
739 	struct vm_object_hash *hash;
740 
741 	/*
742 	 * Make sure no one uses us.  Once we set OBJ_DEAD we should be
743 	 * able to safely block.
744 	 */
745 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
746 	KKASSERT((object->flags & OBJ_DEAD) == 0);
747 	vm_object_set_flag(object, OBJ_DEAD);
748 
749 	/*
750 	 * Wait for the pageout daemon to be done with the object
751 	 */
752 	vm_object_pip_wait(object, "objtrm1");
753 
754 	KASSERT(!object->paging_in_progress,
755 		("vm_object_terminate: pageout in progress"));
756 
757 	/*
758 	 * Clean and free the pages, as appropriate. All references to the
759 	 * object are gone, so we don't need to lock it.
760 	 */
761 	if (object->type == OBJT_VNODE) {
762 		struct vnode *vp;
763 
764 		/*
765 		 * Clean pages and flush buffers.
766 		 *
767 		 * NOTE!  TMPFS buffer flushes do not typically flush the
768 		 *	  actual page to swap as this would be highly
769 		 *	  inefficient, and normal filesystems usually wrap
770 		 *	  page flushes with buffer cache buffers.
771 		 *
772 		 *	  To deal with this we have to call vinvalbuf() both
773 		 *	  before and after the vm_object_page_clean().
774 		 */
775 		vp = (struct vnode *) object->handle;
776 		vinvalbuf(vp, V_SAVE, 0, 0);
777 		vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
778 		vinvalbuf(vp, V_SAVE, 0, 0);
779 	}
780 
781 	/*
782 	 * Wait for any I/O to complete, after which there had better not
783 	 * be any references left on the object.
784 	 */
785 	vm_object_pip_wait(object, "objtrm2");
786 
787 	if (object->ref_count != 0) {
788 		panic("vm_object_terminate: object with references, "
789 		      "ref_count=%d", object->ref_count);
790 	}
791 
792 	/*
793 	 * Cleanup any shared pmaps associated with this object.
794 	 */
795 	pmap_object_free(object);
796 
797 	/*
798 	 * Now free any remaining pages. For internal objects, this also
799 	 * removes them from paging queues. Don't free wired pages, just
800 	 * remove them from the object.
801 	 */
802 	info.count = 0;
803 	info.object = object;
804 	do {
805 		info.error = 0;
806 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
807 					vm_object_terminate_callback, &info);
808 	} while (info.error);
809 
810 	/*
811 	 * Let the pager know object is dead.
812 	 */
813 	vm_pager_deallocate(object);
814 
815 	/*
816 	 * Wait for the object hold count to hit 1, clean out pages as
817 	 * we go.  The per-bucket hash token interlocks any race conditions
818 	 * that might pick the object up from its hash list after we have
819 	 * cleared rb_memq.
820 	 */
821 	for (;;) {
822 		if (RB_ROOT(&object->rb_memq) == NULL)
823 			break;
824 		kprintf("vm_object_terminate: Warning, object %p "
825 			"still has %ld pages\n",
826 			object, object->resident_page_count);
827 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
828 					vm_object_terminate_callback, &info);
829 	}
830 
831 	/*
832 	 * There had better not be any pages left
833 	 */
834 	KKASSERT(object->resident_page_count == 0);
835 
836 	/*
837 	 * Remove the object from the global object list.
838 	 */
839 	hash = vmobj_hash(object);
840 	lwkt_gettoken(&hash->token);
841 	TAILQ_REMOVE(&hash->list, object, object_entry);
842 	lwkt_reltoken(&hash->token);
843 
844 	if (object->ref_count != 0) {
845 		panic("vm_object_terminate2: object with references, "
846 		      "ref_count=%d", object->ref_count);
847 	}
848 
849 	/*
850 	 * NOTE: The object hold_count is at least 1, so we cannot kfree()
851 	 *	 the object here.  See vm_object_drop().
852 	 */
853 }
854 
855 /*
856  * The caller must hold the object.
857  */
858 static int
859 vm_object_terminate_callback(vm_page_t p, void *data)
860 {
861 	struct rb_vm_page_scan_info *info = data;
862 	vm_object_t object;
863 
864 	object = p->object;
865 	KKASSERT(object == info->object);
866 	if (vm_page_busy_try(p, TRUE)) {
867 		vm_page_sleep_busy(p, TRUE, "vmotrm");
868 		info->error = 1;
869 		return 0;
870 	}
871 	if (object != p->object) {
872 		/* XXX remove once we determine it can't happen */
873 		kprintf("vm_object_terminate: Warning: Encountered "
874 			"busied page %p on queue %d\n", p, p->queue);
875 		vm_page_wakeup(p);
876 		info->error = 1;
877 	} else if (p->wire_count == 0) {
878 		/*
879 		 * NOTE: p->dirty and PG_NEED_COMMIT are ignored.
880 		 */
881 		vm_page_free(p);
882 		mycpu->gd_cnt.v_pfree++;
883 	} else {
884 		if (p->queue != PQ_NONE) {
885 			kprintf("vm_object_terminate: Warning: Encountered "
886 				"wired page %p on queue %d\n", p, p->queue);
887 			if (vm_object_debug > 0) {
888 				--vm_object_debug;
889 				print_backtrace(10);
890 			}
891 		}
892 		vm_page_remove(p);
893 		vm_page_wakeup(p);
894 	}
895 
896 	/*
897 	 * Must be at end to avoid SMP races, caller holds object token
898 	 */
899 	if ((++info->count & 63) == 0)
900 		lwkt_user_yield();
901 	return(0);
902 }
903 
904 /*
905  * Clean all dirty pages in the specified range of the object.  Leaves pages
906  * on whatever queue they are currently on.  If NOSYNC is set then do not
907  * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
908  * leaving the object dirty.
909  *
910  * When stuffing pages asynchronously, allow clustering.  XXX we need a
911  * synchronous clustering mode implementation.
912  *
913  * Odd semantics: an end of 0 means clean through the end of the object.
914  *
915  * The object must be locked? XXX
916  */
917 static int vm_object_page_clean_pass1(struct vm_page *p, void *data);
918 static int vm_object_page_clean_pass2(struct vm_page *p, void *data);
919 
920 void
921 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
922 		     int flags)
923 {
924 	struct rb_vm_page_scan_info info;
925 	struct vnode *vp;
926 	int wholescan;
927 	int pagerflags;
928 	int generation;
929 
930 	vm_object_hold(object);
931 	if (object->type != OBJT_VNODE ||
932 	    (object->flags & OBJ_MIGHTBEDIRTY) == 0) {
933 		vm_object_drop(object);
934 		return;
935 	}
936 
937 	pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ?
938 			VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
939 	pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0;
940 
941 	vp = object->handle;
942 
943 	/*
944 	 * Interlock other major object operations.  This allows us to
945 	 * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
946 	 */
947 	vm_object_set_flag(object, OBJ_CLEANING);
948 
949 	/*
950 	 * Handle 'entire object' case
951 	 */
952 	info.start_pindex = start;
953 	if (end == 0) {
954 		info.end_pindex = object->size - 1;
955 	} else {
956 		info.end_pindex = end - 1;
957 	}
958 	wholescan = (start == 0 && info.end_pindex == object->size - 1);
959 	info.limit = flags;
960 	info.pagerflags = pagerflags;
961 	info.object = object;
962 
963 	/*
964 	 * If cleaning the entire object do a pass to mark the pages read-only.
965 	 * If everything worked out ok, clear OBJ_WRITEABLE and
966 	 * OBJ_MIGHTBEDIRTY.
967 	 */
968 	if (wholescan) {
969 		info.error = 0;
970 		info.count = 0;
971 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
972 					vm_object_page_clean_pass1, &info);
973 		if (info.error == 0) {
974 			vm_object_clear_flag(object,
975 					     OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
976 			if (object->type == OBJT_VNODE &&
977 			    (vp = (struct vnode *)object->handle) != NULL) {
978 				/*
979 				 * Use new-style interface to clear VISDIRTY
980 				 * because the vnode is not necessarily removed
981 				 * from the syncer list(s) as often as it was
982 				 * under the old interface, which can leave
983 				 * the vnode on the syncer list after reclaim.
984 				 */
985 				vclrobjdirty(vp);
986 			}
987 		}
988 	}
989 
990 	/*
991 	 * Do a pass to clean all the dirty pages we find.
992 	 */
993 	do {
994 		info.error = 0;
995 		info.count = 0;
996 		generation = object->generation;
997 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
998 					vm_object_page_clean_pass2, &info);
999 	} while (info.error || generation != object->generation);
1000 
1001 	vm_object_clear_flag(object, OBJ_CLEANING);
1002 	vm_object_drop(object);
1003 }
1004 
1005 /*
1006  * The caller must hold the object.
1007  */
1008 static
1009 int
1010 vm_object_page_clean_pass1(struct vm_page *p, void *data)
1011 {
1012 	struct rb_vm_page_scan_info *info = data;
1013 
1014 	KKASSERT(p->object == info->object);
1015 
1016 	vm_page_flag_set(p, PG_CLEANCHK);
1017 	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1018 		info->error = 1;
1019 	} else if (vm_page_busy_try(p, FALSE)) {
1020 		info->error = 1;
1021 	} else {
1022 		KKASSERT(p->object == info->object);
1023 		vm_page_protect(p, VM_PROT_READ);
1024 		vm_page_wakeup(p);
1025 	}
1026 
1027 	/*
1028 	 * Must be at end to avoid SMP races, caller holds object token
1029 	 */
1030 	if ((++info->count & 63) == 0)
1031 		lwkt_user_yield();
1032 	return(0);
1033 }
1034 
1035 /*
1036  * The caller must hold the object
1037  */
1038 static
1039 int
1040 vm_object_page_clean_pass2(struct vm_page *p, void *data)
1041 {
1042 	struct rb_vm_page_scan_info *info = data;
1043 	int generation;
1044 
1045 	KKASSERT(p->object == info->object);
1046 
1047 	/*
1048 	 * Do not mess with pages that were inserted after we started
1049 	 * the cleaning pass.
1050 	 */
1051 	if ((p->flags & PG_CLEANCHK) == 0)
1052 		goto done;
1053 
1054 	generation = info->object->generation;
1055 
1056 	if (vm_page_busy_try(p, TRUE)) {
1057 		vm_page_sleep_busy(p, TRUE, "vpcwai");
1058 		info->error = 1;
1059 		goto done;
1060 	}
1061 
1062 	KKASSERT(p->object == info->object &&
1063 		 info->object->generation == generation);
1064 
1065 	/*
1066 	 * Before wasting time traversing the pmaps, check for trivial
1067 	 * cases where the page cannot be dirty.
1068 	 */
1069 	if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) {
1070 		KKASSERT((p->dirty & p->valid) == 0 &&
1071 			 (p->flags & PG_NEED_COMMIT) == 0);
1072 		vm_page_wakeup(p);
1073 		goto done;
1074 	}
1075 
1076 	/*
1077 	 * Check whether the page is dirty or not.  The page has been set
1078 	 * to be read-only so the check will not race a user dirtying the
1079 	 * page.
1080 	 */
1081 	vm_page_test_dirty(p);
1082 	if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) {
1083 		vm_page_flag_clear(p, PG_CLEANCHK);
1084 		vm_page_wakeup(p);
1085 		goto done;
1086 	}
1087 
1088 	/*
1089 	 * If we have been asked to skip nosync pages and this is a
1090 	 * nosync page, skip it.  Note that the object flags were
1091 	 * not cleared in this case (because pass1 will have returned an
1092 	 * error), so we do not have to set them.
1093 	 */
1094 	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1095 		vm_page_flag_clear(p, PG_CLEANCHK);
1096 		vm_page_wakeup(p);
1097 		goto done;
1098 	}
1099 
1100 	/*
1101 	 * Flush as many pages as we can.  PG_CLEANCHK will be cleared on
1102 	 * the pages that get successfully flushed.  Set info->error if
1103 	 * we raced an object modification.
1104 	 */
1105 	vm_object_page_collect_flush(info->object, p, info->pagerflags);
1106 	/* vm_wait_nominal(); this can deadlock the system in syncer/pageout */
1107 
1108 	/*
1109 	 * Must be at end to avoid SMP races, caller holds object token
1110 	 */
1111 done:
1112 	if ((++info->count & 63) == 0)
1113 		lwkt_user_yield();
1114 	return(0);
1115 }
1116 
1117 /*
1118  * Collect the specified page and nearby pages and flush them out.
1119  * The passed page is busied
1120  * by the caller and we are responsible for its disposition.
1121  *
1122  * The caller must hold the object.
1123  */
1124 static void
1125 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
1126 {
1127 	int error;
1128 	int is;
1129 	int ib;
1130 	int i;
1131 	int page_base;
1132 	vm_pindex_t pi;
1133 	vm_page_t ma[BLIST_MAX_ALLOC];
1134 
1135 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1136 
1137 	pi = p->pindex;
1138 	page_base = pi % BLIST_MAX_ALLOC;
1139 	ma[page_base] = p;
1140 	ib = page_base - 1;
1141 	is = page_base + 1;
1142 
1143 	while (ib >= 0) {
1144 		vm_page_t tp;
1145 
1146 		tp = vm_page_lookup_busy_try(object, pi - page_base + ib,
1147 					     TRUE, &error);
1148 		if (error)
1149 			break;
1150 		if (tp == NULL)
1151 			break;
1152 		if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
1153 		    (tp->flags & PG_CLEANCHK) == 0) {
1154 			vm_page_wakeup(tp);
1155 			break;
1156 		}
1157 		if ((tp->queue - tp->pc) == PQ_CACHE) {
1158 			vm_page_flag_clear(tp, PG_CLEANCHK);
1159 			vm_page_wakeup(tp);
1160 			break;
1161 		}
1162 		vm_page_test_dirty(tp);
1163 		if ((tp->dirty & tp->valid) == 0 &&
1164 		    (tp->flags & PG_NEED_COMMIT) == 0) {
1165 			vm_page_flag_clear(tp, PG_CLEANCHK);
1166 			vm_page_wakeup(tp);
1167 			break;
1168 		}
1169 		ma[ib] = tp;
1170 		--ib;
1171 	}
1172 	++ib;	/* fixup */
1173 
1174 	while (is < BLIST_MAX_ALLOC &&
1175 	       pi - page_base + is < object->size) {
1176 		vm_page_t tp;
1177 
1178 		tp = vm_page_lookup_busy_try(object, pi - page_base + is,
1179 					     TRUE, &error);
1180 		if (error)
1181 			break;
1182 		if (tp == NULL)
1183 			break;
1184 		if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
1185 		    (tp->flags & PG_CLEANCHK) == 0) {
1186 			vm_page_wakeup(tp);
1187 			break;
1188 		}
1189 		if ((tp->queue - tp->pc) == PQ_CACHE) {
1190 			vm_page_flag_clear(tp, PG_CLEANCHK);
1191 			vm_page_wakeup(tp);
1192 			break;
1193 		}
1194 		vm_page_test_dirty(tp);
1195 		if ((tp->dirty & tp->valid) == 0 &&
1196 		    (tp->flags & PG_NEED_COMMIT) == 0) {
1197 			vm_page_flag_clear(tp, PG_CLEANCHK);
1198 			vm_page_wakeup(tp);
1199 			break;
1200 		}
1201 		ma[is] = tp;
1202 		++is;
1203 	}
1204 
1205 	/*
1206 	 * All pages in the ma[] array are busied now
1207 	 */
1208 	for (i = ib; i < is; ++i) {
1209 		vm_page_flag_clear(ma[i], PG_CLEANCHK);
1210 		vm_page_hold(ma[i]);	/* XXX need this any more? */
1211 	}
1212 	vm_pageout_flush(&ma[ib], is - ib, pagerflags);
1213 	for (i = ib; i < is; ++i)	/* XXX need this any more? */
1214 		vm_page_unhold(ma[i]);
1215 }
1216 
1217 /*
1218  * Implements the madvise function at the object/page level.
1219  *
1220  * MADV_WILLNEED	(any object)
1221  *
1222  *	Activate the specified pages if they are resident.
1223  *
1224  * MADV_DONTNEED	(any object)
1225  *
1226  *	Deactivate the specified pages if they are resident.
1227  *
1228  * MADV_FREE	(OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only)
1229  *
1230  *	Deactivate and clean the specified pages if they are
1231  *	resident.  This permits the process to reuse the pages
1232  *	without faulting or the kernel to reclaim the pages
1233  *	without I/O.
1234  *
1235  * No requirements.
1236  */
1237 void
1238 vm_object_madvise(vm_object_t object, vm_pindex_t pindex,
1239 		  vm_pindex_t count, int advise)
1240 {
1241 	vm_pindex_t end;
1242 	vm_page_t m;
1243 	int error;
1244 
1245 	if (object == NULL)
1246 		return;
1247 
1248 	end = pindex + count;
1249 
1250 	vm_object_hold(object);
1251 
1252 	/*
1253 	 * Locate and adjust resident pages.  This only applies to the
1254 	 * primary object in the mapping.
1255 	 */
1256 	for (; pindex < end; pindex += 1) {
1257 relookup:
1258 		/*
1259 		 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
1260 		 * and the object must be flagged OBJ_ONEMAPPING.
1261 		 */
1262 		if (advise == MADV_FREE) {
1263 			if ((object->type != OBJT_DEFAULT &&
1264 			     object->type != OBJT_SWAP) ||
1265 			    (object->flags & OBJ_ONEMAPPING) == 0) {
1266 				continue;
1267 			}
1268 		}
1269 
1270 		m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
1271 
1272 		if (error) {
1273 			vm_page_sleep_busy(m, TRUE, "madvpo");
1274 			goto relookup;
1275 		}
1276 		if (m == NULL) {
1277 			/*
1278 			 * There may be swap even if there is no backing page
1279 			 */
1280 			if (advise == MADV_FREE && object->type == OBJT_SWAP)
1281 				swap_pager_freespace(object, pindex, 1);
1282 			continue;
1283 		}
1284 
1285 		/*
1286 		 * If the page is not in a normal active state, we skip it.
1287 		 * If the page is not managed there are no page queues to
1288 		 * mess with.  Things can break if we mess with pages in
1289 		 * any of the below states.
1290 		 */
1291 		if (m->wire_count ||
1292 		    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED |
1293 				 PG_NEED_COMMIT)) ||
1294 		    m->valid != VM_PAGE_BITS_ALL
1295 		) {
1296 			vm_page_wakeup(m);
1297 			continue;
1298 		}
1299 
1300 		/*
1301 		 * Theoretically once a page is known not to be busy, an
1302 		 * interrupt cannot come along and rip it out from under us.
1303 		 */
1304 		if (advise == MADV_WILLNEED) {
1305 			vm_page_activate(m);
1306 		} else if (advise == MADV_DONTNEED) {
1307 			vm_page_dontneed(m);
1308 		} else if (advise == MADV_FREE) {
1309 			/*
1310 			 * Mark the page clean.  This will allow the page
1311 			 * to be freed up by the system.  However, such pages
1312 			 * are often reused quickly by malloc()/free()
1313 			 * so we do not do anything that would cause
1314 			 * a page fault if we can help it.
1315 			 *
1316 			 * Specifically, we do not try to actually free
1317 			 * the page now nor do we try to put it in the
1318 			 * cache (which would cause a page fault on reuse).
1319 			 *
1320 			 * But we do make the page as freeable as we
1321 			 * can without actually taking the step of unmapping
1322 			 * it.
1323 			 */
1324 			pmap_clear_modify(m);
1325 			m->dirty = 0;
1326 			m->act_count = 0;
1327 			vm_page_dontneed(m);
1328 			if (object->type == OBJT_SWAP)
1329 				swap_pager_freespace(object, pindex, 1);
1330 		}
1331 		vm_page_wakeup(m);
1332 	}
1333 	vm_object_drop(object);
1334 }
1335 
1336 /*
1337  * Removes all physical pages in the specified object range from the
1338  * object's list of pages.
1339  *
1340  * No requirements.
1341  */
1342 static int vm_object_page_remove_callback(vm_page_t p, void *data);
1343 
1344 void
1345 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
1346 		      boolean_t clean_only)
1347 {
1348 	struct rb_vm_page_scan_info info;
1349 	int all;
1350 
1351 	/*
1352 	 * Degenerate cases and assertions
1353 	 */
1354 	vm_object_hold(object);
1355 	if (object == NULL ||
1356 	    (object->resident_page_count == 0 && object->swblock_count == 0)) {
1357 		vm_object_drop(object);
1358 		return;
1359 	}
1360 	KASSERT(object->type != OBJT_PHYS,
1361 		("attempt to remove pages from a physical object"));
1362 
1363 	/*
1364 	 * Indicate that paging is occurring on the object
1365 	 */
1366 	vm_object_pip_add(object, 1);
1367 
1368 	/*
1369 	 * Figure out the actual removal range and whether we are removing
1370 	 * the entire contents of the object or not.  If removing the entire
1371 	 * contents, be sure to get all pages, even those that might be
1372 	 * beyond the end of the object.
1373 	 */
1374 	info.object = object;
1375 	info.start_pindex = start;
1376 	if (end == 0)
1377 		info.end_pindex = (vm_pindex_t)-1;
1378 	else
1379 		info.end_pindex = end - 1;
1380 	info.limit = clean_only;
1381 	info.count = 0;
1382 	all = (start == 0 && info.end_pindex >= object->size - 1);
1383 
1384 	/*
1385 	 * Loop until we are sure we have gotten them all.
1386 	 */
1387 	do {
1388 		info.error = 0;
1389 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1390 					vm_object_page_remove_callback, &info);
1391 	} while (info.error);
1392 
1393 	/*
1394 	 * Remove any related swap if throwing away pages, or for
1395 	 * non-swap objects (the swap is a clean copy in that case).
1396 	 */
1397 	if (object->type != OBJT_SWAP || clean_only == FALSE) {
1398 		if (all)
1399 			swap_pager_freespace_all(object);
1400 		else
1401 			swap_pager_freespace(object, info.start_pindex,
1402 			     info.end_pindex - info.start_pindex + 1);
1403 	}
1404 
1405 	/*
1406 	 * Cleanup
1407 	 */
1408 	vm_object_pip_wakeup(object);
1409 	vm_object_drop(object);
1410 }
1411 
1412 /*
1413  * The caller must hold the object.
1414  *
1415  * NOTE: User yields are allowed when removing more than one page, but not
1416  *	 allowed if only removing one page (the path for single page removals
1417  *	 might hold a spinlock).
1418  */
1419 static int
1420 vm_object_page_remove_callback(vm_page_t p, void *data)
1421 {
1422 	struct rb_vm_page_scan_info *info = data;
1423 
1424 	if (info->object != p->object ||
1425 	    p->pindex < info->start_pindex ||
1426 	    p->pindex > info->end_pindex) {
1427 		kprintf("vm_object_page_remove_callbackA: obj/pg race %p/%p\n",
1428 			info->object, p);
1429 		return(0);
1430 	}
1431 	if (vm_page_busy_try(p, TRUE)) {
1432 		vm_page_sleep_busy(p, TRUE, "vmopar");
1433 		info->error = 1;
1434 		return(0);
1435 	}
1436 	if (info->object != p->object) {
1437 		/* this should never happen */
1438 		kprintf("vm_object_page_remove_callbackB: obj/pg race %p/%p\n",
1439 			info->object, p);
1440 		vm_page_wakeup(p);
1441 		return(0);
1442 	}
1443 
1444 	/*
1445 	 * Wired pages cannot be destroyed, but they can be invalidated
1446 	 * and we do so if clean_only (limit) is not set.
1447 	 *
1448 	 * WARNING!  The page may be wired due to being part of a buffer
1449 	 *	     cache buffer, and the buffer might be marked B_CACHE.
1450 	 *	     This is fine as part of a truncation but VFSs must be
1451 	 *	     sure to fix the buffer up when re-extending the file.
1452 	 *
1453 	 * NOTE!     PG_NEED_COMMIT is ignored.
1454 	 */
1455 	if (p->wire_count != 0) {
1456 		vm_page_protect(p, VM_PROT_NONE);
1457 		if (info->limit == 0)
1458 			p->valid = 0;
1459 		vm_page_wakeup(p);
1460 		goto done;
1461 	}
1462 
1463 	/*
1464 	 * limit is our clean_only flag.  If set and the page is dirty or
1465 	 * requires a commit, do not free it.  If set and the page is being
1466 	 * held by someone, do not free it.
1467 	 */
1468 	if (info->limit && p->valid) {
1469 		vm_page_test_dirty(p);
1470 		if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
1471 			vm_page_wakeup(p);
1472 			goto done;
1473 		}
1474 	}
1475 
1476 	/*
1477 	 * Destroy the page.  But we have to re-test whether its dirty after
1478 	 * removing it from its pmaps.
1479 	 */
1480 	vm_page_protect(p, VM_PROT_NONE);
1481 	if (info->limit && p->valid) {
1482 		vm_page_test_dirty(p);
1483 		if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
1484 			vm_page_wakeup(p);
1485 			goto done;
1486 		}
1487 	}
1488 	vm_page_free(p);
1489 
1490 	/*
1491 	 * Must be at end to avoid SMP races, caller holds object token
1492 	 */
1493 done:
1494 	if ((++info->count & 63) == 0)
1495 		lwkt_user_yield();
1496 
1497 	return(0);
1498 }
1499 
1500 /*
1501  * Try to extend prev_object into an adjoining region of virtual
1502  * memory, return TRUE on success.
1503  *
1504  * The caller does not need to hold (prev_object) but must have a stable
1505  * pointer to it (typically by holding the vm_map locked).
1506  *
1507  * This function only works for anonymous memory objects which either
1508  * have (a) one reference or (b) we are extending the object's size.
1509  * Otherwise the related VM pages we want to use for the object might
1510  * be in use by another mapping.
1511  */
1512 boolean_t
1513 vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
1514 		   vm_size_t prev_size, vm_size_t next_size)
1515 {
1516 	vm_pindex_t next_pindex;
1517 
1518 	if (prev_object == NULL)
1519 		return (TRUE);
1520 
1521 	vm_object_hold(prev_object);
1522 
1523 	if (prev_object->type != OBJT_DEFAULT &&
1524 	    prev_object->type != OBJT_SWAP) {
1525 		vm_object_drop(prev_object);
1526 		return (FALSE);
1527 	}
1528 
1529 #if 0
1530 	/* caller now checks this */
1531 	/*
1532 	 * Try to collapse the object first
1533 	 */
1534 	vm_object_collapse(prev_object, NULL);
1535 #endif
1536 
1537 #if 0
1538 	/* caller now checks this */
1539 	/*
1540 	 * We can't coalesce if we shadow another object (figuring out the
1541 	 * relationships becomes too complex).
1542 	 */
1543 	if (prev_object->backing_object != NULL) {
1544 		vm_object_chain_release(prev_object);
1545 		vm_object_drop(prev_object);
1546 		return (FALSE);
1547 	}
1548 #endif
1549 
1550 	prev_size >>= PAGE_SHIFT;
1551 	next_size >>= PAGE_SHIFT;
1552 	next_pindex = prev_pindex + prev_size;
1553 
1554 	/*
1555 	 * We can't coalesce if the object has more than one reference unless we
1556 	 * are extending it into newly minted space.
1557 	 */
1558 	if (prev_object->ref_count > 1 &&
1559 	    prev_object->size != next_pindex) {
1560 		vm_object_drop(prev_object);
1561 		return (FALSE);
1562 	}
1563 
1564 	/*
1565 	 * Remove any pages that may still be in the object from a previous
1566 	 * deallocation.
1567 	 */
1568 	if (next_pindex < prev_object->size) {
1569 		vm_object_page_remove(prev_object,
1570 				      next_pindex,
1571 				      next_pindex + next_size, FALSE);
1572 		if (prev_object->type == OBJT_SWAP)
1573 			swap_pager_freespace(prev_object,
1574 					     next_pindex, next_size);
1575 	}
1576 
1577 	/*
1578 	 * Extend the object if necessary.
1579 	 */
1580 	if (next_pindex + next_size > prev_object->size)
1581 		prev_object->size = next_pindex + next_size;
1582 	vm_object_drop(prev_object);
1583 
1584 	return (TRUE);
1585 }
1586 
1587 /*
1588  * Make the object writable and flag it as possibly being dirty.
1589  *
1590  * The object might not be held (or might only be held shared),
1591  * the related vnode is probably not held either.  Object and vnode are
1592  * stable by virtue of the vm_page busied by the caller preventing
1593  * destruction.
1594  *
1595  * If the related mount is flagged MNTK_THR_SYNC we need to call
1596  * vsetobjdirty().  Filesystems using this option usually shortcut
1597  * synchronization by only scanning the syncer list.
1598  */
1599 void
1600 vm_object_set_writeable_dirty(vm_object_t object)
1601 {
1602 	struct vnode *vp;
1603 
1604 	/*vm_object_assert_held(object);*/
1605 	/*
1606 	 * Avoid contention in vm fault path by checking the state before
1607 	 * issuing an atomic op on it.
1608 	 */
1609 	if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) !=
1610 	    (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) {
1611 		vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
1612 	}
1613 	if (object->type == OBJT_VNODE &&
1614 	    (vp = (struct vnode *)object->handle) != NULL) {
1615 		if ((vp->v_flag & VOBJDIRTY) == 0) {
1616 			if (vp->v_mount &&
1617 			    (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) {
1618 				/*
1619 				 * New style THR_SYNC places vnodes on the
1620 				 * syncer list more deterministically.
1621 				 */
1622 				vsetobjdirty(vp);
1623 			} else {
1624 				/*
1625 				 * Old style scan would not necessarily place
1626 				 * a vnode on the syncer list when possibly
1627 				 * modified via mmap.
1628 				 */
1629 				vsetflags(vp, VOBJDIRTY);
1630 			}
1631 		}
1632 	}
1633 }
1634 
1635 #include "opt_ddb.h"
1636 #ifdef DDB
1637 #include <sys/cons.h>
1638 
1639 #include <ddb/ddb.h>
1640 
1641 static int	_vm_object_in_map (vm_map_t map, vm_object_t object,
1642 				       vm_map_entry_t entry);
1643 static int	vm_object_in_map (vm_object_t object);
1644 
1645 /*
1646  * The caller must hold the object.
1647  */
1648 static int
1649 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
1650 {
1651 	vm_map_backing_t ba;
1652 	vm_map_t tmpm;
1653 	vm_map_entry_t tmpe;
1654 	int entcount;
1655 
1656 	if (map == NULL)
1657 		return 0;
1658 	if (entry == NULL) {
1659 		tmpe = RB_MIN(vm_map_rb_tree, &map->rb_root);
1660 		entcount = map->nentries;
1661 		while (entcount-- && tmpe) {
1662 			if( _vm_object_in_map(map, object, tmpe)) {
1663 				return 1;
1664 			}
1665 			tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1666 		}
1667 		return (0);
1668 	}
1669 	switch(entry->maptype) {
1670 	case VM_MAPTYPE_SUBMAP:
1671 		tmpm = entry->ba.sub_map;
1672 		tmpe = RB_MIN(vm_map_rb_tree, &tmpm->rb_root);
1673 		entcount = tmpm->nentries;
1674 		while (entcount-- && tmpe) {
1675 			if( _vm_object_in_map(tmpm, object, tmpe)) {
1676 				return 1;
1677 			}
1678 			tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1679 		}
1680 		break;
1681 	case VM_MAPTYPE_NORMAL:
1682 	case VM_MAPTYPE_VPAGETABLE:
1683 		ba = &entry->ba;
1684 		while (ba) {
1685 			if (ba->object == object)
1686 				return TRUE;
1687 			ba = ba->backing_ba;
1688 		}
1689 		break;
1690 	default:
1691 		break;
1692 	}
1693 	return 0;
1694 }
1695 
1696 static int vm_object_in_map_callback(struct proc *p, void *data);
1697 
1698 struct vm_object_in_map_info {
1699 	vm_object_t object;
1700 	int rv;
1701 };
1702 
1703 /*
1704  * Debugging only
1705  */
1706 static int
1707 vm_object_in_map(vm_object_t object)
1708 {
1709 	struct vm_object_in_map_info info;
1710 
1711 	info.rv = 0;
1712 	info.object = object;
1713 
1714 	allproc_scan(vm_object_in_map_callback, &info, 0);
1715 	if (info.rv)
1716 		return 1;
1717 	if( _vm_object_in_map(&kernel_map, object, 0))
1718 		return 1;
1719 	if( _vm_object_in_map(&pager_map, object, 0))
1720 		return 1;
1721 	if( _vm_object_in_map(&buffer_map, object, 0))
1722 		return 1;
1723 	return 0;
1724 }
1725 
1726 /*
1727  * Debugging only
1728  */
1729 static int
1730 vm_object_in_map_callback(struct proc *p, void *data)
1731 {
1732 	struct vm_object_in_map_info *info = data;
1733 
1734 	if (p->p_vmspace) {
1735 		if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) {
1736 			info->rv = 1;
1737 			return -1;
1738 		}
1739 	}
1740 	return (0);
1741 }
1742 
1743 DB_SHOW_COMMAND(vmochk, vm_object_check)
1744 {
1745 	struct vm_object_hash *hash;
1746 	vm_object_t object;
1747 	int n;
1748 
1749 	/*
1750 	 * make sure that internal objs are in a map somewhere
1751 	 * and none have zero ref counts.
1752 	 */
1753 	for (n = 0; n < VMOBJ_HSIZE; ++n) {
1754 		hash = &vm_object_hash[n];
1755 		for (object = TAILQ_FIRST(&hash->list);
1756 				object != NULL;
1757 				object = TAILQ_NEXT(object, object_entry)) {
1758 			if (object->type == OBJT_MARKER)
1759 				continue;
1760 			if (object->handle != NULL ||
1761 			    (object->type != OBJT_DEFAULT &&
1762 			     object->type != OBJT_SWAP)) {
1763 				continue;
1764 			}
1765 			if (object->ref_count == 0) {
1766 				db_printf("vmochk: internal obj has "
1767 					  "zero ref count: %ld\n",
1768 					  (long)object->size);
1769 			}
1770 			if (vm_object_in_map(object))
1771 				continue;
1772 			db_printf("vmochk: internal obj is not in a map: "
1773 				  "ref: %d, size: %lu: 0x%lx\n",
1774 				  object->ref_count, (u_long)object->size,
1775 				  (u_long)object->size);
1776 		}
1777 	}
1778 }
1779 
1780 /*
1781  * Debugging only
1782  */
1783 DB_SHOW_COMMAND(object, vm_object_print_static)
1784 {
1785 	/* XXX convert args. */
1786 	vm_object_t object = (vm_object_t)addr;
1787 	boolean_t full = have_addr;
1788 
1789 	vm_page_t p;
1790 
1791 	/* XXX count is an (unused) arg.  Avoid shadowing it. */
1792 #define	count	was_count
1793 
1794 	int count;
1795 
1796 	if (object == NULL)
1797 		return;
1798 
1799 	db_iprintf(
1800 	    "Object %p: type=%d, size=0x%lx, res=%ld, ref=%d, flags=0x%x\n",
1801 	    object, (int)object->type, (u_long)object->size,
1802 	    object->resident_page_count, object->ref_count, object->flags);
1803 	/*
1804 	 * XXX no %qd in kernel.  Truncate object->backing_object_offset.
1805 	 */
1806 	db_iprintf("\n");
1807 
1808 	if (!full)
1809 		return;
1810 
1811 	db_indent += 2;
1812 	count = 0;
1813 	RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) {
1814 		if (count == 0)
1815 			db_iprintf("memory:=");
1816 		else if (count == 6) {
1817 			db_printf("\n");
1818 			db_iprintf(" ...");
1819 			count = 0;
1820 		} else
1821 			db_printf(",");
1822 		count++;
1823 
1824 		db_printf("(off=0x%lx,page=0x%lx)",
1825 		    (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
1826 	}
1827 	if (count != 0)
1828 		db_printf("\n");
1829 	db_indent -= 2;
1830 }
1831 
1832 /* XXX. */
1833 #undef count
1834 
1835 /*
1836  * XXX need this non-static entry for calling from vm_map_print.
1837  *
1838  * Debugging only
1839  */
1840 void
1841 vm_object_print(/* db_expr_t */ long addr,
1842 		boolean_t have_addr,
1843 		/* db_expr_t */ long count,
1844 		char *modif)
1845 {
1846 	vm_object_print_static(addr, have_addr, count, modif);
1847 }
1848 
1849 /*
1850  * Debugging only
1851  */
1852 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
1853 {
1854 	struct vm_object_hash *hash;
1855 	vm_object_t object;
1856 	int nl = 0;
1857 	int c;
1858 	int n;
1859 
1860 	for (n = 0; n < VMOBJ_HSIZE; ++n) {
1861 		hash = &vm_object_hash[n];
1862 		for (object = TAILQ_FIRST(&hash->list);
1863 				object != NULL;
1864 				object = TAILQ_NEXT(object, object_entry)) {
1865 			vm_pindex_t idx, fidx;
1866 			vm_pindex_t osize;
1867 			vm_paddr_t pa = -1, padiff;
1868 			int rcount;
1869 			vm_page_t m;
1870 
1871 			if (object->type == OBJT_MARKER)
1872 				continue;
1873 			db_printf("new object: %p\n", (void *)object);
1874 			if ( nl > 18) {
1875 				c = cngetc();
1876 				if (c != ' ')
1877 					return;
1878 				nl = 0;
1879 			}
1880 			nl++;
1881 			rcount = 0;
1882 			fidx = 0;
1883 			osize = object->size;
1884 			if (osize > 128)
1885 				osize = 128;
1886 			for (idx = 0; idx < osize; idx++) {
1887 				m = vm_page_lookup(object, idx);
1888 				if (m == NULL) {
1889 					if (rcount) {
1890 						db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
1891 							(long)fidx, rcount, (long)pa);
1892 						if ( nl > 18) {
1893 							c = cngetc();
1894 							if (c != ' ')
1895 								return;
1896 							nl = 0;
1897 						}
1898 						nl++;
1899 						rcount = 0;
1900 					}
1901 					continue;
1902 				}
1903 
1904 				if (rcount &&
1905 					(VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
1906 					++rcount;
1907 					continue;
1908 				}
1909 				if (rcount) {
1910 					padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m);
1911 					padiff >>= PAGE_SHIFT;
1912 					padiff &= PQ_L2_MASK;
1913 					if (padiff == 0) {
1914 						pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE;
1915 						++rcount;
1916 						continue;
1917 					}
1918 					db_printf(" index(%ld)run(%d)pa(0x%lx)",
1919 						(long)fidx, rcount, (long)pa);
1920 					db_printf("pd(%ld)\n", (long)padiff);
1921 					if ( nl > 18) {
1922 						c = cngetc();
1923 						if (c != ' ')
1924 							return;
1925 						nl = 0;
1926 					}
1927 					nl++;
1928 				}
1929 				fidx = idx;
1930 				pa = VM_PAGE_TO_PHYS(m);
1931 				rcount = 1;
1932 			}
1933 			if (rcount) {
1934 				db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
1935 					(long)fidx, rcount, (long)pa);
1936 				if ( nl > 18) {
1937 					c = cngetc();
1938 					if (c != ' ')
1939 						return;
1940 					nl = 0;
1941 				}
1942 				nl++;
1943 			}
1944 		}
1945 	}
1946 }
1947 #endif /* DDB */
1948