xref: /dragonfly/sys/vm/vm_object.c (revision 631c21f2)
1 /*
2  * Copyright (c) 1991, 1993, 2013
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * The Mach Operating System project at Carnegie-Mellon University.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	from: @(#)vm_object.c	8.5 (Berkeley) 3/22/94
33  *
34  *
35  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
36  * All rights reserved.
37  *
38  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
39  *
40  * Permission to use, copy, modify and distribute this software and
41  * its documentation is hereby granted, provided that both the copyright
42  * notice and this permission notice appear in all copies of the
43  * software, derivative works or modified versions, and any portions
44  * thereof, and that both notices appear in supporting documentation.
45  *
46  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
49  *
50  * Carnegie Mellon requests users of this software to return to
51  *
52  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
53  *  School of Computer Science
54  *  Carnegie Mellon University
55  *  Pittsburgh PA 15213-3890
56  *
57  * any improvements or extensions that they make and grant Carnegie the
58  * rights to redistribute these changes.
59  *
60  * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $
61  */
62 
63 /*
64  *	Virtual memory object module.
65  */
66 
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/proc.h>		/* for curproc, pageproc */
70 #include <sys/thread.h>
71 #include <sys/vnode.h>
72 #include <sys/vmmeter.h>
73 #include <sys/mman.h>
74 #include <sys/mount.h>
75 #include <sys/kernel.h>
76 #include <sys/malloc.h>
77 #include <sys/sysctl.h>
78 #include <sys/refcount.h>
79 
80 #include <vm/vm.h>
81 #include <vm/vm_param.h>
82 #include <vm/pmap.h>
83 #include <vm/vm_map.h>
84 #include <vm/vm_object.h>
85 #include <vm/vm_page.h>
86 #include <vm/vm_pageout.h>
87 #include <vm/vm_pager.h>
88 #include <vm/swap_pager.h>
89 #include <vm/vm_kern.h>
90 #include <vm/vm_extern.h>
91 #include <vm/vm_zone.h>
92 
93 #include <vm/vm_page2.h>
94 
95 #include <machine/specialreg.h>
96 
97 #define EASY_SCAN_FACTOR	8
98 
99 static void	vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
100 					     int pagerflags);
101 static void	vm_object_lock_init(vm_object_t);
102 
103 /*
104  *	Virtual memory objects maintain the actual data
105  *	associated with allocated virtual memory.  A given
106  *	page of memory exists within exactly one object.
107  *
108  *	An object is only deallocated when all "references"
109  *	are given up.  Only one "reference" to a given
110  *	region of an object should be writeable.
111  *
112  *	Associated with each object is a list of all resident
113  *	memory pages belonging to that object; this list is
114  *	maintained by the "vm_page" module, and locked by the object's
115  *	lock.
116  *
117  *	Each object also records a "pager" routine which is
118  *	used to retrieve (and store) pages to the proper backing
119  *	storage.  In addition, objects may be backed by other
120  *	objects from which they were virtual-copied.
121  *
122  *	The only items within the object structure which are
123  *	modified after time of creation are:
124  *		reference count		locked by object's lock
125  *		pager routine		locked by object's lock
126  *
127  */
128 
129 struct vm_object kernel_object;
130 
131 struct vm_object_hash vm_object_hash[VMOBJ_HSIZE];
132 
133 static MALLOC_DEFINE_OBJ(M_VM_OBJECT, sizeof(struct vm_object),
134 		"vm_object", "vm_object structures");
135 
136 #define VMOBJ_HASH_PRIME1	66555444443333333ULL
137 #define VMOBJ_HASH_PRIME2	989042931893ULL
138 
139 int vm_object_debug;
140 SYSCTL_INT(_vm, OID_AUTO, object_debug, CTLFLAG_RW, &vm_object_debug, 0, "");
141 
142 static __inline
143 struct vm_object_hash *
144 vmobj_hash(vm_object_t obj)
145 {
146 	uintptr_t hash1;
147 	uintptr_t hash2;
148 
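	/*
	 * Mix the object's address through two large primes to select one
	 * of the VMOBJ_HSIZE hash buckets.  Each bucket has its own token
	 * and object list, spreading hash-list contention across buckets.
	 */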
149 	hash1 = (uintptr_t)obj + ((uintptr_t)obj >> 18);
150 	hash1 %= VMOBJ_HASH_PRIME1;
151 	hash2 = ((uintptr_t)obj >> 8) + ((uintptr_t)obj >> 24);
152 	hash2 %= VMOBJ_HASH_PRIME2;
153 	return (&vm_object_hash[(hash1 ^ hash2) & VMOBJ_HMASK]);
154 }
155 
156 #if defined(DEBUG_LOCKS)
157 
158 #define vm_object_vndeallocate(obj, vpp)	\
159                 debugvm_object_vndeallocate(obj, vpp, __FILE__, __LINE__)
160 
161 /*
162  * Debug helper to track hold/drop/ref/deallocate calls.
163  */
164 static void
165 debugvm_object_add(vm_object_t obj, char *file, int line, int addrem)
166 {
167 	int i;
168 
169 	i = atomic_fetchadd_int(&obj->debug_index, 1);
170 	i = i & (VMOBJ_DEBUG_ARRAY_SIZE - 1);
171 	ksnprintf(obj->debug_hold_thrs[i],
172 		  sizeof(obj->debug_hold_thrs[i]),
173 		  "%c%d:(%d):%s",
174 		  (addrem == -1 ? '-' : (addrem == 1 ? '+' : '=')),
175 		  (curthread->td_proc ? curthread->td_proc->p_pid : -1),
176 		  obj->ref_count,
177 		  curthread->td_comm);
178 	obj->debug_hold_file[i] = file;
179 	obj->debug_hold_line[i] = line;
180 #if 0
181 	/* Uncomment for debugging obj refs/derefs in reproducible cases */
182 	if (strcmp(curthread->td_comm, "sshd") == 0) {
183 		kprintf("%d %p refs=%d ar=%d file: %s/%d\n",
184 			(curthread->td_proc ? curthread->td_proc->p_pid : -1),
185 			obj, obj->ref_count, addrem, file, line);
186 	}
187 #endif
188 }
189 
190 #endif
191 
192 /*
193  * Misc low level routines
194  */
195 static void
196 vm_object_lock_init(vm_object_t obj)
197 {
198 #if defined(DEBUG_LOCKS)
199 	int i;
200 
201 	obj->debug_index = 0;
202 	for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) {
203 		obj->debug_hold_thrs[i][0] = 0;
204 		obj->debug_hold_file[i] = NULL;
205 		obj->debug_hold_line[i] = 0;
206 	}
207 #endif
208 }
209 
210 void
211 vm_object_lock_swap(void)
212 {
213 	lwkt_token_swap();
214 }
215 
216 void
217 vm_object_lock(vm_object_t obj)
218 {
219 	lwkt_gettoken(&obj->token);
220 }
221 
222 /*
223  * Returns TRUE on success
224  */
225 static int
226 vm_object_lock_try(vm_object_t obj)
227 {
228 	return(lwkt_trytoken(&obj->token));
229 }
230 
231 void
232 vm_object_lock_shared(vm_object_t obj)
233 {
234 	lwkt_gettoken_shared(&obj->token);
235 }
236 
237 void
238 vm_object_unlock(vm_object_t obj)
239 {
240 	lwkt_reltoken(&obj->token);
241 }
242 
243 void
244 vm_object_upgrade(vm_object_t obj)
245 {
246 	lwkt_reltoken(&obj->token);
247 	lwkt_gettoken(&obj->token);
248 }
249 
250 void
251 vm_object_downgrade(vm_object_t obj)
252 {
253 	lwkt_reltoken(&obj->token);
254 	lwkt_gettoken_shared(&obj->token);
255 }
256 
257 static __inline void
258 vm_object_assert_held(vm_object_t obj)
259 {
260 	ASSERT_LWKT_TOKEN_HELD(&obj->token);
261 }
262 
263 int
264 vm_quickcolor(void)
265 {
266 	globaldata_t gd = mycpu;
267 	int pg_color;
268 
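	/*
	 * Derive a page color hint from the current thread pointer plus a
	 * per-cpu counter advanced by PQ_PRIME2, presumably so objects
	 * allocated by different threads/cpus start at different colors.
	 */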
269 	pg_color = (int)(intptr_t)gd->gd_curthread >> 10;
270 	pg_color += gd->gd_quick_color;
271 	gd->gd_quick_color += PQ_PRIME2;
272 
273 	return pg_color;
274 }
275 
276 void
277 VMOBJDEBUG(vm_object_hold)(vm_object_t obj VMOBJDBARGS)
278 {
279 	KKASSERT(obj != NULL);
280 
281 	/*
282 	 * Object must be held (object allocation is stable due to the caller's
283 	 * context, typically already holding the token on a parent object)
284 	 * prior to potentially blocking on the lock, otherwise the object
285 	 * can get ripped away from us.
286 	 */
287 	refcount_acquire(&obj->hold_count);
288 	vm_object_lock(obj);
289 
290 #if defined(DEBUG_LOCKS)
291 	debugvm_object_add(obj, file, line, 1);
292 #endif
293 }
294 
295 int
296 VMOBJDEBUG(vm_object_hold_try)(vm_object_t obj VMOBJDBARGS)
297 {
298 	KKASSERT(obj != NULL);
299 
300 	/*
301 	 * Object must be held (object allocation is stable due to the caller's
302 	 * context, typically already holding the token on a parent object)
303 	 * prior to potentially blocking on the lock, otherwise the object
304 	 * can get ripped away from us.
305 	 */
306 	refcount_acquire(&obj->hold_count);
307 	if (vm_object_lock_try(obj) == 0) {
308 		if (refcount_release(&obj->hold_count)) {
309 			if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD))
310 				kfree_obj(obj, M_VM_OBJECT);
311 		}
312 		return(0);
313 	}
314 
315 #if defined(DEBUG_LOCKS)
316 	debugvm_object_add(obj, file, line, 1);
317 #endif
318 	return(1);
319 }
320 
321 void
322 VMOBJDEBUG(vm_object_hold_shared)(vm_object_t obj VMOBJDBARGS)
323 {
324 	KKASSERT(obj != NULL);
325 
326 	/*
327 	 * Object must be held (object allocation is stable due to the caller's
328 	 * context, typically already holding the token on a parent object)
329 	 * prior to potentially blocking on the lock, otherwise the object
330 	 * can get ripped away from us.
331 	 */
332 	refcount_acquire(&obj->hold_count);
333 	vm_object_lock_shared(obj);
334 
335 #if defined(DEBUG_LOCKS)
336 	debugvm_object_add(obj, file, line, 1);
337 #endif
338 }
339 
340 /*
341  * Drop the token and hold_count on the object.
342  *
343  * WARNING! Token might be shared.
344  */
345 void
346 VMOBJDEBUG(vm_object_drop)(vm_object_t obj VMOBJDBARGS)
347 {
348 	if (obj == NULL)
349 		return;
350 
351 	/*
352 	 * No new holders should be possible once we drop hold_count 1->0 as
353 	 * there is no longer any way to reference the object.
354 	 */
355 	KKASSERT(obj->hold_count > 0);
356 	if (refcount_release(&obj->hold_count)) {
357 #if defined(DEBUG_LOCKS)
358 		debugvm_object_add(obj, file, line, -1);
359 #endif
360 
361 		if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) {
362 			vm_object_unlock(obj);
363 			kfree_obj(obj, M_VM_OBJECT);
364 		} else {
365 			vm_object_unlock(obj);
366 		}
367 	} else {
368 #if defined(DEBUG_LOCKS)
369 		debugvm_object_add(obj, file, line, -1);
370 #endif
371 		vm_object_unlock(obj);
372 	}
373 }
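
/*
 * Illustrative caller pattern (sketch only, mirroring uses further below
 * such as vm_object_page_clean() and vm_object_madvise()):
 *
 *	vm_object_hold(object);
 *	... examine or modify the object ...
 *	vm_object_drop(object);
 *
 * vm_object_hold() bumps hold_count and acquires the object token;
 * vm_object_drop() releases the token and drops hold_count, freeing the
 * object if it was both dead and this was the last hold.
 */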
374 
375 /*
376  * Initialize a freshly allocated object, returning a held object.
377  *
378  * Used only by vm_object_allocate(), zinitna() and vm_object_init().
379  *
380  * No requirements.
381  */
382 void
383 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object,
384 		    const char *ident)
385 {
386 	struct vm_object_hash *hash;
387 
388 	RB_INIT(&object->rb_memq);
389 	lwkt_token_init(&object->token, ident);
390 
391 	TAILQ_INIT(&object->backing_list);
392 	lockinit(&object->backing_lk, "baclk", 0, 0);
393 
394 	object->type = type;
395 	object->size = size;
396 	object->ref_count = 1;
397 	object->memattr = VM_MEMATTR_DEFAULT;
398 	object->hold_count = 0;
399 	object->flags = 0;
400 	if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
401 		vm_object_set_flag(object, OBJ_ONEMAPPING);
402 	object->paging_in_progress = 0;
403 	object->resident_page_count = 0;
404 	/* cpu localization twist */
405 	object->pg_color = vm_quickcolor();
406 	object->handle = NULL;
407 
408 	atomic_add_int(&object->generation, 1);
409 	object->swblock_count = 0;
410 	RB_INIT(&object->swblock_root);
411 	vm_object_lock_init(object);
412 	pmap_object_init(object);
413 
414 	vm_object_hold(object);
415 
416 	hash = vmobj_hash(object);
417 	lwkt_gettoken(&hash->token);
418 	TAILQ_INSERT_TAIL(&hash->list, object, object_entry);
419 	lwkt_reltoken(&hash->token);
420 }
421 
422 /*
423  * Initialize a VM object.
424  */
425 void
426 vm_object_init(vm_object_t object, vm_pindex_t size)
427 {
428 	_vm_object_allocate(OBJT_DEFAULT, size, object, "vmobj");
429 	vm_object_drop(object);
430 }
431 
432 /*
433  * Initialize the VM objects module.
434  *
435  * Called from the low level boot code only.  Note that this occurs before
436  * kmalloc is initialized so we cannot allocate any VM objects.
437  */
438 void
439 vm_object_init1(void)
440 {
441 	int i;
442 
443 	for (i = 0; i < VMOBJ_HSIZE; ++i) {
444 		TAILQ_INIT(&vm_object_hash[i].list);
445 		lwkt_token_init(&vm_object_hash[i].token, "vmobjlst");
446 	}
447 
448 	_vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd),
449 			    &kernel_object, "kobj");
450 	vm_object_drop(&kernel_object);
451 }
452 
453 void
454 vm_object_init2(void)
455 {
456 	kmalloc_obj_set_unlimited(M_VM_OBJECT);
457 }
458 
459 /*
460  * Allocate and return a new object of the specified type and size.
461  *
462  * No requirements.
463  */
464 vm_object_t
465 vm_object_allocate(objtype_t type, vm_pindex_t size)
466 {
467 	vm_object_t obj;
468 
469 	obj = kmalloc_obj(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
470 	_vm_object_allocate(type, size, obj, "vmobj");
471 	vm_object_drop(obj);
472 
473 	return (obj);
474 }
475 
476 /*
477  * This version returns a held object, allowing further atomic initialization
478  * of the object.
479  */
480 vm_object_t
481 vm_object_allocate_hold(objtype_t type, vm_pindex_t size)
482 {
483 	vm_object_t obj;
484 
485 	obj = kmalloc_obj(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
486 	_vm_object_allocate(type, size, obj, "vmobj");
487 
488 	return (obj);
489 }
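
/*
 * Sketch of the intended use of the held variant: complete initialization
 * of the object before any other thread can gain access, then drop it.
 *
 *	obj = vm_object_allocate_hold(OBJT_DEFAULT, size);
 *	... additional atomic initialization ...
 *	vm_object_drop(obj);
 */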
490 
491 /*
492  * Add an additional reference to a vm_object.  The object must already be
493  * held.  The original non-lock version is no longer supported.  The object
494  * must NOT be chain locked by anyone at the time the reference is added.
495  *
496  * The object must be held, but may be held shared if desired (hence why
497  * we use an atomic op).
498  */
499 void
500 VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS)
501 {
502 	KKASSERT(object != NULL);
503 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
504 	atomic_add_int(&object->ref_count, 1);
505 	if (object->type == OBJT_VNODE) {
506 		vref(object->handle);
507 		/* XXX what if the vnode is being destroyed? */
508 	}
509 #if defined(DEBUG_LOCKS)
510 	debugvm_object_add(object, file, line, 1);
511 #endif
512 }
513 
514 /*
515  * This version is only allowed in situations where the caller
516  * already knows that the object is deterministically referenced
517  * (usually because it's taken from a ref'd vnode, or during a map_entry
518  * replication).
519  */
520 void
521 VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS)
522 {
523 	KKASSERT(object->type == OBJT_VNODE || object->ref_count > 0);
524 	atomic_add_int(&object->ref_count, 1);
525 	if (object->type == OBJT_VNODE)
526 		vref(object->handle);
527 #if defined(DEBUG_LOCKS)
528 	debugvm_object_add(object, file, line, 1);
529 #endif
530 }
531 
532 /*
533  * Dereference an object and its underlying vnode.  The object may be
534  * held shared.  On return the object will remain held.
535  *
536  * This function may return a vnode in *vpp which the caller must release
537  * after the caller drops its own lock.  If vpp is NULL, we assume that
538  * the caller was holding an exclusive lock on the object and we vrele()
539  * the vp ourselves.
540  */
541 static void
542 VMOBJDEBUG(vm_object_vndeallocate)(vm_object_t object, struct vnode **vpp
543 				   VMOBJDBARGS)
544 {
545 	struct vnode *vp = (struct vnode *) object->handle;
546 	int count;
547 
548 	KASSERT(object->type == OBJT_VNODE,
549 	    ("vm_object_vndeallocate: not a vnode object"));
550 	KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
551 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
552 #ifdef INVARIANTS
553 	if (object->ref_count == 0) {
554 		vprint("vm_object_vndeallocate", vp);
555 		panic("vm_object_vndeallocate: bad object reference count");
556 	}
557 #endif
558 	count = object->ref_count;
559 	cpu_ccfence();
560 	for (;;) {
561 		if (count == 1) {
562 			vm_object_upgrade(object);
563 			if (atomic_fcmpset_int(&object->ref_count, &count, 0)) {
564 				vclrflags(vp, VTEXT);
565 				break;
566 			}
567 		} else {
568 			if (atomic_fcmpset_int(&object->ref_count,
569 					       &count, count - 1)) {
570 				break;
571 			}
572 		}
573 		cpu_pause();
574 		/* retry */
575 	}
576 #if defined(DEBUG_LOCKS)
577 	debugvm_object_add(object, file, line, -1);
578 #endif
579 
580 	/*
581 	 * vrele or return the vp to vrele.  We can only safely vrele(vp)
582 	 * if the object was locked exclusively.  But there are two races
583 	 * here.
584 	 *
585 	 * We had to upgrade the object above to safely clear VTEXT
586 	 * but the alternative path where the shared lock is retained
587 	 * can STILL race to 0 in other paths and cause our own vrele()
588 	 * to terminate the vnode.  We can't allow that if the VM object
589 	 * is still locked shared.
590 	 */
591 	if (vpp)
592 		*vpp = vp;
593 	else
594 		vrele(vp);
595 }
596 
597 /*
598  * Release a reference to the specified object, gained either through a
599  * vm_object_allocate or a vm_object_reference call.  When all references
600  * are gone, storage associated with this object may be relinquished.
601  *
602  * The caller does not have to hold the object locked but must have control
603  * over the reference in question in order to guarantee that the object
604  * does not get ripped out from under us.
605  *
606  * XXX Currently all deallocations require an exclusive lock.
607  */
608 void
609 VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS)
610 {
611 	struct vnode *vp;
612 	int count;
613 
614 	if (object == NULL)
615 		return;
616 
617 	count = object->ref_count;
618 	cpu_ccfence();
619 	for (;;) {
620 		/*
621 		 * If decrementing the count enters into special handling
622 		 * territory (0, 1, or 2) we have to do it the hard way.
623 		 * Fortunately though, objects with only a few refs like this
624 		 * are not likely to be heavily contended anyway.
625 		 *
626 		 * For vnode objects we only care about 1->0 transitions.
627 		 */
628 		if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) {
629 #if defined(DEBUG_LOCKS)
630 			debugvm_object_add(object, file, line, 0);
631 #endif
632 			vm_object_hold(object);
633 			vm_object_deallocate_locked(object);
634 			vm_object_drop(object);
635 			break;
636 		}
637 
638 		/*
639 		 * Try to decrement ref_count without acquiring a hold on
640 		 * the object.  This is particularly important for the exec*()
641 		 * and exit*() code paths because the program binary may
642 		 * have a great deal of sharing and an exclusive lock will
643 		 * crowbar performance in those circumstances.
644 		 */
645 		if (object->type == OBJT_VNODE) {
646 			vp = (struct vnode *)object->handle;
647 			if (atomic_fcmpset_int(&object->ref_count,
648 					       &count, count - 1)) {
649 #if defined(DEBUG_LOCKS)
650 				debugvm_object_add(object, file, line, -1);
651 #endif
652 
653 				vrele(vp);
654 				break;
655 			}
656 			/* retry */
657 		} else {
658 			if (atomic_fcmpset_int(&object->ref_count,
659 					       &count, count - 1)) {
660 #if defined(DEBUG_LOCKS)
661 				debugvm_object_add(object, file, line, -1);
662 #endif
663 				break;
664 			}
665 			/* retry */
666 		}
667 		cpu_pause();
668 		/* retry */
669 	}
670 }
671 
672 void
673 VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS)
674 {
675 	/*
676 	 * Degenerate case
677 	 */
678 	if (object == NULL)
679 		return;
680 
681 	/*
682 	 * vnode case, caller either locked the object exclusively
683 	 * or this is a recursion with must_drop != 0 and the vnode
684 	 * object will be locked shared.
685 	 *
686 	 * If locked shared we have to drop the object before we can
687 	 * call vrele() or risk a shared/exclusive livelock.
688 	 */
689 	if (object->type == OBJT_VNODE) {
690 		ASSERT_LWKT_TOKEN_HELD(&object->token);
691 		vm_object_vndeallocate(object, NULL);
692 		return;
693 	}
694 	ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token);
695 
696 	/*
697 	 * Normal case (object is locked exclusively)
698 	 */
699 	if (object->ref_count == 0) {
700 		panic("vm_object_deallocate: object deallocated "
701 		      "too many times: %d", object->type);
702 	}
703 	if (object->ref_count > 2) {
704 		atomic_add_int(&object->ref_count, -1);
705 #if defined(DEBUG_LOCKS)
706 		debugvm_object_add(object, file, line, -1);
707 #endif
708 		return;
709 	}
710 
711 	/*
712 	 * Drop the ref and handle termination on the 1->0 transition.
713 	 * We may have blocked above so we have to recheck.
714 	 */
715 	KKASSERT(object->ref_count != 0);
716 	if (object->ref_count >= 2) {
717 		atomic_add_int(&object->ref_count, -1);
718 #if defined(DEBUG_LOCKS)
719 		debugvm_object_add(object, file, line, -1);
720 #endif
721 		return;
722 	}
723 
724 	atomic_add_int(&object->ref_count, -1);
725 	if ((object->flags & OBJ_DEAD) == 0)
726 		vm_object_terminate(object);
727 }
728 
729 /*
730  * Destroy the specified object, freeing up related resources.
731  *
732  * The object must have zero references.
733  *
734  * The object must be held.  The caller is responsible for dropping the object
735  * after terminate returns.  Terminate does NOT drop the object.
736  */
737 static int vm_object_terminate_callback(vm_page_t p, void *data);
738 
739 void
740 vm_object_terminate(vm_object_t object)
741 {
742 	struct rb_vm_page_scan_info info;
743 	struct vm_object_hash *hash;
744 
745 	/*
746 	 * Make sure no one uses us.  Once we set OBJ_DEAD we should be
747 	 * able to safely block.
748 	 */
749 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
750 	KKASSERT((object->flags & OBJ_DEAD) == 0);
751 	vm_object_set_flag(object, OBJ_DEAD);
752 
753 	/*
754 	 * Wait for the pageout daemon to be done with the object
755 	 */
756 	vm_object_pip_wait(object, "objtrm1");
757 
758 	KASSERT(!object->paging_in_progress,
759 		("vm_object_terminate: pageout in progress"));
760 
761 	/*
762 	 * Clean and free the pages, as appropriate. All references to the
763 	 * object are gone, so we don't need to lock it.
764 	 */
765 	if (object->type == OBJT_VNODE) {
766 		struct vnode *vp;
767 
768 		/*
769 		 * Clean pages and flush buffers.
770 		 *
771 		 * NOTE!  TMPFS buffer flushes do not typically flush the
772 		 *	  actual page to swap as this would be highly
773 		 *	  inefficient, and normal filesystems usually wrap
774 		 *	  page flushes with buffer cache buffers.
775 		 *
776 		 *	  To deal with this we have to call vinvalbuf() both
777 		 *	  before and after the vm_object_page_clean().
778 		 */
779 		vp = (struct vnode *) object->handle;
780 		vinvalbuf(vp, V_SAVE, 0, 0);
781 		vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
782 		vinvalbuf(vp, V_SAVE, 0, 0);
783 	}
784 
785 	/*
786 	 * Wait for any I/O to complete, after which there had better not
787 	 * be any references left on the object.
788 	 */
789 	vm_object_pip_wait(object, "objtrm2");
790 
791 	if (object->ref_count != 0) {
792 		panic("vm_object_terminate: object with references, "
793 		      "ref_count=%d", object->ref_count);
794 	}
795 
796 	/*
797 	 * Cleanup any shared pmaps associated with this object.
798 	 */
799 	pmap_object_free(object);
800 
801 	/*
802 	 * Now free any remaining pages. For internal objects, this also
803 	 * removes them from paging queues. Don't free wired pages, just
804 	 * remove them from the object.
805 	 */
806 	info.count = 0;
807 	info.object = object;
808 	do {
809 		info.error = 0;
810 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
811 					vm_object_terminate_callback, &info);
812 	} while (info.error);
813 
814 	/*
815 	 * Let the pager know object is dead.
816 	 */
817 	vm_pager_deallocate(object);
818 
819 	/*
820 	 * Clean out any remaining pages, looping until rb_memq is empty.
821 	 * The hash token interlocks any race conditions that might pick
822 	 * the object up from the object hash list after we have cleared
823 	 * rb_memq.
824 	 */
825 	for (;;) {
826 		if (RB_ROOT(&object->rb_memq) == NULL)
827 			break;
828 		kprintf("vm_object_terminate: Warning, object %p "
829 			"still has %ld pages\n",
830 			object, object->resident_page_count);
831 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
832 					vm_object_terminate_callback, &info);
833 	}
834 
835 	/*
836 	 * There had better not be any pages left
837 	 */
838 	KKASSERT(object->resident_page_count == 0);
839 
840 	/*
841 	 * Remove the object from the global object list.
842 	 */
843 	hash = vmobj_hash(object);
844 	lwkt_gettoken(&hash->token);
845 	TAILQ_REMOVE(&hash->list, object, object_entry);
846 	lwkt_reltoken(&hash->token);
847 
848 	if (object->ref_count != 0) {
849 		panic("vm_object_terminate2: object with references, "
850 		      "ref_count=%d", object->ref_count);
851 	}
852 
853 	/*
854 	 * NOTE: The object hold_count is at least 1, so we cannot kfree()
855 	 *	 the object here.  See vm_object_drop().
856 	 */
857 }
858 
859 /*
860  * The caller must hold the object.
861  *
862  * NOTE: It is possible for vm_page's to remain flagged PG_MAPPED
863  *	 or PG_MAPPED|PG_WRITEABLE, even after pmap_mapped_sync()
864  *	 is called, due to normal pmap operations.  This is because only
865  *	 global pmap operations on the vm_page can clear the bits and not
866  *	 just local operations on individual pmaps.
867  *
868  *	 Most interactions that necessitate the clearing of these bits
869  *	 proactively call vm_page_protect(), and we must do so here as well.
870  */
871 static int
872 vm_object_terminate_callback(vm_page_t p, void *data)
873 {
874 	struct rb_vm_page_scan_info *info = data;
875 	vm_object_t object;
876 
877 	object = p->object;
878 	KKASSERT(object == info->object);
879 	if (vm_page_busy_try(p, TRUE)) {
880 		vm_page_sleep_busy(p, TRUE, "vmotrm");
881 		info->error = 1;
882 		return 0;
883 	}
884 	if (object != p->object) {
885 		/* XXX remove once we determine it can't happen */
886 		kprintf("vm_object_terminate: Warning: Encountered "
887 			"busied page %p on queue %d\n", p, p->queue);
888 		vm_page_wakeup(p);
889 		info->error = 1;
890 	} else if (p->wire_count == 0) {
891 		/*
892 		 * NOTE: p->dirty and PG_NEED_COMMIT are ignored.
893 		 */
894 		if (pmap_mapped_sync(p) & (PG_MAPPED | PG_WRITEABLE))
895 			vm_page_protect(p, VM_PROT_NONE);
896 		vm_page_free(p);
897 		mycpu->gd_cnt.v_pfree++;
898 	} else {
899 		if (p->queue != PQ_NONE) {
900 			kprintf("vm_object_terminate: Warning: Encountered "
901 				"wired page %p on queue %d\n", p, p->queue);
902 			if (vm_object_debug > 0) {
903 				--vm_object_debug;
904 				print_backtrace(10);
905 			}
906 		}
907 		if (pmap_mapped_sync(p) & (PG_MAPPED | PG_WRITEABLE))
908 			vm_page_protect(p, VM_PROT_NONE);
909 		vm_page_remove(p);
910 		vm_page_wakeup(p);
911 	}
912 
913 	/*
914 	 * Must be at end to avoid SMP races, caller holds object token
915 	 */
916 	if ((++info->count & 63) == 0)
917 		lwkt_user_yield();
918 	return(0);
919 }
920 
921 /*
922  * Clean all dirty pages in the specified range of object.  Leaves page
923  * on whatever queue it is currently on.   If NOSYNC is set then do not
924  * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
925  * leaving the object dirty.
926  *
927  * When stuffing pages asynchronously, allow clustering.  XXX we need a
928  * synchronous clustering mode implementation.
929  *
930  * Odd semantics: if start == end, we clean everything.
931  *
932  * The object must be locked? XXX
933  */
934 static int vm_object_page_clean_pass1(struct vm_page *p, void *data);
935 static int vm_object_page_clean_pass2(struct vm_page *p, void *data);
936 
937 void
938 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
939 		     int flags)
940 {
941 	struct rb_vm_page_scan_info info;
942 	struct vnode *vp;
943 	int wholescan;
944 	int pagerflags;
945 	int generation;
946 
947 	vm_object_hold(object);
948 	if (object->type != OBJT_VNODE ||
949 	    (object->flags & OBJ_MIGHTBEDIRTY) == 0) {
950 		vm_object_drop(object);
951 		return;
952 	}
953 
954 	pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ?
955 			OBJPC_SYNC : OBJPC_CLUSTER_OK;
956 	pagerflags |= (flags & OBJPC_INVAL) ? OBJPC_INVAL : 0;
957 
958 	vp = object->handle;
959 
960 	/*
961 	 * Interlock other major object operations.  This allows us to
962 	 * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
963 	 */
964 	vm_object_set_flag(object, OBJ_CLEANING);
965 
966 	/*
967 	 * Handle 'entire object' case
968 	 */
969 	info.start_pindex = start;
970 	if (end == 0) {
971 		info.end_pindex = object->size - 1;
972 	} else {
973 		info.end_pindex = end - 1;
974 	}
975 	wholescan = (start == 0 && info.end_pindex == object->size - 1);
976 	info.limit = flags;
977 	info.pagerflags = pagerflags;
978 	info.object = object;
979 
980 	/*
981 	 * If cleaning the entire object do a pass to mark the pages read-only.
982 	 * If everything worked out ok, clear OBJ_WRITEABLE and
983 	 * OBJ_MIGHTBEDIRTY.
984 	 */
985 	if (wholescan) {
986 		info.error = 0;
987 		info.count = 0;
988 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
989 					vm_object_page_clean_pass1, &info);
990 		if (info.error == 0) {
991 			vm_object_clear_flag(object,
992 					     OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
993 			if (object->type == OBJT_VNODE &&
994 			    (vp = (struct vnode *)object->handle) != NULL) {
995 				/*
996 				 * Use new-style interface to clear VISDIRTY
997 				 * because the vnode is not necessarily removed
998 				 * from the syncer list(s) as often as it was
999 				 * under the old interface, which can leave
1000 				 * the vnode on the syncer list after reclaim.
1001 				 */
1002 				vclrobjdirty(vp);
1003 			}
1004 		}
1005 	}
1006 
1007 	/*
1008 	 * Do a pass to clean all the dirty pages we find.
1009 	 */
1010 	do {
1011 		info.error = 0;
1012 		info.count = 0;
1013 		generation = object->generation;
1014 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1015 					vm_object_page_clean_pass2, &info);
1016 	} while (info.error || generation != object->generation);
1017 
1018 	vm_object_clear_flag(object, OBJ_CLEANING);
1019 	vm_object_drop(object);
1020 }
1021 
1022 /*
1023  * The caller must hold the object.
1024  */
1025 static
1026 int
1027 vm_object_page_clean_pass1(struct vm_page *p, void *data)
1028 {
1029 	struct rb_vm_page_scan_info *info = data;
1030 
1031 	KKASSERT(p->object == info->object);
1032 
1033 	vm_page_flag_set(p, PG_CLEANCHK);
1034 	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1035 		info->error = 1;
1036 	} else if (vm_page_busy_try(p, FALSE)) {
1037 		info->error = 1;
1038 	} else {
1039 		KKASSERT(p->object == info->object);
1040 		vm_page_protect(p, VM_PROT_READ);
1041 		vm_page_wakeup(p);
1042 	}
1043 
1044 	/*
1045 	 * Must be at end to avoid SMP races, caller holds object token
1046 	 */
1047 	if ((++info->count & 63) == 0)
1048 		lwkt_user_yield();
1049 	return(0);
1050 }
1051 
1052 /*
1053  * The caller must hold the object
1054  */
1055 static
1056 int
1057 vm_object_page_clean_pass2(struct vm_page *p, void *data)
1058 {
1059 	struct rb_vm_page_scan_info *info = data;
1060 	int generation;
1061 
1062 	KKASSERT(p->object == info->object);
1063 
1064 	/*
1065 	 * Do not mess with pages that were inserted after we started
1066 	 * the cleaning pass.
1067 	 */
1068 	if ((p->flags & PG_CLEANCHK) == 0)
1069 		goto done;
1070 
1071 	generation = info->object->generation;
1072 
1073 	if (vm_page_busy_try(p, TRUE)) {
1074 		vm_page_sleep_busy(p, TRUE, "vpcwai");
1075 		info->error = 1;
1076 		goto done;
1077 	}
1078 
1079 	KKASSERT(p->object == info->object &&
1080 		 info->object->generation == generation);
1081 
1082 	/*
1083 	 * Before wasting time traversing the pmaps, check for trivial
1084 	 * cases where the page cannot be dirty.
1085 	 */
1086 	if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) {
1087 		KKASSERT((p->dirty & p->valid) == 0 &&
1088 			 (p->flags & PG_NEED_COMMIT) == 0);
1089 		vm_page_wakeup(p);
1090 		goto done;
1091 	}
1092 
1093 	/*
1094 	 * Check whether the page is dirty or not.  The page has been set
1095 	 * to be read-only so the check will not race a user dirtying the
1096 	 * page.
1097 	 */
1098 	vm_page_test_dirty(p);
1099 	if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) {
1100 		vm_page_flag_clear(p, PG_CLEANCHK);
1101 		vm_page_wakeup(p);
1102 		goto done;
1103 	}
1104 
1105 	/*
1106 	 * If we have been asked to skip nosync pages and this is a
1107 	 * nosync page, skip it.  Note that the object flags were
1108 	 * not cleared in this case (because pass1 will have returned an
1109 	 * error), so we do not have to set them.
1110 	 */
1111 	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1112 		vm_page_flag_clear(p, PG_CLEANCHK);
1113 		vm_page_wakeup(p);
1114 		goto done;
1115 	}
1116 
1117 	/*
1118 	 * Flush as many pages as we can.  PG_CLEANCHK will be cleared on
1119 	 * the pages that get successfully flushed.  Set info->error if
1120 	 * we raced an object modification.
1121 	 */
1122 	vm_object_page_collect_flush(info->object, p, info->pagerflags);
1123 	/* vm_wait_nominal(); this can deadlock the system in syncer/pageout */
1124 
1125 	/*
1126 	 * Must be at end to avoid SMP races, caller holds object token
1127 	 */
1128 done:
1129 	if ((++info->count & 63) == 0)
1130 		lwkt_user_yield();
1131 	return(0);
1132 }
1133 
1134 /*
1135  * Collect the specified page and nearby pages and flush them out.
1136  * The passed page is busied by the caller and we are responsible for
1137  * its disposition.
1138  *
1139  * The caller must hold the object.
1140  */
1141 static void
1142 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
1143 {
1144 	int error;
1145 	int is;
1146 	int ib;
1147 	int i;
1148 	int page_base;
1149 	vm_pindex_t pi;
1150 	vm_page_t ma[BLIST_MAX_ALLOC];
1151 
1152 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1153 
1154 	pi = p->pindex;
1155 	page_base = pi % BLIST_MAX_ALLOC;
1156 	ma[page_base] = p;
1157 	ib = page_base - 1;
1158 	is = page_base + 1;
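	/*
	 * ma[] represents a BLIST_MAX_ALLOC-aligned window of pindexes with
	 * the passed page at ma[page_base].  Scan backwards (ib) and
	 * forwards (is) from it, collecting contiguous flushable pages and
	 * stopping at the first page that cannot be included.
	 */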
1159 
1160 	while (ib >= 0) {
1161 		vm_page_t tp;
1162 
1163 		tp = vm_page_lookup_busy_try(object, pi - page_base + ib,
1164 					     TRUE, &error);
1165 		if (error)
1166 			break;
1167 		if (tp == NULL)
1168 			break;
1169 		if ((pagerflags & OBJPC_IGNORE_CLEANCHK) == 0 &&
1170 		    (tp->flags & PG_CLEANCHK) == 0) {
1171 			vm_page_wakeup(tp);
1172 			break;
1173 		}
1174 		if ((tp->queue - tp->pc) == PQ_CACHE) {
1175 			vm_page_flag_clear(tp, PG_CLEANCHK);
1176 			vm_page_wakeup(tp);
1177 			break;
1178 		}
1179 		vm_page_test_dirty(tp);
1180 		if ((tp->dirty & tp->valid) == 0 &&
1181 		    (tp->flags & PG_NEED_COMMIT) == 0) {
1182 			vm_page_flag_clear(tp, PG_CLEANCHK);
1183 			vm_page_wakeup(tp);
1184 			break;
1185 		}
1186 		ma[ib] = tp;
1187 		--ib;
1188 	}
1189 	++ib;	/* fixup */
1190 
1191 	while (is < BLIST_MAX_ALLOC &&
1192 	       pi - page_base + is < object->size) {
1193 		vm_page_t tp;
1194 
1195 		tp = vm_page_lookup_busy_try(object, pi - page_base + is,
1196 					     TRUE, &error);
1197 		if (error)
1198 			break;
1199 		if (tp == NULL)
1200 			break;
1201 		if ((pagerflags & OBJPC_IGNORE_CLEANCHK) == 0 &&
1202 		    (tp->flags & PG_CLEANCHK) == 0) {
1203 			vm_page_wakeup(tp);
1204 			break;
1205 		}
1206 		if ((tp->queue - tp->pc) == PQ_CACHE) {
1207 			vm_page_flag_clear(tp, PG_CLEANCHK);
1208 			vm_page_wakeup(tp);
1209 			break;
1210 		}
1211 		vm_page_test_dirty(tp);
1212 		if ((tp->dirty & tp->valid) == 0 &&
1213 		    (tp->flags & PG_NEED_COMMIT) == 0) {
1214 			vm_page_flag_clear(tp, PG_CLEANCHK);
1215 			vm_page_wakeup(tp);
1216 			break;
1217 		}
1218 		ma[is] = tp;
1219 		++is;
1220 	}
1221 
1222 	/*
1223 	 * All pages in the ma[] array are busied now
1224 	 */
1225 	for (i = ib; i < is; ++i) {
1226 		vm_page_flag_clear(ma[i], PG_CLEANCHK);
1227 		vm_page_hold(ma[i]);	/* XXX need this any more? */
1228 	}
1229 	vm_pageout_flush(&ma[ib], is - ib, pagerflags);
1230 	for (i = ib; i < is; ++i)	/* XXX need this any more? */
1231 		vm_page_unhold(ma[i]);
1232 }
1233 
1234 /*
1235  * Implements the madvise function at the object/page level.
1236  *
1237  * MADV_WILLNEED	(any object)
1238  *
1239  *	Activate the specified pages if they are resident.
1240  *
1241  * MADV_DONTNEED	(any object)
1242  *
1243  *	Deactivate the specified pages if they are resident.
1244  *
1245  * MADV_FREE	(OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only)
1246  *
1247  *	Deactivate and clean the specified pages if they are
1248  *	resident.  This permits the process to reuse the pages
1249  *	without faulting or the kernel to reclaim the pages
1250  *	without I/O.
1251  *
1252  * No requirements.
1253  */
1254 void
1255 vm_object_madvise(vm_object_t object, vm_pindex_t pindex,
1256 		  vm_pindex_t count, int advise)
1257 {
1258 	vm_pindex_t end;
1259 	vm_page_t m;
1260 	int error;
1261 
1262 	if (object == NULL)
1263 		return;
1264 
1265 	end = pindex + count;
1266 
1267 	vm_object_hold(object);
1268 
1269 	/*
1270 	 * Locate and adjust resident pages.  This only applies to the
1271 	 * primary object in the mapping.
1272 	 */
1273 	for (; pindex < end; pindex += 1) {
1274 relookup:
1275 		/*
1276 		 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
1277 		 * and those pages must be OBJ_ONEMAPPING.
1278 		 */
1279 		if (advise == MADV_FREE) {
1280 			if ((object->type != OBJT_DEFAULT &&
1281 			     object->type != OBJT_SWAP) ||
1282 			    (object->flags & OBJ_ONEMAPPING) == 0) {
1283 				continue;
1284 			}
1285 		}
1286 
1287 		m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
1288 
1289 		if (error) {
1290 			vm_page_sleep_busy(m, TRUE, "madvpo");
1291 			goto relookup;
1292 		}
1293 		if (m == NULL) {
1294 			/*
1295 			 * There may be swap even if there is no backing page
1296 			 */
1297 			if (advise == MADV_FREE && object->type == OBJT_SWAP)
1298 				swap_pager_freespace(object, pindex, 1);
1299 			continue;
1300 		}
1301 
1302 		/*
1303 		 * If the page is not in a normal active state, we skip it.
1304 		 * If the page is not managed there are no page queues to
1305 		 * mess with.  Things can break if we mess with pages in
1306 		 * any of the below states.
1307 		 */
1308 		if (m->wire_count ||
1309 		    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED |
1310 				 PG_NEED_COMMIT)) ||
1311 		    m->valid != VM_PAGE_BITS_ALL
1312 		) {
1313 			vm_page_wakeup(m);
1314 			continue;
1315 		}
1316 
1317 		/*
1318 		 * Theoretically once a page is known not to be busy, an
1319 		 * interrupt cannot come along and rip it out from under us.
1320 		 */
1321 		if (advise == MADV_WILLNEED) {
1322 			vm_page_activate(m);
1323 		} else if (advise == MADV_DONTNEED) {
1324 			vm_page_dontneed(m);
1325 		} else if (advise == MADV_FREE) {
1326 			/*
1327 			 * Mark the page clean.  This will allow the page
1328 			 * to be freed up by the system.  However, such pages
1329 			 * are often reused quickly by malloc()/free()
1330 			 * so we do not do anything that would cause
1331 			 * a page fault if we can help it.
1332 			 *
1333 			 * Specifically, we do not try to actually free
1334 			 * the page now nor do we try to put it in the
1335 			 * cache (which would cause a page fault on reuse).
1336 			 *
1337 			 * But we do make the page as freeable as we
1338 			 * can without actually taking the step of unmapping
1339 			 * it.
1340 			 */
1341 			pmap_clear_modify(m);
1342 			m->dirty = 0;
1343 			m->act_count = 0;
1344 			vm_page_dontneed(m);
1345 			if (object->type == OBJT_SWAP)
1346 				swap_pager_freespace(object, pindex, 1);
1347 		}
1348 		vm_page_wakeup(m);
1349 	}
1350 	vm_object_drop(object);
1351 }
1352 
1353 /*
1354  * Removes all physical pages in the specified object range from the
1355  * object's list of pages.
1356  *
1357  * No requirements.
1358  */
1359 static int vm_object_page_remove_callback(vm_page_t p, void *data);
1360 
1361 void
1362 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
1363 		      boolean_t clean_only)
1364 {
1365 	struct rb_vm_page_scan_info info;
1366 	int all;
1367 
1368 	/*
1369 	 * Degenerate cases and assertions.
1370 	 *
1371 	 * NOTE: Don't shortcut on resident_page_count for MGTDEVICE objects.
1372 	 *	 These objects do not have to have their pages entered into
1373 	 *	 them and are handled via their vm_map_backing lists.
1374 	 */
1375 	vm_object_hold(object);
1376 	if (object == NULL ||
1377 	    (object->type != OBJT_MGTDEVICE &&
1378 	     object->resident_page_count == 0 && object->swblock_count == 0)) {
1379 		vm_object_drop(object);
1380 		return;
1381 	}
1382 	KASSERT(object->type != OBJT_PHYS,
1383 		("attempt to remove pages from a physical object"));
1384 
1385 	/*
1386 	 * Indicate that paging is occurring on the object
1387 	 */
1388 	vm_object_pip_add(object, 1);
1389 
1390 	/*
1391 	 * Figure out the actual removal range and whether we are removing
1392 	 * the entire contents of the object or not.  If removing the entire
1393 	 * contents, be sure to get all pages, even those that might be
1394 	 * beyond the end of the object.
1395 	 *
1396 	 * NOTE: end is non-inclusive, but info.end_pindex is inclusive.
1397 	 */
1398 	info.object = object;
1399 	info.start_pindex = start;
1400 	if (end == 0 || end == (vm_pindex_t)-1) {
1401 		info.end_pindex = (vm_pindex_t)-1;
1402 		end = object->size;
1403 	} else {
1404 		info.end_pindex = end - 1;
1405 	}
1406 	info.limit = clean_only;
1407 	info.count = 0;
1408 	all = (start == 0 && info.end_pindex >= object->size - 1);
1409 
1410 	/*
1411 	 * Efficiently remove pages from the pmap via a backing scan.
1412 	 *
1413 	 * NOTE: This is the only way pages can be removed and unwired
1414 	 *	 from OBJT_MGTDEVICE devices which typically do not enter
1415 	 *	 their pages into the vm_object's RB tree.  And possibly
1416 	 *	 other OBJT_* types in the future.
1417 	 */
1418 	{
1419 		vm_map_backing_t ba;
1420 		vm_pindex_t sba, eba;
1421 		vm_offset_t sva, eva;
1422 
1423 		lockmgr(&object->backing_lk, LK_EXCLUSIVE);
1424 		TAILQ_FOREACH(ba, &object->backing_list, entry) {
1425 			/*
1426 			 * object offset range within the ba, intersected
1427 			 * with the page range specified for the object
1428 			 */
1429 			sba = OFF_TO_IDX(ba->offset);
1430 			eba = sba + OFF_TO_IDX(ba->end - ba->start);
1431 			if (sba < start)
1432 				sba = start;
1433 			if (eba > end)
1434 				eba = end;
1435 
1436 			/*
1437 			 * If the intersection is valid, remove the related
1438 			 * pages.
1439 			 *
1440 			 * NOTE! This may also remove other incidental pages
1441 			 *	 in the pmap, as the backing area may be
1442 			 *	 overloaded.
1443 			 *
1444 			 * NOTE! pages for MGTDEVICE objects are only removed
1445 			 *	 here, they aren't entered into rb_memq, so
1446 			 *	 we must use pmap_remove() instead of
1447 			 *	 the non-TLB-invalidating pmap_remove_pages().
1448 			 */
1449 			if (sba < eba) {
1450 				sva = ba->start + IDX_TO_OFF(sba) - ba->offset;
1451 				eva = sva + IDX_TO_OFF(eba - sba);
1452 #if 0
1453 				kprintf("VM_OBJECT_PAGE_REMOVE "
1454 					"%p[%016jx] %016jx-%016jx\n",
1455 					ba->pmap, ba->start, sva, eva);
1456 #endif
1457 				pmap_remove(ba->pmap, sva, eva);
1458 			}
1459 		}
1460 		lockmgr(&object->backing_lk, LK_RELEASE);
1461 	}
1462 
1463 	/*
1464 	 * Remove and free pages entered onto the object list.  Note that
1465 	 * for OBJT_MGTDEVICE objects, there are typically no pages entered.
1466 	 *
1467 	 * Loop until we are sure we have gotten them all.
1468 	 */
1469 	do {
1470 		info.error = 0;
1471 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1472 					vm_object_page_remove_callback, &info);
1473 	} while (info.error);
1474 
1475 	/*
1476 	 * Remove any related swap if throwing away pages, or for
1477 	 * non-swap objects (the swap is a clean copy in that case).
1478 	 */
1479 	if (object->type != OBJT_SWAP || clean_only == FALSE) {
1480 		if (all)
1481 			swap_pager_freespace_all(object);
1482 		else
1483 			swap_pager_freespace(object, info.start_pindex,
1484 			     info.end_pindex - info.start_pindex + 1);
1485 	}
1486 
1487 	/*
1488 	 * Cleanup
1489 	 */
1490 	vm_object_pip_wakeup(object);
1491 	vm_object_drop(object);
1492 }
1493 
1494 /*
1495  * The caller must hold the object.
1496  *
1497  * NOTE: User yields are allowed when removing more than one page, but not
1498  *	 allowed if only removing one page (the path for single page removals
1499  *	 might hold a spinlock).
1500  */
1501 static int
1502 vm_object_page_remove_callback(vm_page_t p, void *data)
1503 {
1504 	struct rb_vm_page_scan_info *info = data;
1505 
1506 	if (info->object != p->object ||
1507 	    p->pindex < info->start_pindex ||
1508 	    p->pindex > info->end_pindex) {
1509 		kprintf("vm_object_page_remove_callbackA: obj/pg race %p/%p\n",
1510 			info->object, p);
1511 		return(0);
1512 	}
1513 	if (vm_page_busy_try(p, TRUE)) {
1514 		vm_page_sleep_busy(p, TRUE, "vmopar");
1515 		info->error = 1;
1516 		return(0);
1517 	}
1518 	if (info->object != p->object) {
1519 		/* this should never happen */
1520 		kprintf("vm_object_page_remove_callbackB: obj/pg race %p/%p\n",
1521 			info->object, p);
1522 		vm_page_wakeup(p);
1523 		return(0);
1524 	}
1525 
1526 	/*
1527 	 * Wired pages cannot be destroyed, but they can be invalidated
1528 	 * and we do so if clean_only (limit) is not set.
1529 	 *
1530 	 * WARNING!  The page may be wired due to being part of a buffer
1531 	 *	     cache buffer, and the buffer might be marked B_CACHE.
1532 	 *	     This is fine as part of a truncation but VFSs must be
1533 	 *	     sure to fix the buffer up when re-extending the file.
1534 	 *
1535 	 * NOTE!     PG_NEED_COMMIT is ignored.
1536 	 */
1537 	if (p->wire_count != 0) {
1538 		vm_page_protect(p, VM_PROT_NONE);
1539 		if (info->limit == 0)
1540 			p->valid = 0;
1541 		vm_page_wakeup(p);
1542 		goto done;
1543 	}
1544 
1545 	/*
1546 	 * limit is our clean_only flag.  If set and the page is dirty or
1547 	 * requires a commit, do not free it.  If set and the page is being
1548 	 * held by someone, do not free it.
1549 	 */
1550 	if (info->limit && p->valid) {
1551 		vm_page_test_dirty(p);
1552 		if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
1553 			vm_page_wakeup(p);
1554 			goto done;
1555 		}
1556 	}
1557 
1558 	/*
1559 	 * Destroy the page.  But we have to re-test whether it's dirty after
1560 	 * removing it from its pmaps.
1561 	 */
1562 	vm_page_protect(p, VM_PROT_NONE);
1563 	if (info->limit && p->valid) {
1564 		vm_page_test_dirty(p);
1565 		if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
1566 			vm_page_wakeup(p);
1567 			goto done;
1568 		}
1569 	}
1570 	vm_page_free(p);
1571 
1572 	/*
1573 	 * Must be at end to avoid SMP races, caller holds object token
1574 	 */
1575 done:
1576 	if ((++info->count & 63) == 0)
1577 		lwkt_user_yield();
1578 
1579 	return(0);
1580 }
1581 
1582 /*
1583  * Try to extend prev_object into an adjoining region of virtual
1584  * memory, return TRUE on success.
1585  *
1586  * The caller does not need to hold (prev_object) but must have a stable
1587  * pointer to it (typically by holding the vm_map locked).
1588  *
1589  * This function only works for anonymous memory objects which either
1590  * have (a) one reference or (b) we are extending the object's size.
1591  * Otherwise the related VM pages we want to use for the object might
1592  * be in use by another mapping.
1593  */
1594 boolean_t
1595 vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
1596 		   vm_size_t prev_size, vm_size_t next_size)
1597 {
1598 	vm_pindex_t next_pindex;
1599 
1600 	if (prev_object == NULL)
1601 		return (TRUE);
1602 
1603 	vm_object_hold(prev_object);
1604 
1605 	if (prev_object->type != OBJT_DEFAULT &&
1606 	    prev_object->type != OBJT_SWAP) {
1607 		vm_object_drop(prev_object);
1608 		return (FALSE);
1609 	}
1610 
1611 #if 0
1612 	/* caller now checks this */
1613 	/*
1614 	 * Try to collapse the object first
1615 	 */
1616 	vm_object_collapse(prev_object, NULL);
1617 #endif
1618 
1619 #if 0
1620 	/* caller now checks this */
1621 	/*
1622 	 * We can't coalesce if we shadow another object (figuring out the
1623 	 * relationships becomes too complex).
1624 	 */
1625 	if (prev_object->backing_object != NULL) {
1626 		vm_object_chain_release(prev_object);
1627 		vm_object_drop(prev_object);
1628 		return (FALSE);
1629 	}
1630 #endif
1631 
1632 	prev_size >>= PAGE_SHIFT;
1633 	next_size >>= PAGE_SHIFT;
1634 	next_pindex = prev_pindex + prev_size;
1635 
1636 	/*
1637 	 * We can't if the object has more than one reference unless we
1638 	 * are extending it into newly minted space.
1639 	 */
1640 	if (prev_object->ref_count > 1 &&
1641 	    prev_object->size != next_pindex) {
1642 		vm_object_drop(prev_object);
1643 		return (FALSE);
1644 	}
1645 
1646 	/*
1647 	 * Remove any pages that may still be in the object from a previous
1648 	 * deallocation.
1649 	 */
1650 	if (next_pindex < prev_object->size) {
1651 		vm_object_page_remove(prev_object,
1652 				      next_pindex,
1653 				      next_pindex + next_size, FALSE);
1654 		if (prev_object->type == OBJT_SWAP)
1655 			swap_pager_freespace(prev_object,
1656 					     next_pindex, next_size);
1657 	}
1658 
1659 	/*
1660 	 * Extend the object if necessary.
1661 	 */
1662 	if (next_pindex + next_size > prev_object->size)
1663 		prev_object->size = next_pindex + next_size;
1664 	vm_object_drop(prev_object);
1665 
1666 	return (TRUE);
1667 }
1668 
1669 /*
1670  * Make the object writable and flag it as possibly being dirty.
1671  *
1672  * The object might not be held (or might be held but held shared),
1673  * the related vnode is probably not held either.  Object and vnode are
1674  * stable by virtue of the vm_page busied by the caller preventing
1675  * destruction.
1676  *
1677  * If the related mount is flagged MNTK_THR_SYNC we need to call
1678  * vsetobjdirty().  Filesystems using this option usually shortcut
1679  * synchronization by only scanning the syncer list.
1680  */
1681 void
1682 vm_object_set_writeable_dirty(vm_object_t object)
1683 {
1684 	struct vnode *vp;
1685 
1686 	/*vm_object_assert_held(object);*/
1687 	/*
1688 	 * Avoid contention in vm fault path by checking the state before
1689 	 * issuing an atomic op on it.
1690 	 */
1691 	if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) !=
1692 	    (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) {
1693 		vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
1694 	}
1695 	if (object->type == OBJT_VNODE &&
1696 	    (vp = (struct vnode *)object->handle) != NULL) {
1697 		if ((vp->v_flag & VOBJDIRTY) == 0) {
1698 			if (vp->v_mount &&
1699 			    (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) {
1700 				/*
1701 				 * New style THR_SYNC places vnodes on the
1702 				 * syncer list more deterministically.
1703 				 */
1704 				vsetobjdirty(vp);
1705 			} else {
1706 				/*
1707 				 * Old style scan would not necessarily place
1708 				 * a vnode on the syncer list when possibly
1709 				 * modified via mmap.
1710 				 */
1711 				vsetflags(vp, VOBJDIRTY);
1712 			}
1713 		}
1714 	}
1715 }
1716 
1717 #include "opt_ddb.h"
1718 #ifdef DDB
1719 #include <sys/cons.h>
1720 
1721 #include <ddb/ddb.h>
1722 
1723 static int	_vm_object_in_map (vm_map_t map, vm_object_t object,
1724 				       vm_map_entry_t entry);
1725 static int	vm_object_in_map (vm_object_t object);
1726 
1727 /*
1728  * The caller must hold the object.
1729  */
1730 static int
1731 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
1732 {
1733 	vm_map_backing_t ba;
1734 	vm_map_t tmpm;
1735 	vm_map_entry_t tmpe;
1736 	int entcount;
1737 
1738 	if (map == NULL)
1739 		return 0;
1740 	if (entry == NULL) {
1741 		tmpe = RB_MIN(vm_map_rb_tree, &map->rb_root);
1742 		entcount = map->nentries;
1743 		while (entcount-- && tmpe) {
1744 			if( _vm_object_in_map(map, object, tmpe)) {
1745 				return 1;
1746 			}
1747 			tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1748 		}
1749 		return (0);
1750 	}
1751 	switch(entry->maptype) {
1752 	case VM_MAPTYPE_SUBMAP:
1753 		tmpm = entry->ba.sub_map;
1754 		tmpe = RB_MIN(vm_map_rb_tree, &tmpm->rb_root);
1755 		entcount = tmpm->nentries;
1756 		while (entcount-- && tmpe) {
1757 			if( _vm_object_in_map(tmpm, object, tmpe)) {
1758 				return 1;
1759 			}
1760 			tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1761 		}
1762 		break;
1763 	case VM_MAPTYPE_NORMAL:
1764 		ba = &entry->ba;
1765 		while (ba) {
1766 			if (ba->object == object)
1767 				return TRUE;
1768 			ba = ba->backing_ba;
1769 		}
1770 		break;
1771 	default:
1772 		break;
1773 	}
1774 	return 0;
1775 }
1776 
1777 static int vm_object_in_map_callback(struct proc *p, void *data);
1778 
1779 struct vm_object_in_map_info {
1780 	vm_object_t object;
1781 	int rv;
1782 };
1783 
1784 /*
1785  * Debugging only
1786  */
1787 static int
1788 vm_object_in_map(vm_object_t object)
1789 {
1790 	struct vm_object_in_map_info info;
1791 
1792 	info.rv = 0;
1793 	info.object = object;
1794 
1795 	allproc_scan(vm_object_in_map_callback, &info, 0);
1796 	if (info.rv)
1797 		return 1;
1798 	if( _vm_object_in_map(&kernel_map, object, 0))
1799 		return 1;
1800 	if( _vm_object_in_map(&pager_map, object, 0))
1801 		return 1;
1802 	if( _vm_object_in_map(&buffer_map, object, 0))
1803 		return 1;
1804 	return 0;
1805 }
1806 
1807 /*
1808  * Debugging only
1809  */
1810 static int
1811 vm_object_in_map_callback(struct proc *p, void *data)
1812 {
1813 	struct vm_object_in_map_info *info = data;
1814 
1815 	if (p->p_vmspace) {
1816 		if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) {
1817 			info->rv = 1;
1818 			return -1;
1819 		}
1820 	}
1821 	return (0);
1822 }
1823 
1824 DB_SHOW_COMMAND(vmochk, vm_object_check)
1825 {
1826 	struct vm_object_hash *hash;
1827 	vm_object_t object;
1828 	int n;
1829 
1830 	/*
1831 	 * make sure that internal objs are in a map somewhere
1832 	 * and none have zero ref counts.
1833 	 */
1834 	for (n = 0; n < VMOBJ_HSIZE; ++n) {
1835 		hash = &vm_object_hash[n];
1836 		for (object = TAILQ_FIRST(&hash->list);
1837 				object != NULL;
1838 				object = TAILQ_NEXT(object, object_entry)) {
1839 			if (object->type == OBJT_MARKER)
1840 				continue;
1841 			if (object->handle != NULL ||
1842 			    (object->type != OBJT_DEFAULT &&
1843 			     object->type != OBJT_SWAP)) {
1844 				continue;
1845 			}
1846 			if (object->ref_count == 0) {
1847 				db_printf("vmochk: internal obj has "
1848 					  "zero ref count: %ld\n",
1849 					  (long)object->size);
1850 			}
1851 			if (vm_object_in_map(object))
1852 				continue;
1853 			db_printf("vmochk: internal obj is not in a map: "
1854 				  "ref: %d, size: %lu: 0x%lx\n",
1855 				  object->ref_count, (u_long)object->size,
1856 				  (u_long)object->size);
1857 		}
1858 	}
1859 }
1860 
1861 /*
1862  * Debugging only
1863  */
1864 DB_SHOW_COMMAND(object, vm_object_print_static)
1865 {
1866 	/* XXX convert args. */
1867 	vm_object_t object = (vm_object_t)addr;
1868 	boolean_t full = have_addr;
1869 
1870 	vm_page_t p;
1871 
1872 	/* XXX count is an (unused) arg.  Avoid shadowing it. */
1873 #define	count	was_count
1874 
1875 	int count;
1876 
1877 	if (object == NULL)
1878 		return;
1879 
1880 	db_iprintf(
1881 	    "Object %p: type=%d, size=0x%lx, res=%ld, ref=%d, flags=0x%x\n",
1882 	    object, (int)object->type, (u_long)object->size,
1883 	    object->resident_page_count, object->ref_count, object->flags);
1884 	/*
1885 	 * XXX no %qd in kernel.  Truncate object->backing_object_offset.
1886 	 */
1887 	db_iprintf("\n");
1888 
1889 	if (!full)
1890 		return;
1891 
1892 	db_indent += 2;
1893 	count = 0;
1894 	RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) {
1895 		if (count == 0)
1896 			db_iprintf("memory:=");
1897 		else if (count == 6) {
1898 			db_printf("\n");
1899 			db_iprintf(" ...");
1900 			count = 0;
1901 		} else
1902 			db_printf(",");
1903 		count++;
1904 
1905 		db_printf("(off=0x%lx,page=0x%lx)",
1906 		    (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
1907 	}
1908 	if (count != 0)
1909 		db_printf("\n");
1910 	db_indent -= 2;
1911 }
1912 
1913 /* XXX. */
1914 #undef count
1915 
1916 /*
1917  * XXX need this non-static entry for calling from vm_map_print.
1918  *
1919  * Debugging only
1920  */
1921 void
1922 vm_object_print(/* db_expr_t */ long addr,
1923 		boolean_t have_addr,
1924 		/* db_expr_t */ long count,
1925 		char *modif)
1926 {
1927 	vm_object_print_static(addr, have_addr, count, modif);
1928 }
1929 
1930 /*
1931  * Debugging only
1932  */
1933 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
1934 {
1935 	struct vm_object_hash *hash;
1936 	vm_object_t object;
1937 	int nl = 0;
1938 	int c;
1939 	int n;
1940 
1941 	for (n = 0; n < VMOBJ_HSIZE; ++n) {
1942 		hash = &vm_object_hash[n];
1943 		for (object = TAILQ_FIRST(&hash->list);
1944 				object != NULL;
1945 				object = TAILQ_NEXT(object, object_entry)) {
1946 			vm_pindex_t idx, fidx;
1947 			vm_pindex_t osize;
1948 			vm_paddr_t pa = -1, padiff;
1949 			int rcount;
1950 			vm_page_t m;
1951 
1952 			if (object->type == OBJT_MARKER)
1953 				continue;
1954 			db_printf("new object: %p\n", (void *)object);
1955 			if ( nl > 18) {
1956 				c = cngetc();
1957 				if (c != ' ')
1958 					return;
1959 				nl = 0;
1960 			}
1961 			nl++;
1962 			rcount = 0;
1963 			fidx = 0;
1964 			osize = object->size;
1965 			if (osize > 128)
1966 				osize = 128;
1967 			for (idx = 0; idx < osize; idx++) {
1968 				m = vm_page_lookup(object, idx);
1969 				if (m == NULL) {
1970 					if (rcount) {
1971 						db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
1972 							(long)fidx, rcount, (long)pa);
1973 						if ( nl > 18) {
1974 							c = cngetc();
1975 							if (c != ' ')
1976 								return;
1977 							nl = 0;
1978 						}
1979 						nl++;
1980 						rcount = 0;
1981 					}
1982 					continue;
1983 				}
1984 
1985 				if (rcount &&
1986 					(VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
1987 					++rcount;
1988 					continue;
1989 				}
1990 				if (rcount) {
1991 					padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m);
1992 					padiff >>= PAGE_SHIFT;
1993 					padiff &= PQ_L2_MASK;
1994 					if (padiff == 0) {
1995 						pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE;
1996 						++rcount;
1997 						continue;
1998 					}
1999 					db_printf(" index(%ld)run(%d)pa(0x%lx)",
2000 						(long)fidx, rcount, (long)pa);
2001 					db_printf("pd(%ld)\n", (long)padiff);
2002 					if ( nl > 18) {
2003 						c = cngetc();
2004 						if (c != ' ')
2005 							return;
2006 						nl = 0;
2007 					}
2008 					nl++;
2009 				}
2010 				fidx = idx;
2011 				pa = VM_PAGE_TO_PHYS(m);
2012 				rcount = 1;
2013 			}
2014 			if (rcount) {
2015 				db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
2016 					(long)fidx, rcount, (long)pa);
2017 				if ( nl > 18) {
2018 					c = cngetc();
2019 					if (c != ' ')
2020 						return;
2021 					nl = 0;
2022 				}
2023 				nl++;
2024 			}
2025 		}
2026 	}
2027 }
2028 #endif /* DDB */
2029