xref: /dragonfly/sys/vm/vm_object.c (revision 7d3e9a5b)
1 /*
2  * Copyright (c) 1991, 1993, 2013
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * The Mach Operating System project at Carnegie-Mellon University.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	from: @(#)vm_object.c	8.5 (Berkeley) 3/22/94
33  *
34  *
35  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
36  * All rights reserved.
37  *
38  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
39  *
40  * Permission to use, copy, modify and distribute this software and
41  * its documentation is hereby granted, provided that both the copyright
42  * notice and this permission notice appear in all copies of the
43  * software, derivative works or modified versions, and any portions
44  * thereof, and that both notices appear in supporting documentation.
45  *
46  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
49  *
50  * Carnegie Mellon requests users of this software to return to
51  *
52  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
53  *  School of Computer Science
54  *  Carnegie Mellon University
55  *  Pittsburgh PA 15213-3890
56  *
57  * any improvements or extensions that they make and grant Carnegie the
58  * rights to redistribute these changes.
59  *
60  * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $
61  */
62 
63 /*
64  *	Virtual memory object module.
65  */
66 
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/proc.h>		/* for curproc, pageproc */
70 #include <sys/thread.h>
71 #include <sys/vnode.h>
72 #include <sys/vmmeter.h>
73 #include <sys/mman.h>
74 #include <sys/mount.h>
75 #include <sys/kernel.h>
76 #include <sys/malloc.h>
77 #include <sys/sysctl.h>
78 #include <sys/refcount.h>
79 
80 #include <vm/vm.h>
81 #include <vm/vm_param.h>
82 #include <vm/pmap.h>
83 #include <vm/vm_map.h>
84 #include <vm/vm_object.h>
85 #include <vm/vm_page.h>
86 #include <vm/vm_pageout.h>
87 #include <vm/vm_pager.h>
88 #include <vm/swap_pager.h>
89 #include <vm/vm_kern.h>
90 #include <vm/vm_extern.h>
91 #include <vm/vm_zone.h>
92 
93 #include <vm/vm_page2.h>
94 
95 #include <machine/specialreg.h>
96 
97 #define EASY_SCAN_FACTOR	8
98 
99 static void	vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
100 					     int pagerflags);
101 static void	vm_object_lock_init(vm_object_t);
102 
103 /*
104  *	Virtual memory objects maintain the actual data
105  *	associated with allocated virtual memory.  A given
106  *	page of memory exists within exactly one object.
107  *
108  *	An object is only deallocated when all "references"
109  *	are given up.  Only one "reference" to a given
110  *	region of an object should be writeable.
111  *
112  *	Associated with each object is a list of all resident
113  *	memory pages belonging to that object; this list is
114  *	maintained by the "vm_page" module, and locked by the object's
115  *	lock.
116  *
117  *	Each object also records a "pager" routine which is
118  *	used to retrieve (and store) pages to the proper backing
119  *	storage.  In addition, objects may be backed by other
120  *	objects from which they were virtual-copied.
121  *
122  *	The only items within the object structure which are
123  *	modified after time of creation are:
124  *		reference count		locked by object's lock
125  *		pager routine		locked by object's lock
126  *
127  */
128 
129 static struct vm_object kernel_object_store;
130 struct vm_object *kernel_object = &kernel_object_store;
131 
132 struct vm_object_hash vm_object_hash[VMOBJ_HSIZE];
133 
134 static MALLOC_DEFINE_OBJ(M_VM_OBJECT, sizeof(struct vm_object),
135 		"vm_object", "vm_object structures");
136 
137 #define VMOBJ_HASH_PRIME1	66555444443333333ULL
138 #define VMOBJ_HASH_PRIME2	989042931893ULL
139 
140 int vm_object_debug;
141 SYSCTL_INT(_vm, OID_AUTO, object_debug, CTLFLAG_RW, &vm_object_debug, 0, "");
142 
143 static __inline
144 struct vm_object_hash *
145 vmobj_hash(vm_object_t obj)
146 {
147 	uintptr_t hash1;
148 	uintptr_t hash2;
149 
150 	hash1 = (uintptr_t)obj + ((uintptr_t)obj >> 18);
151 	hash1 %= VMOBJ_HASH_PRIME1;
152 	hash2 = ((uintptr_t)obj >> 8) + ((uintptr_t)obj >> 24);
153 	hash2 %= VMOBJ_HASH_PRIME2;
154 	return (&vm_object_hash[(hash1 ^ hash2) & VMOBJ_HMASK]);
155 }
156 
157 #if defined(DEBUG_LOCKS)
158 
159 #define vm_object_vndeallocate(obj, vpp)	\
160                 debugvm_object_vndeallocate(obj, vpp, __FILE__, __LINE__)
161 
162 /*
163  * Debug helper to track hold/drop/ref/deallocate calls.
164  */
165 static void
166 debugvm_object_add(vm_object_t obj, char *file, int line, int addrem)
167 {
168 	int i;
169 
170 	i = atomic_fetchadd_int(&obj->debug_index, 1);
171 	i = i & (VMOBJ_DEBUG_ARRAY_SIZE - 1);
172 	ksnprintf(obj->debug_hold_thrs[i],
173 		  sizeof(obj->debug_hold_thrs[i]),
174 		  "%c%d:(%d):%s",
175 		  (addrem == -1 ? '-' : (addrem == 1 ? '+' : '=')),
176 		  (curthread->td_proc ? curthread->td_proc->p_pid : -1),
177 		  obj->ref_count,
178 		  curthread->td_comm);
179 	obj->debug_hold_file[i] = file;
180 	obj->debug_hold_line[i] = line;
181 #if 0
182 	/* Uncomment for debugging obj refs/derefs in reproducable cases */
183 	if (strcmp(curthread->td_comm, "sshd") == 0) {
184 		kprintf("%d %p refs=%d ar=%d file: %s/%d\n",
185 			(curthread->td_proc ? curthread->td_proc->p_pid : -1),
186 			obj, obj->ref_count, addrem, file, line);
187 	}
188 #endif
189 }
190 
191 #endif
192 
193 /*
194  * Misc low level routines
195  */
196 static void
197 vm_object_lock_init(vm_object_t obj)
198 {
199 #if defined(DEBUG_LOCKS)
200 	int i;
201 
202 	obj->debug_index = 0;
203 	for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) {
204 		obj->debug_hold_thrs[i][0] = 0;
205 		obj->debug_hold_file[i] = NULL;
206 		obj->debug_hold_line[i] = 0;
207 	}
208 #endif
209 }
210 
211 void
212 vm_object_lock_swap(void)
213 {
214 	lwkt_token_swap();
215 }
216 
217 void
218 vm_object_lock(vm_object_t obj)
219 {
220 	lwkt_gettoken(&obj->token);
221 }
222 
223 /*
224  * Returns TRUE on sucesss
225  */
226 static int
227 vm_object_lock_try(vm_object_t obj)
228 {
229 	return(lwkt_trytoken(&obj->token));
230 }
231 
232 void
233 vm_object_lock_shared(vm_object_t obj)
234 {
235 	lwkt_gettoken_shared(&obj->token);
236 }
237 
238 void
239 vm_object_unlock(vm_object_t obj)
240 {
241 	lwkt_reltoken(&obj->token);
242 }
243 
244 void
245 vm_object_upgrade(vm_object_t obj)
246 {
247 	lwkt_reltoken(&obj->token);
248 	lwkt_gettoken(&obj->token);
249 }
250 
251 void
252 vm_object_downgrade(vm_object_t obj)
253 {
254 	lwkt_reltoken(&obj->token);
255 	lwkt_gettoken_shared(&obj->token);
256 }
257 
258 static __inline void
259 vm_object_assert_held(vm_object_t obj)
260 {
261 	ASSERT_LWKT_TOKEN_HELD(&obj->token);
262 }
263 
264 int
265 vm_quickcolor(void)
266 {
267 	globaldata_t gd = mycpu;
268 	int pg_color;
269 
270 	pg_color = (int)(intptr_t)gd->gd_curthread >> 10;
271 	pg_color += gd->gd_quick_color;
272 	gd->gd_quick_color += PQ_PRIME2;
273 
274 	return pg_color;
275 }
276 
277 void
278 VMOBJDEBUG(vm_object_hold)(vm_object_t obj VMOBJDBARGS)
279 {
280 	KKASSERT(obj != NULL);
281 
282 	/*
283 	 * Object must be held (object allocation is stable due to callers
284 	 * context, typically already holding the token on a parent object)
285 	 * prior to potentially blocking on the lock, otherwise the object
286 	 * can get ripped away from us.
287 	 */
288 	refcount_acquire(&obj->hold_count);
289 	vm_object_lock(obj);
290 
291 #if defined(DEBUG_LOCKS)
292 	debugvm_object_add(obj, file, line, 1);
293 #endif
294 }
295 
296 int
297 VMOBJDEBUG(vm_object_hold_try)(vm_object_t obj VMOBJDBARGS)
298 {
299 	KKASSERT(obj != NULL);
300 
301 	/*
302 	 * Object must be held (object allocation is stable due to callers
303 	 * context, typically already holding the token on a parent object)
304 	 * prior to potentially blocking on the lock, otherwise the object
305 	 * can get ripped away from us.
306 	 */
307 	refcount_acquire(&obj->hold_count);
308 	if (vm_object_lock_try(obj) == 0) {
309 		if (refcount_release(&obj->hold_count)) {
310 			if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD))
311 				kfree_obj(obj, M_VM_OBJECT);
312 		}
313 		return(0);
314 	}
315 
316 #if defined(DEBUG_LOCKS)
317 	debugvm_object_add(obj, file, line, 1);
318 #endif
319 	return(1);
320 }
321 
322 void
323 VMOBJDEBUG(vm_object_hold_shared)(vm_object_t obj VMOBJDBARGS)
324 {
325 	KKASSERT(obj != NULL);
326 
327 	/*
328 	 * Object must be held (object allocation is stable due to callers
329 	 * context, typically already holding the token on a parent object)
330 	 * prior to potentially blocking on the lock, otherwise the object
331 	 * can get ripped away from us.
332 	 */
333 	refcount_acquire(&obj->hold_count);
334 	vm_object_lock_shared(obj);
335 
336 #if defined(DEBUG_LOCKS)
337 	debugvm_object_add(obj, file, line, 1);
338 #endif
339 }
340 
341 /*
342  * Drop the token and hold_count on the object.
343  *
344  * WARNING! Token might be shared.
345  */
346 void
347 VMOBJDEBUG(vm_object_drop)(vm_object_t obj VMOBJDBARGS)
348 {
349 	if (obj == NULL)
350 		return;
351 
352 	/*
353 	 * No new holders should be possible once we drop hold_count 1->0 as
354 	 * there is no longer any way to reference the object.
355 	 */
356 	KKASSERT(obj->hold_count > 0);
357 	if (refcount_release(&obj->hold_count)) {
358 #if defined(DEBUG_LOCKS)
359 		debugvm_object_add(obj, file, line, -1);
360 #endif
361 
362 		if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) {
363 			vm_object_unlock(obj);
364 			kfree_obj(obj, M_VM_OBJECT);
365 		} else {
366 			vm_object_unlock(obj);
367 		}
368 	} else {
369 #if defined(DEBUG_LOCKS)
370 		debugvm_object_add(obj, file, line, -1);
371 #endif
372 		vm_object_unlock(obj);
373 	}
374 }
375 
376 /*
377  * Initialize a freshly allocated object, returning a held object.
378  *
379  * Used only by vm_object_allocate(), zinitna() and vm_object_init().
380  *
381  * No requirements.
382  */
383 void
384 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object,
385 		    const char *ident)
386 {
387 	struct vm_object_hash *hash;
388 
389 	RB_INIT(&object->rb_memq);
390 	lwkt_token_init(&object->token, ident);
391 
392 	TAILQ_INIT(&object->backing_list);
393 	lockinit(&object->backing_lk, "baclk", 0, 0);
394 
395 	object->type = type;
396 	object->size = size;
397 	object->ref_count = 1;
398 	object->memattr = VM_MEMATTR_DEFAULT;
399 	object->hold_count = 0;
400 	object->flags = 0;
401 	if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
402 		vm_object_set_flag(object, OBJ_ONEMAPPING);
403 	object->paging_in_progress = 0;
404 	object->resident_page_count = 0;
405 	/* cpu localization twist */
406 	object->pg_color = vm_quickcolor();
407 	object->handle = NULL;
408 
409 	atomic_add_int(&object->generation, 1);
410 	object->swblock_count = 0;
411 	RB_INIT(&object->swblock_root);
412 	vm_object_lock_init(object);
413 	pmap_object_init(object);
414 
415 	vm_object_hold(object);
416 
417 	hash = vmobj_hash(object);
418 	lwkt_gettoken(&hash->token);
419 	TAILQ_INSERT_TAIL(&hash->list, object, object_entry);
420 	lwkt_reltoken(&hash->token);
421 }
422 
423 /*
424  * Initialize a VM object.
425  */
426 void
427 vm_object_init(vm_object_t object, vm_pindex_t size)
428 {
429 	_vm_object_allocate(OBJT_DEFAULT, size, object, "vmobj");
430 	vm_object_drop(object);
431 }
432 
433 /*
434  * Initialize the VM objects module.
435  *
436  * Called from the low level boot code only.  Note that this occurs before
437  * kmalloc is initialized so we cannot allocate any VM objects.
438  */
439 void
440 vm_object_init1(void)
441 {
442 	int i;
443 
444 	for (i = 0; i < VMOBJ_HSIZE; ++i) {
445 		TAILQ_INIT(&vm_object_hash[i].list);
446 		lwkt_token_init(&vm_object_hash[i].token, "vmobjlst");
447 	}
448 
449 	_vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd),
450 			    kernel_object, "kobj");
451 	vm_object_drop(kernel_object);
452 }
453 
454 void
455 vm_object_init2(void)
456 {
457 	kmalloc_obj_set_unlimited(M_VM_OBJECT);
458 }
459 
460 /*
461  * Allocate and return a new object of the specified type and size.
462  *
463  * No requirements.
464  */
465 vm_object_t
466 vm_object_allocate(objtype_t type, vm_pindex_t size)
467 {
468 	vm_object_t obj;
469 
470 	obj = kmalloc_obj(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
471 	_vm_object_allocate(type, size, obj, "vmobj");
472 	vm_object_drop(obj);
473 
474 	return (obj);
475 }
476 
477 /*
478  * This version returns a held object, allowing further atomic initialization
479  * of the object.
480  */
481 vm_object_t
482 vm_object_allocate_hold(objtype_t type, vm_pindex_t size)
483 {
484 	vm_object_t obj;
485 
486 	obj = kmalloc_obj(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
487 	_vm_object_allocate(type, size, obj, "vmobj");
488 
489 	return (obj);
490 }
491 
492 /*
493  * Add an additional reference to a vm_object.  The object must already be
494  * held.  The original non-lock version is no longer supported.  The object
495  * must NOT be chain locked by anyone at the time the reference is added.
496  *
497  * The object must be held, but may be held shared if desired (hence why
498  * we use an atomic op).
499  */
500 void
501 VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS)
502 {
503 	KKASSERT(object != NULL);
504 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
505 	atomic_add_int(&object->ref_count, 1);
506 	if (object->type == OBJT_VNODE) {
507 		vref(object->handle);
508 		/* XXX what if the vnode is being destroyed? */
509 	}
510 #if defined(DEBUG_LOCKS)
511 	debugvm_object_add(object, file, line, 1);
512 #endif
513 }
514 
515 /*
516  * This version is only allowed in situations where the caller
517  * already knows that the object is deterministically referenced
518  * (usually because its taken from a ref'd vnode, or during a map_entry
519  * replication).
520  */
521 void
522 VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS)
523 {
524 	KKASSERT(object->type == OBJT_VNODE || object->ref_count > 0);
525 	atomic_add_int(&object->ref_count, 1);
526 	if (object->type == OBJT_VNODE)
527 		vref(object->handle);
528 #if defined(DEBUG_LOCKS)
529 	debugvm_object_add(object, file, line, 1);
530 #endif
531 }
532 
533 /*
534  * Dereference an object and its underlying vnode.  The object may be
535  * held shared.  On return the object will remain held.
536  *
537  * This function may return a vnode in *vpp which the caller must release
538  * after the caller drops its own lock.  If vpp is NULL, we assume that
539  * the caller was holding an exclusive lock on the object and we vrele()
540  * the vp ourselves.
541  */
542 static void
543 VMOBJDEBUG(vm_object_vndeallocate)(vm_object_t object, struct vnode **vpp
544 				   VMOBJDBARGS)
545 {
546 	struct vnode *vp = (struct vnode *) object->handle;
547 	int count;
548 
549 	KASSERT(object->type == OBJT_VNODE,
550 	    ("vm_object_vndeallocate: not a vnode object"));
551 	KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
552 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
553 #ifdef INVARIANTS
554 	if (object->ref_count == 0) {
555 		vprint("vm_object_vndeallocate", vp);
556 		panic("vm_object_vndeallocate: bad object reference count");
557 	}
558 #endif
559 	count = object->ref_count;
560 	cpu_ccfence();
561 	for (;;) {
562 		if (count == 1) {
563 			vm_object_upgrade(object);
564 			if (atomic_fcmpset_int(&object->ref_count, &count, 0)) {
565 				vclrflags(vp, VTEXT);
566 				break;
567 			}
568 		} else {
569 			if (atomic_fcmpset_int(&object->ref_count,
570 					       &count, count - 1)) {
571 				break;
572 			}
573 		}
574 		cpu_pause();
575 		/* retry */
576 	}
577 #if defined(DEBUG_LOCKS)
578 	debugvm_object_add(object, file, line, -1);
579 #endif
580 
581 	/*
582 	 * vrele or return the vp to vrele.  We can only safely vrele(vp)
583 	 * if the object was locked exclusively.  But there are two races
584 	 * here.
585 	 *
586 	 * We had to upgrade the object above to safely clear VTEXT
587 	 * but the alternative path where the shared lock is retained
588 	 * can STILL race to 0 in other paths and cause our own vrele()
589 	 * to terminate the vnode.  We can't allow that if the VM object
590 	 * is still locked shared.
591 	 */
592 	if (vpp)
593 		*vpp = vp;
594 	else
595 		vrele(vp);
596 }
597 
598 /*
599  * Release a reference to the specified object, gained either through a
600  * vm_object_allocate or a vm_object_reference call.  When all references
601  * are gone, storage associated with this object may be relinquished.
602  *
603  * The caller does not have to hold the object locked but must have control
604  * over the reference in question in order to guarantee that the object
605  * does not get ripped out from under us.
606  *
607  * XXX Currently all deallocations require an exclusive lock.
608  */
609 void
610 VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS)
611 {
612 	struct vnode *vp;
613 	int count;
614 
615 	if (object == NULL)
616 		return;
617 
618 	count = object->ref_count;
619 	cpu_ccfence();
620 	for (;;) {
621 		/*
622 		 * If decrementing the count enters into special handling
623 		 * territory (0, 1, or 2) we have to do it the hard way.
624 		 * Fortunate though, objects with only a few refs like this
625 		 * are not likely to be heavily contended anyway.
626 		 *
627 		 * For vnode objects we only care about 1->0 transitions.
628 		 */
629 		if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) {
630 #if defined(DEBUG_LOCKS)
631 			debugvm_object_add(object, file, line, 0);
632 #endif
633 			vm_object_hold(object);
634 			vm_object_deallocate_locked(object);
635 			vm_object_drop(object);
636 			break;
637 		}
638 
639 		/*
640 		 * Try to decrement ref_count without acquiring a hold on
641 		 * the object.  This is particularly important for the exec*()
642 		 * and exit*() code paths because the program binary may
643 		 * have a great deal of sharing and an exclusive lock will
644 		 * crowbar performance in those circumstances.
645 		 */
646 		if (object->type == OBJT_VNODE) {
647 			vp = (struct vnode *)object->handle;
648 			if (atomic_fcmpset_int(&object->ref_count,
649 					       &count, count - 1)) {
650 #if defined(DEBUG_LOCKS)
651 				debugvm_object_add(object, file, line, -1);
652 #endif
653 
654 				vrele(vp);
655 				break;
656 			}
657 			/* retry */
658 		} else {
659 			if (atomic_fcmpset_int(&object->ref_count,
660 					       &count, count - 1)) {
661 #if defined(DEBUG_LOCKS)
662 				debugvm_object_add(object, file, line, -1);
663 #endif
664 				break;
665 			}
666 			/* retry */
667 		}
668 		cpu_pause();
669 		/* retry */
670 	}
671 }
672 
673 void
674 VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS)
675 {
676 	/*
677 	 * Degenerate case
678 	 */
679 	if (object == NULL)
680 		return;
681 
682 	/*
683 	 * vnode case, caller either locked the object exclusively
684 	 * or this is a recursion with must_drop != 0 and the vnode
685 	 * object will be locked shared.
686 	 *
687 	 * If locked shared we have to drop the object before we can
688 	 * call vrele() or risk a shared/exclusive livelock.
689 	 */
690 	if (object->type == OBJT_VNODE) {
691 		ASSERT_LWKT_TOKEN_HELD(&object->token);
692 		vm_object_vndeallocate(object, NULL);
693 		return;
694 	}
695 	ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token);
696 
697 	/*
698 	 * Normal case (object is locked exclusively)
699 	 */
700 	if (object->ref_count == 0) {
701 		panic("vm_object_deallocate: object deallocated "
702 		      "too many times: %d", object->type);
703 	}
704 	if (object->ref_count > 2) {
705 		atomic_add_int(&object->ref_count, -1);
706 #if defined(DEBUG_LOCKS)
707 		debugvm_object_add(object, file, line, -1);
708 #endif
709 		return;
710 	}
711 
712 	/*
713 	 * Drop the ref and handle termination on the 1->0 transition.
714 	 * We may have blocked above so we have to recheck.
715 	 */
716 	KKASSERT(object->ref_count != 0);
717 	if (object->ref_count >= 2) {
718 		atomic_add_int(&object->ref_count, -1);
719 #if defined(DEBUG_LOCKS)
720 		debugvm_object_add(object, file, line, -1);
721 #endif
722 		return;
723 	}
724 
725 	atomic_add_int(&object->ref_count, -1);
726 	if ((object->flags & OBJ_DEAD) == 0)
727 		vm_object_terminate(object);
728 }
729 
730 /*
731  * Destroy the specified object, freeing up related resources.
732  *
733  * The object must have zero references.
734  *
735  * The object must held.  The caller is responsible for dropping the object
736  * after terminate returns.  Terminate does NOT drop the object.
737  */
738 static int vm_object_terminate_callback(vm_page_t p, void *data);
739 
740 void
741 vm_object_terminate(vm_object_t object)
742 {
743 	struct rb_vm_page_scan_info info;
744 	struct vm_object_hash *hash;
745 
746 	/*
747 	 * Make sure no one uses us.  Once we set OBJ_DEAD we should be
748 	 * able to safely block.
749 	 */
750 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
751 	KKASSERT((object->flags & OBJ_DEAD) == 0);
752 	vm_object_set_flag(object, OBJ_DEAD);
753 
754 	/*
755 	 * Wait for the pageout daemon to be done with the object
756 	 */
757 	vm_object_pip_wait(object, "objtrm1");
758 
759 	KASSERT(!object->paging_in_progress,
760 		("vm_object_terminate: pageout in progress"));
761 
762 	/*
763 	 * Clean and free the pages, as appropriate. All references to the
764 	 * object are gone, so we don't need to lock it.
765 	 */
766 	if (object->type == OBJT_VNODE) {
767 		struct vnode *vp;
768 
769 		/*
770 		 * Clean pages and flush buffers.
771 		 *
772 		 * NOTE!  TMPFS buffer flushes do not typically flush the
773 		 *	  actual page to swap as this would be highly
774 		 *	  inefficient, and normal filesystems usually wrap
775 		 *	  page flushes with buffer cache buffers.
776 		 *
777 		 *	  To deal with this we have to call vinvalbuf() both
778 		 *	  before and after the vm_object_page_clean().
779 		 */
780 		vp = (struct vnode *) object->handle;
781 		vinvalbuf(vp, V_SAVE, 0, 0);
782 		vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
783 		vinvalbuf(vp, V_SAVE, 0, 0);
784 	}
785 
786 	/*
787 	 * Wait for any I/O to complete, after which there had better not
788 	 * be any references left on the object.
789 	 */
790 	vm_object_pip_wait(object, "objtrm2");
791 
792 	if (object->ref_count != 0) {
793 		panic("vm_object_terminate: object with references, "
794 		      "ref_count=%d", object->ref_count);
795 	}
796 
797 	/*
798 	 * Cleanup any shared pmaps associated with this object.
799 	 */
800 	pmap_object_free(object);
801 
802 	/*
803 	 * Now free any remaining pages. For internal objects, this also
804 	 * removes them from paging queues. Don't free wired pages, just
805 	 * remove them from the object.
806 	 */
807 	info.count = 0;
808 	info.object = object;
809 	do {
810 		info.error = 0;
811 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
812 					vm_object_terminate_callback, &info);
813 	} while (info.error);
814 
815 	/*
816 	 * Let the pager know object is dead.
817 	 */
818 	vm_pager_deallocate(object);
819 
820 	/*
821 	 * Wait for the object hold count to hit 1, clean out pages as
822 	 * we go.  vmobj_token interlocks any race conditions that might
823 	 * pick the object up from the vm_object_list after we have cleared
824 	 * rb_memq.
825 	 */
826 	for (;;) {
827 		if (RB_ROOT(&object->rb_memq) == NULL)
828 			break;
829 		kprintf("vm_object_terminate: Warning, object %p "
830 			"still has %ld pages\n",
831 			object, object->resident_page_count);
832 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
833 					vm_object_terminate_callback, &info);
834 	}
835 
836 	/*
837 	 * There had better not be any pages left
838 	 */
839 	KKASSERT(object->resident_page_count == 0);
840 
841 	/*
842 	 * Remove the object from the global object list.
843 	 */
844 	hash = vmobj_hash(object);
845 	lwkt_gettoken(&hash->token);
846 	TAILQ_REMOVE(&hash->list, object, object_entry);
847 	lwkt_reltoken(&hash->token);
848 
849 	if (object->ref_count != 0) {
850 		panic("vm_object_terminate2: object with references, "
851 		      "ref_count=%d", object->ref_count);
852 	}
853 
854 	/*
855 	 * NOTE: The object hold_count is at least 1, so we cannot kfree()
856 	 *	 the object here.  See vm_object_drop().
857 	 */
858 }
859 
860 /*
861  * The caller must hold the object.
862  *
863  * NOTE: It is possible for vm_page's to remain flagged PG_MAPPED
864  *	 or PG_MAPPED|PG_WRITEABLE, even after pmap_mapped_sync()
865  *	 is called, due to normal pmap operations.  This is because only
866  *	 global pmap operations on the vm_page can clear the bits and not
867  *	 just local operations on individual pmaps.
868  *
869  *	 Most interactions that necessitate the clearing of these bits
870  *	 proactively call vm_page_protect(), and we must do so here as well.
871  */
872 static int
873 vm_object_terminate_callback(vm_page_t p, void *data)
874 {
875 	struct rb_vm_page_scan_info *info = data;
876 	vm_object_t object;
877 
878 	object = p->object;
879 	KKASSERT(object == info->object);
880 	if (vm_page_busy_try(p, TRUE)) {
881 		vm_page_sleep_busy(p, TRUE, "vmotrm");
882 		info->error = 1;
883 		return 0;
884 	}
885 	if (object != p->object) {
886 		/* XXX remove once we determine it can't happen */
887 		kprintf("vm_object_terminate: Warning: Encountered "
888 			"busied page %p on queue %d\n", p, p->queue);
889 		vm_page_wakeup(p);
890 		info->error = 1;
891 	} else if (p->wire_count == 0) {
892 		/*
893 		 * NOTE: p->dirty and PG_NEED_COMMIT are ignored.
894 		 */
895 		if (pmap_mapped_sync(p) & (PG_MAPPED | PG_WRITEABLE))
896 			vm_page_protect(p, VM_PROT_NONE);
897 		vm_page_free(p);
898 		mycpu->gd_cnt.v_pfree++;
899 	} else {
900 		if (p->queue != PQ_NONE) {
901 			kprintf("vm_object_terminate: Warning: Encountered "
902 				"wired page %p on queue %d\n", p, p->queue);
903 			if (vm_object_debug > 0) {
904 				--vm_object_debug;
905 				print_backtrace(10);
906 			}
907 		}
908 		if (pmap_mapped_sync(p) & (PG_MAPPED | PG_WRITEABLE))
909 			vm_page_protect(p, VM_PROT_NONE);
910 		vm_page_remove(p);
911 		vm_page_wakeup(p);
912 	}
913 
914 	/*
915 	 * Must be at end to avoid SMP races, caller holds object token
916 	 */
917 	if ((++info->count & 63) == 0)
918 		lwkt_user_yield();
919 	return(0);
920 }
921 
922 /*
923  * Clean all dirty pages in the specified range of object.  Leaves page
924  * on whatever queue it is currently on.   If NOSYNC is set then do not
925  * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
926  * leaving the object dirty.
927  *
928  * When stuffing pages asynchronously, allow clustering.  XXX we need a
929  * synchronous clustering mode implementation.
930  *
931  * Odd semantics: if start == end, we clean everything.
932  *
933  * The object must be locked? XXX
934  */
935 static int vm_object_page_clean_pass1(struct vm_page *p, void *data);
936 static int vm_object_page_clean_pass2(struct vm_page *p, void *data);
937 
938 void
939 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
940 		     int flags)
941 {
942 	struct rb_vm_page_scan_info info;
943 	struct vnode *vp;
944 	int wholescan;
945 	int pagerflags;
946 	int generation;
947 
948 	vm_object_hold(object);
949 	if (object->type != OBJT_VNODE ||
950 	    (object->flags & OBJ_MIGHTBEDIRTY) == 0) {
951 		vm_object_drop(object);
952 		return;
953 	}
954 
955 	pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ?
956 			OBJPC_SYNC : OBJPC_CLUSTER_OK;
957 	pagerflags |= (flags & OBJPC_INVAL) ? OBJPC_INVAL : 0;
958 
959 	vp = object->handle;
960 
961 	/*
962 	 * Interlock other major object operations.  This allows us to
963 	 * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
964 	 */
965 	vm_object_set_flag(object, OBJ_CLEANING);
966 
967 	/*
968 	 * Handle 'entire object' case
969 	 */
970 	info.start_pindex = start;
971 	if (end == 0) {
972 		info.end_pindex = object->size - 1;
973 	} else {
974 		info.end_pindex = end - 1;
975 	}
976 	wholescan = (start == 0 && info.end_pindex == object->size - 1);
977 	info.limit = flags;
978 	info.pagerflags = pagerflags;
979 	info.object = object;
980 
981 	/*
982 	 * If cleaning the entire object do a pass to mark the pages read-only.
983 	 * If everything worked out ok, clear OBJ_WRITEABLE and
984 	 * OBJ_MIGHTBEDIRTY.
985 	 */
986 	if (wholescan) {
987 		info.error = 0;
988 		info.count = 0;
989 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
990 					vm_object_page_clean_pass1, &info);
991 		if (info.error == 0) {
992 			vm_object_clear_flag(object,
993 					     OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
994 			if (object->type == OBJT_VNODE &&
995 			    (vp = (struct vnode *)object->handle) != NULL) {
996 				/*
997 				 * Use new-style interface to clear VISDIRTY
998 				 * because the vnode is not necessarily removed
999 				 * from the syncer list(s) as often as it was
1000 				 * under the old interface, which can leave
1001 				 * the vnode on the syncer list after reclaim.
1002 				 */
1003 				vclrobjdirty(vp);
1004 			}
1005 		}
1006 	}
1007 
1008 	/*
1009 	 * Do a pass to clean all the dirty pages we find.
1010 	 */
1011 	do {
1012 		info.error = 0;
1013 		info.count = 0;
1014 		generation = object->generation;
1015 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1016 					vm_object_page_clean_pass2, &info);
1017 	} while (info.error || generation != object->generation);
1018 
1019 	vm_object_clear_flag(object, OBJ_CLEANING);
1020 	vm_object_drop(object);
1021 }
1022 
1023 /*
1024  * The caller must hold the object.
1025  */
1026 static
1027 int
1028 vm_object_page_clean_pass1(struct vm_page *p, void *data)
1029 {
1030 	struct rb_vm_page_scan_info *info = data;
1031 
1032 	KKASSERT(p->object == info->object);
1033 
1034 	vm_page_flag_set(p, PG_CLEANCHK);
1035 	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1036 		info->error = 1;
1037 	} else if (vm_page_busy_try(p, FALSE)) {
1038 		info->error = 1;
1039 	} else {
1040 		KKASSERT(p->object == info->object);
1041 		vm_page_protect(p, VM_PROT_READ);
1042 		vm_page_wakeup(p);
1043 	}
1044 
1045 	/*
1046 	 * Must be at end to avoid SMP races, caller holds object token
1047 	 */
1048 	if ((++info->count & 63) == 0)
1049 		lwkt_user_yield();
1050 	return(0);
1051 }
1052 
1053 /*
1054  * The caller must hold the object
1055  */
1056 static
1057 int
1058 vm_object_page_clean_pass2(struct vm_page *p, void *data)
1059 {
1060 	struct rb_vm_page_scan_info *info = data;
1061 	int generation;
1062 
1063 	KKASSERT(p->object == info->object);
1064 
1065 	/*
1066 	 * Do not mess with pages that were inserted after we started
1067 	 * the cleaning pass.
1068 	 */
1069 	if ((p->flags & PG_CLEANCHK) == 0)
1070 		goto done;
1071 
1072 	generation = info->object->generation;
1073 
1074 	if (vm_page_busy_try(p, TRUE)) {
1075 		vm_page_sleep_busy(p, TRUE, "vpcwai");
1076 		info->error = 1;
1077 		goto done;
1078 	}
1079 
1080 	KKASSERT(p->object == info->object &&
1081 		 info->object->generation == generation);
1082 
1083 	/*
1084 	 * Before wasting time traversing the pmaps, check for trivial
1085 	 * cases where the page cannot be dirty.
1086 	 */
1087 	if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) {
1088 		KKASSERT((p->dirty & p->valid) == 0 &&
1089 			 (p->flags & PG_NEED_COMMIT) == 0);
1090 		vm_page_wakeup(p);
1091 		goto done;
1092 	}
1093 
1094 	/*
1095 	 * Check whether the page is dirty or not.  The page has been set
1096 	 * to be read-only so the check will not race a user dirtying the
1097 	 * page.
1098 	 */
1099 	vm_page_test_dirty(p);
1100 	if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) {
1101 		vm_page_flag_clear(p, PG_CLEANCHK);
1102 		vm_page_wakeup(p);
1103 		goto done;
1104 	}
1105 
1106 	/*
1107 	 * If we have been asked to skip nosync pages and this is a
1108 	 * nosync page, skip it.  Note that the object flags were
1109 	 * not cleared in this case (because pass1 will have returned an
1110 	 * error), so we do not have to set them.
1111 	 */
1112 	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1113 		vm_page_flag_clear(p, PG_CLEANCHK);
1114 		vm_page_wakeup(p);
1115 		goto done;
1116 	}
1117 
1118 	/*
1119 	 * Flush as many pages as we can.  PG_CLEANCHK will be cleared on
1120 	 * the pages that get successfully flushed.  Set info->error if
1121 	 * we raced an object modification.
1122 	 */
1123 	vm_object_page_collect_flush(info->object, p, info->pagerflags);
1124 	/* vm_wait_nominal(); this can deadlock the system in syncer/pageout */
1125 
1126 	/*
1127 	 * Must be at end to avoid SMP races, caller holds object token
1128 	 */
1129 done:
1130 	if ((++info->count & 63) == 0)
1131 		lwkt_user_yield();
1132 	return(0);
1133 }
1134 
1135 /*
1136  * Collect the specified page and nearby pages and flush them out.
1137  * The number of pages flushed is returned.  The passed page is busied
1138  * by the caller and we are responsible for its disposition.
1139  *
1140  * The caller must hold the object.
1141  */
1142 static void
1143 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
1144 {
1145 	int error;
1146 	int is;
1147 	int ib;
1148 	int i;
1149 	int page_base;
1150 	vm_pindex_t pi;
1151 	vm_page_t ma[BLIST_MAX_ALLOC];
1152 
1153 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1154 
1155 	pi = p->pindex;
1156 	page_base = pi % BLIST_MAX_ALLOC;
1157 	ma[page_base] = p;
1158 	ib = page_base - 1;
1159 	is = page_base + 1;
1160 
1161 	while (ib >= 0) {
1162 		vm_page_t tp;
1163 
1164 		tp = vm_page_lookup_busy_try(object, pi - page_base + ib,
1165 					     TRUE, &error);
1166 		if (error)
1167 			break;
1168 		if (tp == NULL)
1169 			break;
1170 		if ((pagerflags & OBJPC_IGNORE_CLEANCHK) == 0 &&
1171 		    (tp->flags & PG_CLEANCHK) == 0) {
1172 			vm_page_wakeup(tp);
1173 			break;
1174 		}
1175 		if ((tp->queue - tp->pc) == PQ_CACHE) {
1176 			vm_page_flag_clear(tp, PG_CLEANCHK);
1177 			vm_page_wakeup(tp);
1178 			break;
1179 		}
1180 		vm_page_test_dirty(tp);
1181 		if ((tp->dirty & tp->valid) == 0 &&
1182 		    (tp->flags & PG_NEED_COMMIT) == 0) {
1183 			vm_page_flag_clear(tp, PG_CLEANCHK);
1184 			vm_page_wakeup(tp);
1185 			break;
1186 		}
1187 		ma[ib] = tp;
1188 		--ib;
1189 	}
1190 	++ib;	/* fixup */
1191 
1192 	while (is < BLIST_MAX_ALLOC &&
1193 	       pi - page_base + is < object->size) {
1194 		vm_page_t tp;
1195 
1196 		tp = vm_page_lookup_busy_try(object, pi - page_base + is,
1197 					     TRUE, &error);
1198 		if (error)
1199 			break;
1200 		if (tp == NULL)
1201 			break;
1202 		if ((pagerflags & OBJPC_IGNORE_CLEANCHK) == 0 &&
1203 		    (tp->flags & PG_CLEANCHK) == 0) {
1204 			vm_page_wakeup(tp);
1205 			break;
1206 		}
1207 		if ((tp->queue - tp->pc) == PQ_CACHE) {
1208 			vm_page_flag_clear(tp, PG_CLEANCHK);
1209 			vm_page_wakeup(tp);
1210 			break;
1211 		}
1212 		vm_page_test_dirty(tp);
1213 		if ((tp->dirty & tp->valid) == 0 &&
1214 		    (tp->flags & PG_NEED_COMMIT) == 0) {
1215 			vm_page_flag_clear(tp, PG_CLEANCHK);
1216 			vm_page_wakeup(tp);
1217 			break;
1218 		}
1219 		ma[is] = tp;
1220 		++is;
1221 	}
1222 
1223 	/*
1224 	 * All pages in the ma[] array are busied now
1225 	 */
1226 	for (i = ib; i < is; ++i) {
1227 		vm_page_flag_clear(ma[i], PG_CLEANCHK);
1228 		vm_page_hold(ma[i]);	/* XXX need this any more? */
1229 	}
1230 	vm_pageout_flush(&ma[ib], is - ib, pagerflags);
1231 	for (i = ib; i < is; ++i)	/* XXX need this any more? */
1232 		vm_page_unhold(ma[i]);
1233 }
1234 
1235 /*
1236  * Implements the madvise function at the object/page level.
1237  *
1238  * MADV_WILLNEED	(any object)
1239  *
1240  *	Activate the specified pages if they are resident.
1241  *
1242  * MADV_DONTNEED	(any object)
1243  *
1244  *	Deactivate the specified pages if they are resident.
1245  *
1246  * MADV_FREE	(OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only)
1247  *
1248  *	Deactivate and clean the specified pages if they are
1249  *	resident.  This permits the process to reuse the pages
1250  *	without faulting or the kernel to reclaim the pages
1251  *	without I/O.
1252  *
1253  * No requirements.
1254  */
1255 void
1256 vm_object_madvise(vm_object_t object, vm_pindex_t pindex,
1257 		  vm_pindex_t count, int advise)
1258 {
1259 	vm_pindex_t end;
1260 	vm_page_t m;
1261 	int error;
1262 
1263 	if (object == NULL)
1264 		return;
1265 
1266 	end = pindex + count;
1267 
1268 	vm_object_hold(object);
1269 
1270 	/*
1271 	 * Locate and adjust resident pages.  This only applies to the
1272 	 * primary object in the mapping.
1273 	 */
1274 	for (; pindex < end; pindex += 1) {
1275 relookup:
1276 		/*
1277 		 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
1278 		 * and those pages must be OBJ_ONEMAPPING.
1279 		 */
1280 		if (advise == MADV_FREE) {
1281 			if ((object->type != OBJT_DEFAULT &&
1282 			     object->type != OBJT_SWAP) ||
1283 			    (object->flags & OBJ_ONEMAPPING) == 0) {
1284 				continue;
1285 			}
1286 		}
1287 
1288 		m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
1289 
1290 		if (error) {
1291 			vm_page_sleep_busy(m, TRUE, "madvpo");
1292 			goto relookup;
1293 		}
1294 		if (m == NULL) {
1295 			/*
1296 			 * There may be swap even if there is no backing page
1297 			 */
1298 			if (advise == MADV_FREE && object->type == OBJT_SWAP)
1299 				swap_pager_freespace(object, pindex, 1);
1300 			continue;
1301 		}
1302 
1303 		/*
1304 		 * If the page is not in a normal active state, we skip it.
1305 		 * If the page is not managed there are no page queues to
1306 		 * mess with.  Things can break if we mess with pages in
1307 		 * any of the below states.
1308 		 */
1309 		if (m->wire_count ||
1310 		    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED |
1311 				 PG_NEED_COMMIT)) ||
1312 		    m->valid != VM_PAGE_BITS_ALL
1313 		) {
1314 			vm_page_wakeup(m);
1315 			continue;
1316 		}
1317 
1318 		/*
1319 		 * Theoretically once a page is known not to be busy, an
1320 		 * interrupt cannot come along and rip it out from under us.
1321 		 */
1322 		if (advise == MADV_WILLNEED) {
1323 			vm_page_activate(m);
1324 		} else if (advise == MADV_DONTNEED) {
1325 			vm_page_dontneed(m);
1326 		} else if (advise == MADV_FREE) {
1327 			/*
1328 			 * Mark the page clean.  This will allow the page
1329 			 * to be freed up by the system.  However, such pages
1330 			 * are often reused quickly by malloc()/free()
1331 			 * so we do not do anything that would cause
1332 			 * a page fault if we can help it.
1333 			 *
1334 			 * Specifically, we do not try to actually free
1335 			 * the page now nor do we try to put it in the
1336 			 * cache (which would cause a page fault on reuse).
1337 			 *
1338 			 * But we do make the page is freeable as we
1339 			 * can without actually taking the step of unmapping
1340 			 * it.
1341 			 */
1342 			pmap_clear_modify(m);
1343 			m->dirty = 0;
1344 			m->act_count = 0;
1345 			vm_page_dontneed(m);
1346 			if (object->type == OBJT_SWAP)
1347 				swap_pager_freespace(object, pindex, 1);
1348 		}
1349 		vm_page_wakeup(m);
1350 	}
1351 	vm_object_drop(object);
1352 }
1353 
1354 /*
1355  * Removes all physical pages in the specified object range from the
1356  * object's list of pages.
1357  *
1358  * No requirements.
1359  */
1360 static int vm_object_page_remove_callback(vm_page_t p, void *data);
1361 
1362 void
1363 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
1364 		      boolean_t clean_only)
1365 {
1366 	struct rb_vm_page_scan_info info;
1367 	int all;
1368 
1369 	/*
1370 	 * Degenerate cases and assertions.
1371 	 *
1372 	 * NOTE: Don't shortcut on resident_page_count for MGTDEVICE objects.
1373 	 *	 These objects do not have to have their pages entered into
1374 	 *	 them and are handled via their vm_map_backing lists.
1375 	 */
1376 	vm_object_hold(object);
1377 	if (object == NULL ||
1378 	    (object->type != OBJT_MGTDEVICE &&
1379 	     object->resident_page_count == 0 && object->swblock_count == 0)) {
1380 		vm_object_drop(object);
1381 		return;
1382 	}
1383 	KASSERT(object->type != OBJT_PHYS,
1384 		("attempt to remove pages from a physical object"));
1385 
1386 	/*
1387 	 * Indicate that paging is occuring on the object
1388 	 */
1389 	vm_object_pip_add(object, 1);
1390 
1391 	/*
1392 	 * Figure out the actual removal range and whether we are removing
1393 	 * the entire contents of the object or not.  If removing the entire
1394 	 * contents, be sure to get all pages, even those that might be
1395 	 * beyond the end of the object.
1396 	 *
1397 	 * NOTE: end is non-inclusive, but info.end_pindex is inclusive.
1398 	 */
1399 	info.object = object;
1400 	info.start_pindex = start;
1401 	if (end == 0 || end == (vm_pindex_t)-1) {
1402 		info.end_pindex = (vm_pindex_t)-1;
1403 		end = object->size;
1404 	} else {
1405 		info.end_pindex = end - 1;
1406 	}
1407 	info.limit = clean_only;
1408 	info.count = 0;
1409 	all = (start == 0 && info.end_pindex >= object->size - 1);
1410 
1411 	/*
1412 	 * Efficiently remove pages from the pmap via a backing scan.
1413 	 *
1414 	 * NOTE: This is the only way pages can be removed and unwired
1415 	 *	 from OBJT_MGTDEVICE devices which typically do not enter
1416 	 *	 their pages into the vm_object's RB tree.  And possibly
1417 	 *	 other OBJT_* types in the future.
1418 	 */
1419 	{
1420 		vm_map_backing_t ba;
1421 		vm_pindex_t sba, eba;
1422 		vm_offset_t sva, eva;
1423 
1424 		lockmgr(&object->backing_lk, LK_EXCLUSIVE);
1425 		TAILQ_FOREACH(ba, &object->backing_list, entry) {
1426 			/*
1427 			 * object offset range within the ba, intersectioned
1428 			 * with the page range specified for the object
1429 			 */
1430 			sba = OFF_TO_IDX(ba->offset);
1431 			eba = sba + OFF_TO_IDX(ba->end - ba->start);
1432 			if (sba < start)
1433 				sba = start;
1434 			if (eba > end)
1435 				eba = end;
1436 
1437 			/*
1438 			 * If the intersection is valid, remove the related
1439 			 * pages.
1440 			 *
1441 			 * NOTE! This may also remove other incidental pages
1442 			 *	 in the pmap, as the backing area may be
1443 			 *	 overloaded.
1444 			 *
1445 			 * NOTE! pages for MGTDEVICE objects are only removed
1446 			 *	 here, they aren't entered into rb_memq, so
1447 			 *	 we must use pmap_remove() instead of
1448 			 *	 the non-TLB-invalidating pmap_remove_pages().
1449 			 */
1450 			if (sba < eba) {
1451 				sva = ba->start + IDX_TO_OFF(sba) - ba->offset;
1452 				eva = sva + IDX_TO_OFF(eba - sba);
1453 #if 0
1454 				kprintf("VM_OBJECT_PAGE_REMOVE "
1455 					"%p[%016jx] %016jx-%016jx\n",
1456 					ba->pmap, ba->start, sva, eva);
1457 #endif
1458 				pmap_remove(ba->pmap, sva, eva);
1459 			}
1460 		}
1461 		lockmgr(&object->backing_lk, LK_RELEASE);
1462 	}
1463 
1464 	/*
1465 	 * Remove and free pages entered onto the object list.  Note that
1466 	 * for OBJT_MGTDEVICE objects, there are typically no pages entered.
1467 	 *
1468 	 * Loop until we are sure we have gotten them all.
1469 	 */
1470 	do {
1471 		info.error = 0;
1472 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1473 					vm_object_page_remove_callback, &info);
1474 	} while (info.error);
1475 
1476 	/*
1477 	 * Remove any related swap if throwing away pages, or for
1478 	 * non-swap objects (the swap is a clean copy in that case).
1479 	 */
1480 	if (object->type != OBJT_SWAP || clean_only == FALSE) {
1481 		if (all)
1482 			swap_pager_freespace_all(object);
1483 		else
1484 			swap_pager_freespace(object, info.start_pindex,
1485 			     info.end_pindex - info.start_pindex + 1);
1486 	}
1487 
1488 	/*
1489 	 * Cleanup
1490 	 */
1491 	vm_object_pip_wakeup(object);
1492 	vm_object_drop(object);
1493 }
1494 
1495 /*
1496  * The caller must hold the object.
1497  *
1498  * NOTE: User yields are allowed when removing more than one page, but not
1499  *	 allowed if only removing one page (the path for single page removals
1500  *	 might hold a spinlock).
1501  */
1502 static int
1503 vm_object_page_remove_callback(vm_page_t p, void *data)
1504 {
1505 	struct rb_vm_page_scan_info *info = data;
1506 
1507 	if (info->object != p->object ||
1508 	    p->pindex < info->start_pindex ||
1509 	    p->pindex > info->end_pindex) {
1510 		kprintf("vm_object_page_remove_callbackA: obj/pg race %p/%p\n",
1511 			info->object, p);
1512 		return(0);
1513 	}
1514 	if (vm_page_busy_try(p, TRUE)) {
1515 		vm_page_sleep_busy(p, TRUE, "vmopar");
1516 		info->error = 1;
1517 		return(0);
1518 	}
1519 	if (info->object != p->object) {
1520 		/* this should never happen */
1521 		kprintf("vm_object_page_remove_callbackB: obj/pg race %p/%p\n",
1522 			info->object, p);
1523 		vm_page_wakeup(p);
1524 		return(0);
1525 	}
1526 
1527 	/*
1528 	 * Wired pages cannot be destroyed, but they can be invalidated
1529 	 * and we do so if clean_only (limit) is not set.
1530 	 *
1531 	 * WARNING!  The page may be wired due to being part of a buffer
1532 	 *	     cache buffer, and the buffer might be marked B_CACHE.
1533 	 *	     This is fine as part of a truncation but VFSs must be
1534 	 *	     sure to fix the buffer up when re-extending the file.
1535 	 *
1536 	 * NOTE!     PG_NEED_COMMIT is ignored.
1537 	 */
1538 	if (p->wire_count != 0) {
1539 		vm_page_protect(p, VM_PROT_NONE);
1540 		if (info->limit == 0)
1541 			p->valid = 0;
1542 		vm_page_wakeup(p);
1543 		goto done;
1544 	}
1545 
1546 	/*
1547 	 * limit is our clean_only flag.  If set and the page is dirty or
1548 	 * requires a commit, do not free it.  If set and the page is being
1549 	 * held by someone, do not free it.
1550 	 */
1551 	if (info->limit && p->valid) {
1552 		vm_page_test_dirty(p);
1553 		if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
1554 			vm_page_wakeup(p);
1555 			goto done;
1556 		}
1557 	}
1558 
1559 	/*
1560 	 * Destroy the page.  But we have to re-test whether its dirty after
1561 	 * removing it from its pmaps.
1562 	 */
1563 	vm_page_protect(p, VM_PROT_NONE);
1564 	if (info->limit && p->valid) {
1565 		vm_page_test_dirty(p);
1566 		if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
1567 			vm_page_wakeup(p);
1568 			goto done;
1569 		}
1570 	}
1571 	vm_page_free(p);
1572 
1573 	/*
1574 	 * Must be at end to avoid SMP races, caller holds object token
1575 	 */
1576 done:
1577 	if ((++info->count & 63) == 0)
1578 		lwkt_user_yield();
1579 
1580 	return(0);
1581 }
1582 
1583 /*
1584  * Try to extend prev_object into an adjoining region of virtual
1585  * memory, return TRUE on success.
1586  *
1587  * The caller does not need to hold (prev_object) but must have a stable
1588  * pointer to it (typically by holding the vm_map locked).
1589  *
1590  * This function only works for anonymous memory objects which either
1591  * have (a) one reference or (b) we are extending the object's size.
1592  * Otherwise the related VM pages we want to use for the object might
1593  * be in use by another mapping.
1594  */
1595 boolean_t
1596 vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
1597 		   vm_size_t prev_size, vm_size_t next_size)
1598 {
1599 	vm_pindex_t next_pindex;
1600 
1601 	if (prev_object == NULL)
1602 		return (TRUE);
1603 
1604 	vm_object_hold(prev_object);
1605 
1606 	if (prev_object->type != OBJT_DEFAULT &&
1607 	    prev_object->type != OBJT_SWAP) {
1608 		vm_object_drop(prev_object);
1609 		return (FALSE);
1610 	}
1611 
1612 #if 0
1613 	/* caller now checks this */
1614 	/*
1615 	 * Try to collapse the object first
1616 	 */
1617 	vm_object_collapse(prev_object, NULL);
1618 #endif
1619 
1620 #if 0
1621 	/* caller now checks this */
1622 	/*
1623 	 * We can't coalesce if we shadow another object (figuring out the
1624 	 * relationships become too complex).
1625 	 */
1626 	if (prev_object->backing_object != NULL) {
1627 		vm_object_chain_release(prev_object);
1628 		vm_object_drop(prev_object);
1629 		return (FALSE);
1630 	}
1631 #endif
1632 
1633 	prev_size >>= PAGE_SHIFT;
1634 	next_size >>= PAGE_SHIFT;
1635 	next_pindex = prev_pindex + prev_size;
1636 
1637 	/*
1638 	 * We can't if the object has more than one ref count unless we
1639 	 * are extending it into newly minted space.
1640 	 */
1641 	if (prev_object->ref_count > 1 &&
1642 	    prev_object->size != next_pindex) {
1643 		vm_object_drop(prev_object);
1644 		return (FALSE);
1645 	}
1646 
1647 	/*
1648 	 * Remove any pages that may still be in the object from a previous
1649 	 * deallocation.
1650 	 */
1651 	if (next_pindex < prev_object->size) {
1652 		vm_object_page_remove(prev_object,
1653 				      next_pindex,
1654 				      next_pindex + next_size, FALSE);
1655 		if (prev_object->type == OBJT_SWAP)
1656 			swap_pager_freespace(prev_object,
1657 					     next_pindex, next_size);
1658 	}
1659 
1660 	/*
1661 	 * Extend the object if necessary.
1662 	 */
1663 	if (next_pindex + next_size > prev_object->size)
1664 		prev_object->size = next_pindex + next_size;
1665 	vm_object_drop(prev_object);
1666 
1667 	return (TRUE);
1668 }
1669 
1670 /*
1671  * Make the object writable and flag is being possibly dirty.
1672  *
1673  * The object might not be held (or might be held but held shared),
1674  * the related vnode is probably not held either.  Object and vnode are
1675  * stable by virtue of the vm_page busied by the caller preventing
1676  * destruction.
1677  *
1678  * If the related mount is flagged MNTK_THR_SYNC we need to call
1679  * vsetobjdirty().  Filesystems using this option usually shortcut
1680  * synchronization by only scanning the syncer list.
1681  */
1682 void
1683 vm_object_set_writeable_dirty(vm_object_t object)
1684 {
1685 	struct vnode *vp;
1686 
1687 	/*vm_object_assert_held(object);*/
1688 	/*
1689 	 * Avoid contention in vm fault path by checking the state before
1690 	 * issuing an atomic op on it.
1691 	 */
1692 	if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) !=
1693 	    (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) {
1694 		vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
1695 	}
1696 	if (object->type == OBJT_VNODE &&
1697 	    (vp = (struct vnode *)object->handle) != NULL) {
1698 		if ((vp->v_flag & VOBJDIRTY) == 0) {
1699 			if (vp->v_mount &&
1700 			    (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) {
1701 				/*
1702 				 * New style THR_SYNC places vnodes on the
1703 				 * syncer list more deterministically.
1704 				 */
1705 				vsetobjdirty(vp);
1706 			} else {
1707 				/*
1708 				 * Old style scan would not necessarily place
1709 				 * a vnode on the syncer list when possibly
1710 				 * modified via mmap.
1711 				 */
1712 				vsetflags(vp, VOBJDIRTY);
1713 			}
1714 		}
1715 	}
1716 }
1717 
1718 #include "opt_ddb.h"
1719 #ifdef DDB
1720 #include <sys/cons.h>
1721 
1722 #include <ddb/ddb.h>
1723 
1724 static int	_vm_object_in_map (vm_map_t map, vm_object_t object,
1725 				       vm_map_entry_t entry);
1726 static int	vm_object_in_map (vm_object_t object);
1727 
1728 /*
1729  * The caller must hold the object.
1730  */
1731 static int
1732 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
1733 {
1734 	vm_map_backing_t ba;
1735 	vm_map_t tmpm;
1736 	vm_map_entry_t tmpe;
1737 	int entcount;
1738 
1739 	if (map == NULL)
1740 		return 0;
1741 	if (entry == NULL) {
1742 		tmpe = RB_MIN(vm_map_rb_tree, &map->rb_root);
1743 		entcount = map->nentries;
1744 		while (entcount-- && tmpe) {
1745 			if( _vm_object_in_map(map, object, tmpe)) {
1746 				return 1;
1747 			}
1748 			tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1749 		}
1750 		return (0);
1751 	}
1752 	switch(entry->maptype) {
1753 	case VM_MAPTYPE_SUBMAP:
1754 		tmpm = entry->ba.sub_map;
1755 		tmpe = RB_MIN(vm_map_rb_tree, &tmpm->rb_root);
1756 		entcount = tmpm->nentries;
1757 		while (entcount-- && tmpe) {
1758 			if( _vm_object_in_map(tmpm, object, tmpe)) {
1759 				return 1;
1760 			}
1761 			tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1762 		}
1763 		break;
1764 	case VM_MAPTYPE_NORMAL:
1765 		ba = &entry->ba;
1766 		while (ba) {
1767 			if (ba->object == object)
1768 				return TRUE;
1769 			ba = ba->backing_ba;
1770 		}
1771 		break;
1772 	default:
1773 		break;
1774 	}
1775 	return 0;
1776 }
1777 
1778 static int vm_object_in_map_callback(struct proc *p, void *data);
1779 
1780 struct vm_object_in_map_info {
1781 	vm_object_t object;
1782 	int rv;
1783 };
1784 
1785 /*
1786  * Debugging only
1787  */
1788 static int
1789 vm_object_in_map(vm_object_t object)
1790 {
1791 	struct vm_object_in_map_info info;
1792 
1793 	info.rv = 0;
1794 	info.object = object;
1795 
1796 	allproc_scan(vm_object_in_map_callback, &info, 0);
1797 	if (info.rv)
1798 		return 1;
1799 	if( _vm_object_in_map(kernel_map, object, 0))
1800 		return 1;
1801 	if( _vm_object_in_map(pager_map, object, 0))
1802 		return 1;
1803 	if( _vm_object_in_map(buffer_map, object, 0))
1804 		return 1;
1805 	return 0;
1806 }
1807 
1808 /*
1809  * Debugging only
1810  */
1811 static int
1812 vm_object_in_map_callback(struct proc *p, void *data)
1813 {
1814 	struct vm_object_in_map_info *info = data;
1815 
1816 	if (p->p_vmspace) {
1817 		if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) {
1818 			info->rv = 1;
1819 			return -1;
1820 		}
1821 	}
1822 	return (0);
1823 }
1824 
1825 DB_SHOW_COMMAND(vmochk, vm_object_check)
1826 {
1827 	struct vm_object_hash *hash;
1828 	vm_object_t object;
1829 	int n;
1830 
1831 	/*
1832 	 * make sure that internal objs are in a map somewhere
1833 	 * and none have zero ref counts.
1834 	 */
1835 	for (n = 0; n < VMOBJ_HSIZE; ++n) {
1836 		hash = &vm_object_hash[n];
1837 		for (object = TAILQ_FIRST(&hash->list);
1838 				object != NULL;
1839 				object = TAILQ_NEXT(object, object_entry)) {
1840 			if (object->type == OBJT_MARKER)
1841 				continue;
1842 			if (object->handle != NULL ||
1843 			    (object->type != OBJT_DEFAULT &&
1844 			     object->type != OBJT_SWAP)) {
1845 				continue;
1846 			}
1847 			if (object->ref_count == 0) {
1848 				db_printf("vmochk: internal obj has "
1849 					  "zero ref count: %ld\n",
1850 					  (long)object->size);
1851 			}
1852 			if (vm_object_in_map(object))
1853 				continue;
1854 			db_printf("vmochk: internal obj is not in a map: "
1855 				  "ref: %d, size: %lu: 0x%lx\n",
1856 				  object->ref_count, (u_long)object->size,
1857 				  (u_long)object->size);
1858 		}
1859 	}
1860 }
1861 
1862 /*
1863  * Debugging only
1864  */
1865 DB_SHOW_COMMAND(object, vm_object_print_static)
1866 {
1867 	/* XXX convert args. */
1868 	vm_object_t object = (vm_object_t)addr;
1869 	boolean_t full = have_addr;
1870 
1871 	vm_page_t p;
1872 
1873 	/* XXX count is an (unused) arg.  Avoid shadowing it. */
1874 #define	count	was_count
1875 
1876 	int count;
1877 
1878 	if (object == NULL)
1879 		return;
1880 
1881 	db_iprintf(
1882 	    "Object %p: type=%d, size=0x%lx, res=%ld, ref=%d, flags=0x%x\n",
1883 	    object, (int)object->type, (u_long)object->size,
1884 	    object->resident_page_count, object->ref_count, object->flags);
1885 	/*
1886 	 * XXX no %qd in kernel.  Truncate object->backing_object_offset.
1887 	 */
1888 	db_iprintf("\n");
1889 
1890 	if (!full)
1891 		return;
1892 
1893 	db_indent += 2;
1894 	count = 0;
1895 	RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) {
1896 		if (count == 0)
1897 			db_iprintf("memory:=");
1898 		else if (count == 6) {
1899 			db_printf("\n");
1900 			db_iprintf(" ...");
1901 			count = 0;
1902 		} else
1903 			db_printf(",");
1904 		count++;
1905 
1906 		db_printf("(off=0x%lx,page=0x%lx)",
1907 		    (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
1908 	}
1909 	if (count != 0)
1910 		db_printf("\n");
1911 	db_indent -= 2;
1912 }
1913 
1914 /* XXX. */
1915 #undef count
1916 
1917 /*
1918  * XXX need this non-static entry for calling from vm_map_print.
1919  *
1920  * Debugging only
1921  */
1922 void
1923 vm_object_print(/* db_expr_t */ long addr,
1924 		boolean_t have_addr,
1925 		/* db_expr_t */ long count,
1926 		char *modif)
1927 {
1928 	vm_object_print_static(addr, have_addr, count, modif);
1929 }
1930 
1931 /*
1932  * Debugging only
1933  */
1934 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
1935 {
1936 	struct vm_object_hash *hash;
1937 	vm_object_t object;
1938 	int nl = 0;
1939 	int c;
1940 	int n;
1941 
1942 	for (n = 0; n < VMOBJ_HSIZE; ++n) {
1943 		hash = &vm_object_hash[n];
1944 		for (object = TAILQ_FIRST(&hash->list);
1945 				object != NULL;
1946 				object = TAILQ_NEXT(object, object_entry)) {
1947 			vm_pindex_t idx, fidx;
1948 			vm_pindex_t osize;
1949 			vm_paddr_t pa = -1, padiff;
1950 			int rcount;
1951 			vm_page_t m;
1952 
1953 			if (object->type == OBJT_MARKER)
1954 				continue;
1955 			db_printf("new object: %p\n", (void *)object);
1956 			if ( nl > 18) {
1957 				c = cngetc();
1958 				if (c != ' ')
1959 					return;
1960 				nl = 0;
1961 			}
1962 			nl++;
1963 			rcount = 0;
1964 			fidx = 0;
1965 			osize = object->size;
1966 			if (osize > 128)
1967 				osize = 128;
1968 			for (idx = 0; idx < osize; idx++) {
1969 				m = vm_page_lookup(object, idx);
1970 				if (m == NULL) {
1971 					if (rcount) {
1972 						db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
1973 							(long)fidx, rcount, (long)pa);
1974 						if ( nl > 18) {
1975 							c = cngetc();
1976 							if (c != ' ')
1977 								return;
1978 							nl = 0;
1979 						}
1980 						nl++;
1981 						rcount = 0;
1982 					}
1983 					continue;
1984 				}
1985 
1986 				if (rcount &&
1987 					(VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
1988 					++rcount;
1989 					continue;
1990 				}
1991 				if (rcount) {
1992 					padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m);
1993 					padiff >>= PAGE_SHIFT;
1994 					padiff &= PQ_L2_MASK;
1995 					if (padiff == 0) {
1996 						pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE;
1997 						++rcount;
1998 						continue;
1999 					}
2000 					db_printf(" index(%ld)run(%d)pa(0x%lx)",
2001 						(long)fidx, rcount, (long)pa);
2002 					db_printf("pd(%ld)\n", (long)padiff);
2003 					if ( nl > 18) {
2004 						c = cngetc();
2005 						if (c != ' ')
2006 							return;
2007 						nl = 0;
2008 					}
2009 					nl++;
2010 				}
2011 				fidx = idx;
2012 				pa = VM_PAGE_TO_PHYS(m);
2013 				rcount = 1;
2014 			}
2015 			if (rcount) {
2016 				db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
2017 					(long)fidx, rcount, (long)pa);
2018 				if ( nl > 18) {
2019 					c = cngetc();
2020 					if (c != ' ')
2021 						return;
2022 					nl = 0;
2023 				}
2024 				nl++;
2025 			}
2026 		}
2027 	}
2028 }
2029 #endif /* DDB */
2030