xref: /dragonfly/sys/vm/vm_object.c (revision b866b1da)
1 /*
2  * Copyright (c) 1991, 1993, 2013
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * The Mach Operating System project at Carnegie-Mellon University.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	from: @(#)vm_object.c	8.5 (Berkeley) 3/22/94
33  *
34  *
35  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
36  * All rights reserved.
37  *
38  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
39  *
40  * Permission to use, copy, modify and distribute this software and
41  * its documentation is hereby granted, provided that both the copyright
42  * notice and this permission notice appear in all copies of the
43  * software, derivative works or modified versions, and any portions
44  * thereof, and that both notices appear in supporting documentation.
45  *
46  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
49  *
50  * Carnegie Mellon requests users of this software to return to
51  *
52  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
53  *  School of Computer Science
54  *  Carnegie Mellon University
55  *  Pittsburgh PA 15213-3890
56  *
57  * any improvements or extensions that they make and grant Carnegie the
58  * rights to redistribute these changes.
59  *
60  * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $
61  */
62 
63 /*
64  *	Virtual memory object module.
65  */
66 
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/proc.h>		/* for curproc, pageproc */
70 #include <sys/thread.h>
71 #include <sys/vnode.h>
72 #include <sys/vmmeter.h>
73 #include <sys/mman.h>
74 #include <sys/mount.h>
75 #include <sys/kernel.h>
76 #include <sys/malloc.h>
77 #include <sys/sysctl.h>
78 #include <sys/refcount.h>
79 
80 #include <vm/vm.h>
81 #include <vm/vm_param.h>
82 #include <vm/pmap.h>
83 #include <vm/vm_map.h>
84 #include <vm/vm_object.h>
85 #include <vm/vm_page.h>
86 #include <vm/vm_pageout.h>
87 #include <vm/vm_pager.h>
88 #include <vm/swap_pager.h>
89 #include <vm/vm_kern.h>
90 #include <vm/vm_extern.h>
91 #include <vm/vm_zone.h>
92 
93 #include <vm/vm_page2.h>
94 
95 #include <machine/specialreg.h>
96 
97 #define EASY_SCAN_FACTOR	8
98 
99 static void	vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
100 					     int pagerflags);
101 static void	vm_object_lock_init(vm_object_t);
102 
103 /*
104  *	Virtual memory objects maintain the actual data
105  *	associated with allocated virtual memory.  A given
106  *	page of memory exists within exactly one object.
107  *
108  *	An object is only deallocated when all "references"
109  *	are given up.  Only one "reference" to a given
110  *	region of an object should be writeable.
111  *
112  *	Associated with each object is a list of all resident
113  *	memory pages belonging to that object; this list is
114  *	maintained by the "vm_page" module, and locked by the object's
115  *	lock.
116  *
117  *	Each object also records a "pager" routine which is
118  *	used to retrieve (and store) pages to the proper backing
119  *	storage.  In addition, objects may be backed by other
120  *	objects from which they were virtual-copied.
121  *
122  *	The only items within the object structure which are
123  *	modified after time of creation are:
124  *		reference count		locked by object's lock
125  *		pager routine		locked by object's lock
126  *
127  */
128 
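/*
 * Illustrative sketch only (not part of the original source): the typical
 * object lifecycle implied by the comment above.  A new object is created
 * with one reference, additional references are added while the object is
 * held, and the final vm_object_deallocate() triggers vm_object_terminate().
 */
#if 0
static void
vm_object_lifecycle_sketch(void)
{
	vm_object_t obj;

	obj = vm_object_allocate(OBJT_DEFAULT, 16);	/* ref_count == 1 */
	vm_object_hold(obj);			/* token + hold_count */
	vm_object_reference_locked(obj);	/* ref_count == 2 */
	vm_object_drop(obj);			/* release token + hold_count */
	vm_object_deallocate(obj);		/* ref_count 2 -> 1 */
	vm_object_deallocate(obj);		/* 1 -> 0, object terminated */
}
#endif
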
129 static struct vm_object kernel_object_store;
130 struct vm_object *kernel_object = &kernel_object_store;
131 
132 struct vm_object_hash vm_object_hash[VMOBJ_HSIZE];
133 
134 static MALLOC_DEFINE_OBJ(M_VM_OBJECT, sizeof(struct vm_object),
135 		"vm_object", "vm_object structures");
136 
137 #define VMOBJ_HASH_PRIME1	66555444443333333ULL
138 #define VMOBJ_HASH_PRIME2	989042931893ULL
139 
140 int vm_object_debug;
141 SYSCTL_INT(_vm, OID_AUTO, object_debug, CTLFLAG_RW, &vm_object_debug, 0, "");
142 
143 static __inline
144 struct vm_object_hash *
145 vmobj_hash(vm_object_t obj)
146 {
147 	uintptr_t hash1;
148 	uintptr_t hash2;
149 
150 	hash1 = (uintptr_t)obj + ((uintptr_t)obj >> 18);
151 	hash1 %= VMOBJ_HASH_PRIME1;
152 	hash2 = ((uintptr_t)obj >> 8) + ((uintptr_t)obj >> 24);
153 	hash2 %= VMOBJ_HASH_PRIME2;
154 	return (&vm_object_hash[(hash1 ^ hash2) & VMOBJ_HMASK]);
155 }
156 
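/*
 * Illustrative sketch (not original code): list manipulation on a hash
 * bucket is serialized with the per-bucket token, exactly as done by
 * _vm_object_allocate() and vm_object_terminate() below.
 */
#if 0
	struct vm_object_hash *hash;

	hash = vmobj_hash(object);
	lwkt_gettoken(&hash->token);
	TAILQ_INSERT_TAIL(&hash->list, object, object_entry);
	lwkt_reltoken(&hash->token);
#endif
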
157 #if defined(DEBUG_LOCKS)
158 
159 #define vm_object_vndeallocate(obj, vpp)	\
160                 debugvm_object_vndeallocate(obj, vpp, __FILE__, __LINE__)
161 
162 /*
163  * Debug helper to track hold/drop/ref/deallocate calls.
164  */
165 static void
166 debugvm_object_add(vm_object_t obj, char *file, int line, int addrem)
167 {
168 	int i;
169 
170 	i = atomic_fetchadd_int(&obj->debug_index, 1);
171 	i = i & (VMOBJ_DEBUG_ARRAY_SIZE - 1);
172 	ksnprintf(obj->debug_hold_thrs[i],
173 		  sizeof(obj->debug_hold_thrs[i]),
174 		  "%c%d:(%d):%s",
175 		  (addrem == -1 ? '-' : (addrem == 1 ? '+' : '=')),
176 		  (curthread->td_proc ? curthread->td_proc->p_pid : -1),
177 		  obj->ref_count,
178 		  curthread->td_comm);
179 	obj->debug_hold_file[i] = file;
180 	obj->debug_hold_line[i] = line;
181 #if 0
182 	/* Uncomment for debugging obj refs/derefs in reproducible cases */
183 	if (strcmp(curthread->td_comm, "sshd") == 0) {
184 		kprintf("%d %p refs=%d ar=%d file: %s/%d\n",
185 			(curthread->td_proc ? curthread->td_proc->p_pid : -1),
186 			obj, obj->ref_count, addrem, file, line);
187 	}
188 #endif
189 }
190 
191 #endif
192 
193 /*
194  * Misc low level routines
195  */
196 static void
197 vm_object_lock_init(vm_object_t obj)
198 {
199 #if defined(DEBUG_LOCKS)
200 	int i;
201 
202 	obj->debug_index = 0;
203 	for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) {
204 		obj->debug_hold_thrs[i][0] = 0;
205 		obj->debug_hold_file[i] = NULL;
206 		obj->debug_hold_line[i] = 0;
207 	}
208 #endif
209 }
210 
211 void
212 vm_object_lock_swap(void)
213 {
214 	lwkt_token_swap();
215 }
216 
217 void
218 vm_object_lock(vm_object_t obj)
219 {
220 	lwkt_gettoken(&obj->token);
221 }
222 
223 /*
224  * Returns TRUE on success
225  */
226 static int
227 vm_object_lock_try(vm_object_t obj)
228 {
229 	return(lwkt_trytoken(&obj->token));
230 }
231 
232 void
233 vm_object_lock_shared(vm_object_t obj)
234 {
235 	lwkt_gettoken_shared(&obj->token);
236 }
237 
238 void
239 vm_object_unlock(vm_object_t obj)
240 {
241 	lwkt_reltoken(&obj->token);
242 }
243 
244 void
245 vm_object_upgrade(vm_object_t obj)
246 {
247 	lwkt_reltoken(&obj->token);
248 	lwkt_gettoken(&obj->token);
249 }
250 
251 void
252 vm_object_downgrade(vm_object_t obj)
253 {
254 	lwkt_reltoken(&obj->token);
255 	lwkt_gettoken_shared(&obj->token);
256 }
257 
258 static __inline void
259 vm_object_assert_held(vm_object_t obj)
260 {
261 	ASSERT_LWKT_TOKEN_HELD(&obj->token);
262 }
263 
264 /*
265  * Acquire a semi-random base page color for a new object.  Our main concern
266  * is that the color be spread out a bit.  Further spreading out occurs in
267  * bio_page_alloc().
268  */
269 int
270 vm_quickcolor(void)
271 {
272 	globaldata_t gd = mycpu;
273 	int pg_color;
274 
275 	pg_color = (int)(intptr_t)gd->gd_curthread >> 10;
276 	pg_color += gd->gd_quick_color;
277 	gd->gd_quick_color += PQ_PRIME2;
278 
279 	return pg_color;
280 }
281 
282 void
283 VMOBJDEBUG(vm_object_hold)(vm_object_t obj VMOBJDBARGS)
284 {
285 	KKASSERT(obj != NULL);
286 
287 	/*
288 	 * Object must be held (object allocation is stable due to the caller's
289 	 * context, typically already holding the token on a parent object)
290 	 * prior to potentially blocking on the lock, otherwise the object
291 	 * can get ripped away from us.
292 	 */
293 	refcount_acquire(&obj->hold_count);
294 	vm_object_lock(obj);
295 
296 #if defined(DEBUG_LOCKS)
297 	debugvm_object_add(obj, file, line, 1);
298 #endif
299 }
300 
301 int
302 VMOBJDEBUG(vm_object_hold_try)(vm_object_t obj VMOBJDBARGS)
303 {
304 	KKASSERT(obj != NULL);
305 
306 	/*
307 	 * Object must be held (object allocation is stable due to the caller's
308 	 * context, typically already holding the token on a parent object)
309 	 * prior to potentially blocking on the lock, otherwise the object
310 	 * can get ripped away from us.
311 	 */
312 	refcount_acquire(&obj->hold_count);
313 	if (vm_object_lock_try(obj) == 0) {
314 		if (refcount_release(&obj->hold_count)) {
315 			if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD))
316 				kfree_obj(obj, M_VM_OBJECT);
317 		}
318 		return(0);
319 	}
320 
321 #if defined(DEBUG_LOCKS)
322 	debugvm_object_add(obj, file, line, 1);
323 #endif
324 	return(1);
325 }
326 
327 void
328 VMOBJDEBUG(vm_object_hold_shared)(vm_object_t obj VMOBJDBARGS)
329 {
330 	KKASSERT(obj != NULL);
331 
332 	/*
333 	 * Object must be held (object allocation is stable due to the caller's
334 	 * context, typically already holding the token on a parent object)
335 	 * prior to potentially blocking on the lock, otherwise the object
336 	 * can get ripped away from us.
337 	 */
338 	refcount_acquire(&obj->hold_count);
339 	vm_object_lock_shared(obj);
340 
341 #if defined(DEBUG_LOCKS)
342 	debugvm_object_add(obj, file, line, 1);
343 #endif
344 }
345 
346 /*
347  * Drop the token and hold_count on the object.
348  *
349  * WARNING! Token might be shared.
350  */
351 void
352 VMOBJDEBUG(vm_object_drop)(vm_object_t obj VMOBJDBARGS)
353 {
354 	if (obj == NULL)
355 		return;
356 
357 	/*
358 	 * No new holders should be possible once we drop hold_count 1->0 as
359 	 * there is no longer any way to reference the object.
360 	 */
361 	KKASSERT(obj->hold_count > 0);
362 	if (refcount_release(&obj->hold_count)) {
363 #if defined(DEBUG_LOCKS)
364 		debugvm_object_add(obj, file, line, -1);
365 #endif
366 
367 		if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) {
368 			vm_object_unlock(obj);
369 			kfree_obj(obj, M_VM_OBJECT);
370 		} else {
371 			vm_object_unlock(obj);
372 		}
373 	} else {
374 #if defined(DEBUG_LOCKS)
375 		debugvm_object_add(obj, file, line, -1);
376 #endif
377 		vm_object_unlock(obj);
378 	}
379 }
380 
381 /*
382  * Initialize a freshly allocated object, returning a held object.
383  *
384  * Used only by vm_object_allocate(), zinitna() and vm_object_init().
385  *
386  * No requirements.
387  */
388 void
389 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object,
390 		    const char *ident)
391 {
392 	struct vm_object_hash *hash;
393 
394 	RB_INIT(&object->rb_memq);
395 	lwkt_token_init(&object->token, ident);
396 
397 	TAILQ_INIT(&object->backing_list);
398 	lockinit(&object->backing_lk, "baclk", 0, 0);
399 
400 	object->type = type;
401 	object->size = size;
402 	object->ref_count = 1;
403 	object->memattr = VM_MEMATTR_DEFAULT;
404 	object->hold_count = 0;
405 	object->flags = 0;
406 	if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
407 		vm_object_set_flag(object, OBJ_ONEMAPPING);
408 	object->paging_in_progress = 0;
409 	object->resident_page_count = 0;
410 	/* cpu localization twist */
411 	object->pg_color = vm_quickcolor();
412 	object->handle = NULL;
413 
414 	atomic_add_int(&object->generation, 1);
415 	object->swblock_count = 0;
416 	RB_INIT(&object->swblock_root);
417 	vm_object_lock_init(object);
418 	pmap_object_init(object);
419 
420 	vm_object_hold(object);
421 
422 	hash = vmobj_hash(object);
423 	lwkt_gettoken(&hash->token);
424 	TAILQ_INSERT_TAIL(&hash->list, object, object_entry);
425 	lwkt_reltoken(&hash->token);
426 }
427 
428 /*
429  * Initialize a VM object.
430  */
431 void
432 vm_object_init(vm_object_t object, vm_pindex_t size)
433 {
434 	_vm_object_allocate(OBJT_DEFAULT, size, object, "vmobj");
435 	vm_object_drop(object);
436 }
437 
438 /*
439  * Initialize the VM objects module.
440  *
441  * Called from the low level boot code only.  Note that this occurs before
442  * kmalloc is initialized so we cannot allocate any VM objects.
443  */
444 void
445 vm_object_init1(void)
446 {
447 	int i;
448 
449 	for (i = 0; i < VMOBJ_HSIZE; ++i) {
450 		TAILQ_INIT(&vm_object_hash[i].list);
451 		lwkt_token_init(&vm_object_hash[i].token, "vmobjlst");
452 	}
453 
454 	_vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd),
455 			    kernel_object, "kobj");
456 	vm_object_drop(kernel_object);
457 }
458 
459 void
460 vm_object_init2(void)
461 {
462 	kmalloc_obj_set_unlimited(M_VM_OBJECT);
463 }
464 
465 /*
466  * Allocate and return a new object of the specified type and size.
467  *
468  * No requirements.
469  */
470 vm_object_t
471 vm_object_allocate(objtype_t type, vm_pindex_t size)
472 {
473 	vm_object_t obj;
474 
475 	obj = kmalloc_obj(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
476 	_vm_object_allocate(type, size, obj, "vmobj");
477 	vm_object_drop(obj);
478 
479 	return (obj);
480 }
481 
482 /*
483  * This version returns a held object, allowing further atomic initialization
484  * of the object.
485  */
486 vm_object_t
487 vm_object_allocate_hold(objtype_t type, vm_pindex_t size)
488 {
489 	vm_object_t obj;
490 
491 	obj = kmalloc_obj(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
492 	_vm_object_allocate(type, size, obj, "vmobj");
493 
494 	return (obj);
495 }
496 
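/*
 * Illustrative sketch (not original code): vm_object_allocate_hold() lets
 * the caller finish initializing the object before dropping the hold.
 * 'npages' and 'my_handle' are hypothetical placeholders.
 */
#if 0
	vm_object_t obj;

	obj = vm_object_allocate_hold(OBJT_DEFAULT, npages);
	obj->handle = my_handle;	/* hypothetical private setup */
	vm_object_drop(obj);
#endif
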
497 /*
498  * Add an additional reference to a vm_object.  The object must already be
499  * held.  The original non-lock version is no longer supported.  The object
500  * must NOT be chain locked by anyone at the time the reference is added.
501  *
502  * The object must be held, but may be held shared if desired (hence why
503  * we use an atomic op).
504  */
505 void
506 VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS)
507 {
508 	KKASSERT(object != NULL);
509 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
510 	atomic_add_int(&object->ref_count, 1);
511 	if (object->type == OBJT_VNODE) {
512 		vref(object->handle);
513 		/* XXX what if the vnode is being destroyed? */
514 	}
515 #if defined(DEBUG_LOCKS)
516 	debugvm_object_add(object, file, line, 1);
517 #endif
518 }
519 
520 /*
521  * This version is only allowed in situations where the caller
522  * already knows that the object is deterministically referenced
523  * (usually because it's taken from a ref'd vnode, or during a map_entry
524  * replication).
525  */
526 void
527 VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS)
528 {
529 	KKASSERT(object->type == OBJT_VNODE || object->ref_count > 0);
530 	atomic_add_int(&object->ref_count, 1);
531 	if (object->type == OBJT_VNODE)
532 		vref(object->handle);
533 #if defined(DEBUG_LOCKS)
534 	debugvm_object_add(object, file, line, 1);
535 #endif
536 }
537 
538 /*
539  * Dereference an object and its underlying vnode.  The object may be
540  * held shared.  On return the object will remain held.
541  *
542  * This function may return a vnode in *vpp which the caller must release
543  * after the caller drops its own lock.  If vpp is NULL, we assume that
544  * the caller was holding an exclusive lock on the object and we vrele()
545  * the vp ourselves.
546  */
547 static void
548 VMOBJDEBUG(vm_object_vndeallocate)(vm_object_t object, struct vnode **vpp
549 				   VMOBJDBARGS)
550 {
551 	struct vnode *vp = (struct vnode *) object->handle;
552 	int count;
553 
554 	KASSERT(object->type == OBJT_VNODE,
555 	    ("vm_object_vndeallocate: not a vnode object"));
556 	KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
557 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
558 #ifdef INVARIANTS
559 	if (object->ref_count == 0) {
560 		vprint("vm_object_vndeallocate", vp);
561 		panic("vm_object_vndeallocate: bad object reference count");
562 	}
563 #endif
564 	count = object->ref_count;
565 	cpu_ccfence();
566 	for (;;) {
567 		if (count == 1) {
568 			vm_object_upgrade(object);
569 			if (atomic_fcmpset_int(&object->ref_count, &count, 0)) {
570 				vclrflags(vp, VTEXT);
571 				break;
572 			}
573 		} else {
574 			if (atomic_fcmpset_int(&object->ref_count,
575 					       &count, count - 1)) {
576 				break;
577 			}
578 		}
579 		cpu_pause();
580 		/* retry */
581 	}
582 #if defined(DEBUG_LOCKS)
583 	debugvm_object_add(object, file, line, -1);
584 #endif
585 
586 	/*
587 	 * vrele or return the vp to vrele.  We can only safely vrele(vp)
588 	 * if the object was locked exclusively.  But there are two races
589 	 * here.
590 	 *
591 	 * We had to upgrade the object above to safely clear VTEXT
592 	 * but the alternative path where the shared lock is retained
593 	 * can STILL race to 0 in other paths and cause our own vrele()
594 	 * to terminate the vnode.  We can't allow that if the VM object
595 	 * is still locked shared.
596 	 */
597 	if (vpp)
598 		*vpp = vp;
599 	else
600 		vrele(vp);
601 }
602 
603 /*
604  * Release a reference to the specified object, gained either through a
605  * vm_object_allocate or a vm_object_reference call.  When all references
606  * are gone, storage associated with this object may be relinquished.
607  *
608  * The caller does not have to hold the object locked but must have control
609  * over the reference in question in order to guarantee that the object
610  * does not get ripped out from under us.
611  *
612  * XXX Currently all deallocations require an exclusive lock.
613  */
614 void
615 VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS)
616 {
617 	struct vnode *vp;
618 	int count;
619 
620 	if (object == NULL)
621 		return;
622 
623 	count = object->ref_count;
624 	cpu_ccfence();
625 	for (;;) {
626 		/*
627 		 * If decrementing the count enters into special handling
628 		 * territory (0, 1, or 2) we have to do it the hard way.
629 		 * Fortunately though, objects with only a few refs like this
630 		 * are not likely to be heavily contended anyway.
631 		 *
632 		 * For vnode objects we only care about 1->0 transitions.
633 		 */
634 		if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) {
635 #if defined(DEBUG_LOCKS)
636 			debugvm_object_add(object, file, line, 0);
637 #endif
638 			vm_object_hold(object);
639 			vm_object_deallocate_locked(object);
640 			vm_object_drop(object);
641 			break;
642 		}
643 
644 		/*
645 		 * Try to decrement ref_count without acquiring a hold on
646 		 * the object.  This is particularly important for the exec*()
647 		 * and exit*() code paths because the program binary may
648 		 * have a great deal of sharing and an exclusive lock will
649 		 * crowbar performance in those circumstances.
650 		 */
651 		if (object->type == OBJT_VNODE) {
652 			vp = (struct vnode *)object->handle;
653 			if (atomic_fcmpset_int(&object->ref_count,
654 					       &count, count - 1)) {
655 #if defined(DEBUG_LOCKS)
656 				debugvm_object_add(object, file, line, -1);
657 #endif
658 
659 				vrele(vp);
660 				break;
661 			}
662 			/* retry */
663 		} else {
664 			if (atomic_fcmpset_int(&object->ref_count,
665 					       &count, count - 1)) {
666 #if defined(DEBUG_LOCKS)
667 				debugvm_object_add(object, file, line, -1);
668 #endif
669 				break;
670 			}
671 			/* retry */
672 		}
673 		cpu_pause();
674 		/* retry */
675 	}
676 }
677 
678 void
679 VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS)
680 {
681 	/*
682 	 * Degenerate case
683 	 */
684 	if (object == NULL)
685 		return;
686 
687 	/*
688 	 * vnode case, caller either locked the object exclusively
689 	 * or this is a recursion with must_drop != 0 and the vnode
690 	 * object will be locked shared.
691 	 *
692 	 * If locked shared we have to drop the object before we can
693 	 * call vrele() or risk a shared/exclusive livelock.
694 	 */
695 	if (object->type == OBJT_VNODE) {
696 		ASSERT_LWKT_TOKEN_HELD(&object->token);
697 		vm_object_vndeallocate(object, NULL);
698 		return;
699 	}
700 	ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token);
701 
702 	/*
703 	 * Normal case (object is locked exclusively)
704 	 */
705 	if (object->ref_count == 0) {
706 		panic("vm_object_deallocate: object deallocated "
707 		      "too many times: %d", object->type);
708 	}
709 	if (object->ref_count > 2) {
710 		atomic_add_int(&object->ref_count, -1);
711 #if defined(DEBUG_LOCKS)
712 		debugvm_object_add(object, file, line, -1);
713 #endif
714 		return;
715 	}
716 
717 	/*
718 	 * Drop the ref and handle termination on the 1->0 transition.
719 	 * We may have blocked above so we have to recheck.
720 	 */
721 	KKASSERT(object->ref_count != 0);
722 	if (object->ref_count >= 2) {
723 		atomic_add_int(&object->ref_count, -1);
724 #if defined(DEBUG_LOCKS)
725 		debugvm_object_add(object, file, line, -1);
726 #endif
727 		return;
728 	}
729 
730 	atomic_add_int(&object->ref_count, -1);
731 	if ((object->flags & OBJ_DEAD) == 0)
732 		vm_object_terminate(object);
733 }
734 
735 /*
736  * Destroy the specified object, freeing up related resources.
737  *
738  * The object must have zero references.
739  *
740  * The object must be held.  The caller is responsible for dropping the object
741  * after terminate returns.  Terminate does NOT drop the object.
742  */
743 static int vm_object_terminate_callback(vm_page_t p, void *data);
744 
745 void
746 vm_object_terminate(vm_object_t object)
747 {
748 	struct rb_vm_page_scan_info info;
749 	struct vm_object_hash *hash;
750 
751 	/*
752 	 * Make sure no one uses us.  Once we set OBJ_DEAD we should be
753 	 * able to safely block.
754 	 */
755 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
756 	KKASSERT((object->flags & OBJ_DEAD) == 0);
757 	vm_object_set_flag(object, OBJ_DEAD);
758 
759 	/*
760 	 * Wait for the pageout daemon to be done with the object
761 	 */
762 	vm_object_pip_wait(object, "objtrm1");
763 
764 	KASSERT(!object->paging_in_progress,
765 		("vm_object_terminate: pageout in progress"));
766 
767 	/*
768 	 * Clean and free the pages, as appropriate. All references to the
769 	 * object are gone, so we don't need to lock it.
770 	 */
771 	if (object->type == OBJT_VNODE) {
772 		struct vnode *vp;
773 
774 		/*
775 		 * Clean pages and flush buffers.
776 		 *
777 		 * NOTE!  TMPFS buffer flushes do not typically flush the
778 		 *	  actual page to swap as this would be highly
779 		 *	  inefficient, and normal filesystems usually wrap
780 		 *	  page flushes with buffer cache buffers.
781 		 *
782 		 *	  To deal with this we have to call vinvalbuf() both
783 		 *	  before and after the vm_object_page_clean().
784 		 */
785 		vp = (struct vnode *) object->handle;
786 		vinvalbuf(vp, V_SAVE, 0, 0);
787 		vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
788 		vinvalbuf(vp, V_SAVE, 0, 0);
789 	}
790 
791 	/*
792 	 * Wait for any I/O to complete, after which there had better not
793 	 * be any references left on the object.
794 	 */
795 	vm_object_pip_wait(object, "objtrm2");
796 
797 	if (object->ref_count != 0) {
798 		panic("vm_object_terminate: object with references, "
799 		      "ref_count=%d", object->ref_count);
800 	}
801 
802 	/*
803 	 * Cleanup any shared pmaps associated with this object.
804 	 */
805 	pmap_object_free(object);
806 
807 	/*
808 	 * Now free any remaining pages. For internal objects, this also
809 	 * removes them from paging queues. Don't free wired pages, just
810 	 * remove them from the object.
811 	 */
812 	info.count = 0;
813 	info.object = object;
814 	do {
815 		info.error = 0;
816 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
817 					vm_object_terminate_callback, &info);
818 	} while (info.error);
819 
820 	/*
821 	 * Let the pager know object is dead.
822 	 */
823 	vm_pager_deallocate(object);
824 
825 	/*
826 	 * Wait for the object hold count to hit 1, clean out pages as
827 	 * we go.  vmobj_token interlocks any race conditions that might
828 	 * pick the object up from the vm_object_list after we have cleared
829 	 * rb_memq.
830 	 */
831 	for (;;) {
832 		if (RB_ROOT(&object->rb_memq) == NULL)
833 			break;
834 		kprintf("vm_object_terminate: Warning, object %p "
835 			"still has %ld pages\n",
836 			object, object->resident_page_count);
837 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
838 					vm_object_terminate_callback, &info);
839 	}
840 
841 	/*
842 	 * There had better not be any pages left
843 	 */
844 	KKASSERT(object->resident_page_count == 0);
845 
846 	/*
847 	 * Remove the object from the global object list.
848 	 */
849 	hash = vmobj_hash(object);
850 	lwkt_gettoken(&hash->token);
851 	TAILQ_REMOVE(&hash->list, object, object_entry);
852 	lwkt_reltoken(&hash->token);
853 
854 	if (object->ref_count != 0) {
855 		panic("vm_object_terminate2: object with references, "
856 		      "ref_count=%d", object->ref_count);
857 	}
858 
859 	/*
860 	 * NOTE: The object hold_count is at least 1, so we cannot kfree()
861 	 *	 the object here.  See vm_object_drop().
862 	 */
863 }
864 
865 /*
866  * The caller must hold the object.
867  *
868  * NOTE: It is possible for vm_pages to remain flagged PG_MAPPED
869  *	 or PG_MAPPED|PG_WRITEABLE, even after pmap_mapped_sync()
870  *	 is called, due to normal pmap operations.  This is because only
871  *	 global pmap operations on the vm_page can clear the bits and not
872  *	 just local operations on individual pmaps.
873  *
874  *	 Most interactions that necessitate the clearing of these bits
875  *	 proactively call vm_page_protect(), and we must do so here as well.
876  */
877 static int
878 vm_object_terminate_callback(vm_page_t p, void *data)
879 {
880 	struct rb_vm_page_scan_info *info = data;
881 	vm_object_t object;
882 
883 	object = p->object;
884 	KKASSERT(object == info->object);
885 	if (vm_page_busy_try(p, TRUE)) {
886 		vm_page_sleep_busy(p, TRUE, "vmotrm");
887 		info->error = 1;
888 		return 0;
889 	}
890 	if (object != p->object) {
891 		/* XXX remove once we determine it can't happen */
892 		kprintf("vm_object_terminate: Warning: Encountered "
893 			"busied page %p on queue %d\n", p, p->queue);
894 		vm_page_wakeup(p);
895 		info->error = 1;
896 	} else if (p->wire_count == 0) {
897 		/*
898 		 * NOTE: p->dirty and PG_NEED_COMMIT are ignored.
899 		 */
900 		if (pmap_mapped_sync(p) & (PG_MAPPED | PG_WRITEABLE))
901 			vm_page_protect(p, VM_PROT_NONE);
902 		vm_page_free(p);
903 		mycpu->gd_cnt.v_pfree++;
904 	} else {
905 		if (p->queue != PQ_NONE) {
906 			kprintf("vm_object_terminate: Warning: Encountered "
907 				"wired page %p on queue %d\n", p, p->queue);
908 			if (vm_object_debug > 0) {
909 				--vm_object_debug;
910 				print_backtrace(10);
911 			}
912 		}
913 		if (pmap_mapped_sync(p) & (PG_MAPPED | PG_WRITEABLE))
914 			vm_page_protect(p, VM_PROT_NONE);
915 		vm_page_remove(p);
916 		vm_page_wakeup(p);
917 	}
918 
919 	/*
920 	 * Must be at end to avoid SMP races, caller holds object token
921 	 */
922 	if ((++info->count & 63) == 0)
923 		lwkt_user_yield();
924 	return(0);
925 }
926 
927 /*
928  * Clean all dirty pages in the specified range of the object.  Leaves each
929  * page on whatever queue it is currently on.  If NOSYNC is set then do not
930  * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
931  * leaving the object dirty.
932  *
933  * When stuffing pages asynchronously, allow clustering.  XXX we need a
934  * synchronous clustering mode implementation.
935  *
936  * Odd semantics: if start == end, we clean everything.
937  *
938  * The object must be locked? XXX
939  */
940 static int vm_object_page_clean_pass1(struct vm_page *p, void *data);
941 static int vm_object_page_clean_pass2(struct vm_page *p, void *data);
942 
943 void
944 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
945 		     int flags)
946 {
947 	struct rb_vm_page_scan_info info;
948 	struct vnode *vp;
949 	int wholescan;
950 	int pagerflags;
951 	int generation;
952 
953 	vm_object_hold(object);
954 	if (object->type != OBJT_VNODE ||
955 	    (object->flags & OBJ_MIGHTBEDIRTY) == 0) {
956 		vm_object_drop(object);
957 		return;
958 	}
959 
960 	pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ?
961 			OBJPC_SYNC : OBJPC_CLUSTER_OK;
962 	pagerflags |= (flags & OBJPC_INVAL) ? OBJPC_INVAL : 0;
963 
964 	vp = object->handle;
965 
966 	/*
967 	 * Interlock other major object operations.  This allows us to
968 	 * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
969 	 */
970 	vm_object_set_flag(object, OBJ_CLEANING);
971 
972 	/*
973 	 * Handle 'entire object' case
974 	 */
975 	info.start_pindex = start;
976 	if (end == 0) {
977 		info.end_pindex = object->size - 1;
978 	} else {
979 		info.end_pindex = end - 1;
980 	}
981 	wholescan = (start == 0 && info.end_pindex == object->size - 1);
982 	info.limit = flags;
983 	info.pagerflags = pagerflags;
984 	info.object = object;
985 
986 	/*
987 	 * If cleaning the entire object do a pass to mark the pages read-only.
988 	 * If everything worked out ok, clear OBJ_WRITEABLE and
989 	 * OBJ_MIGHTBEDIRTY.
990 	 */
991 	if (wholescan) {
992 		info.error = 0;
993 		info.count = 0;
994 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
995 					vm_object_page_clean_pass1, &info);
996 		if (info.error == 0) {
997 			vm_object_clear_flag(object,
998 					     OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
999 			if (object->type == OBJT_VNODE &&
1000 			    (vp = (struct vnode *)object->handle) != NULL) {
1001 				/*
1002 				 * Use new-style interface to clear VISDIRTY
1003 				 * because the vnode is not necessarily removed
1004 				 * from the syncer list(s) as often as it was
1005 				 * under the old interface, which can leave
1006 				 * the vnode on the syncer list after reclaim.
1007 				 */
1008 				vclrobjdirty(vp);
1009 			}
1010 		}
1011 	}
1012 
1013 	/*
1014 	 * Do a pass to clean all the dirty pages we find.
1015 	 */
1016 	do {
1017 		info.error = 0;
1018 		info.count = 0;
1019 		generation = object->generation;
1020 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1021 					vm_object_page_clean_pass2, &info);
1022 	} while (info.error || generation != object->generation);
1023 
1024 	vm_object_clear_flag(object, OBJ_CLEANING);
1025 	vm_object_drop(object);
1026 }
1027 
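/*
 * Illustrative sketch (not original code): a synchronous whole-object
 * flush, as issued from vm_object_terminate() above, relies on the
 * "end == 0 means clean everything" convention:
 */
#if 0
	vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
#endif
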
1028 /*
1029  * The caller must hold the object.
1030  */
1031 static
1032 int
1033 vm_object_page_clean_pass1(struct vm_page *p, void *data)
1034 {
1035 	struct rb_vm_page_scan_info *info = data;
1036 
1037 	KKASSERT(p->object == info->object);
1038 
1039 	vm_page_flag_set(p, PG_CLEANCHK);
1040 	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1041 		info->error = 1;
1042 	} else if (vm_page_busy_try(p, FALSE)) {
1043 		info->error = 1;
1044 	} else {
1045 		KKASSERT(p->object == info->object);
1046 		vm_page_protect(p, VM_PROT_READ);
1047 		vm_page_wakeup(p);
1048 	}
1049 
1050 	/*
1051 	 * Must be at end to avoid SMP races, caller holds object token
1052 	 */
1053 	if ((++info->count & 63) == 0)
1054 		lwkt_user_yield();
1055 	return(0);
1056 }
1057 
1058 /*
1059  * The caller must hold the object
1060  */
1061 static
1062 int
1063 vm_object_page_clean_pass2(struct vm_page *p, void *data)
1064 {
1065 	struct rb_vm_page_scan_info *info = data;
1066 	int generation;
1067 
1068 	KKASSERT(p->object == info->object);
1069 
1070 	/*
1071 	 * Do not mess with pages that were inserted after we started
1072 	 * the cleaning pass.
1073 	 */
1074 	if ((p->flags & PG_CLEANCHK) == 0)
1075 		goto done;
1076 
1077 	generation = info->object->generation;
1078 
1079 	if (vm_page_busy_try(p, TRUE)) {
1080 		vm_page_sleep_busy(p, TRUE, "vpcwai");
1081 		info->error = 1;
1082 		goto done;
1083 	}
1084 
1085 	KKASSERT(p->object == info->object &&
1086 		 info->object->generation == generation);
1087 
1088 	/*
1089 	 * Before wasting time traversing the pmaps, check for trivial
1090 	 * cases where the page cannot be dirty.
1091 	 */
1092 	if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) {
1093 		KKASSERT((p->dirty & p->valid) == 0 &&
1094 			 (p->flags & PG_NEED_COMMIT) == 0);
1095 		vm_page_wakeup(p);
1096 		goto done;
1097 	}
1098 
1099 	/*
1100 	 * Check whether the page is dirty or not.  The page has been set
1101 	 * to be read-only so the check will not race a user dirtying the
1102 	 * page.
1103 	 */
1104 	vm_page_test_dirty(p);
1105 	if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) {
1106 		vm_page_flag_clear(p, PG_CLEANCHK);
1107 		vm_page_wakeup(p);
1108 		goto done;
1109 	}
1110 
1111 	/*
1112 	 * If we have been asked to skip nosync pages and this is a
1113 	 * nosync page, skip it.  Note that the object flags were
1114 	 * not cleared in this case (because pass1 will have returned an
1115 	 * error), so we do not have to set them.
1116 	 */
1117 	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1118 		vm_page_flag_clear(p, PG_CLEANCHK);
1119 		vm_page_wakeup(p);
1120 		goto done;
1121 	}
1122 
1123 	/*
1124 	 * Flush as many pages as we can.  PG_CLEANCHK will be cleared on
1125 	 * the pages that get successfully flushed.  Set info->error if
1126 	 * we raced an object modification.
1127 	 */
1128 	vm_object_page_collect_flush(info->object, p, info->pagerflags);
1129 	/* vm_wait_nominal(); this can deadlock the system in syncer/pageout */
1130 
1131 	/*
1132 	 * Must be at end to avoid SMP races, caller holds object token
1133 	 */
1134 done:
1135 	if ((++info->count & 63) == 0)
1136 		lwkt_user_yield();
1137 	return(0);
1138 }
1139 
1140 /*
1141  * Collect the specified page and nearby pages and flush them out.
1142  * The passed page is busied by the caller and we are responsible for its
1143  * disposition.
1144  *
1145  * The caller must hold the object.
1146  */
1147 static void
1148 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
1149 {
1150 	int error;
1151 	int is;
1152 	int ib;
1153 	int i;
1154 	int page_base;
1155 	vm_pindex_t pi;
1156 	vm_page_t ma[BLIST_MAX_ALLOC];
1157 
1158 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1159 
1160 	pi = p->pindex;
1161 	page_base = pi % BLIST_MAX_ALLOC;
1162 	ma[page_base] = p;
1163 	ib = page_base - 1;
1164 	is = page_base + 1;
1165 
1166 	while (ib >= 0) {
1167 		vm_page_t tp;
1168 
1169 		tp = vm_page_lookup_busy_try(object, pi - page_base + ib,
1170 					     TRUE, &error);
1171 		if (error)
1172 			break;
1173 		if (tp == NULL)
1174 			break;
1175 		if ((pagerflags & OBJPC_IGNORE_CLEANCHK) == 0 &&
1176 		    (tp->flags & PG_CLEANCHK) == 0) {
1177 			vm_page_wakeup(tp);
1178 			break;
1179 		}
1180 		if ((tp->queue - tp->pc) == PQ_CACHE) {
1181 			vm_page_flag_clear(tp, PG_CLEANCHK);
1182 			vm_page_wakeup(tp);
1183 			break;
1184 		}
1185 		vm_page_test_dirty(tp);
1186 		if ((tp->dirty & tp->valid) == 0 &&
1187 		    (tp->flags & PG_NEED_COMMIT) == 0) {
1188 			vm_page_flag_clear(tp, PG_CLEANCHK);
1189 			vm_page_wakeup(tp);
1190 			break;
1191 		}
1192 		ma[ib] = tp;
1193 		--ib;
1194 	}
1195 	++ib;	/* fixup */
1196 
1197 	while (is < BLIST_MAX_ALLOC &&
1198 	       pi - page_base + is < object->size) {
1199 		vm_page_t tp;
1200 
1201 		tp = vm_page_lookup_busy_try(object, pi - page_base + is,
1202 					     TRUE, &error);
1203 		if (error)
1204 			break;
1205 		if (tp == NULL)
1206 			break;
1207 		if ((pagerflags & OBJPC_IGNORE_CLEANCHK) == 0 &&
1208 		    (tp->flags & PG_CLEANCHK) == 0) {
1209 			vm_page_wakeup(tp);
1210 			break;
1211 		}
1212 		if ((tp->queue - tp->pc) == PQ_CACHE) {
1213 			vm_page_flag_clear(tp, PG_CLEANCHK);
1214 			vm_page_wakeup(tp);
1215 			break;
1216 		}
1217 		vm_page_test_dirty(tp);
1218 		if ((tp->dirty & tp->valid) == 0 &&
1219 		    (tp->flags & PG_NEED_COMMIT) == 0) {
1220 			vm_page_flag_clear(tp, PG_CLEANCHK);
1221 			vm_page_wakeup(tp);
1222 			break;
1223 		}
1224 		ma[is] = tp;
1225 		++is;
1226 	}
1227 
1228 	/*
1229 	 * All pages in the ma[] array are busied now
1230 	 */
1231 	for (i = ib; i < is; ++i) {
1232 		vm_page_flag_clear(ma[i], PG_CLEANCHK);
1233 		vm_page_hold(ma[i]);	/* XXX need this any more? */
1234 	}
1235 	vm_pageout_flush(&ma[ib], is - ib, pagerflags);
1236 	for (i = ib; i < is; ++i)	/* XXX need this any more? */
1237 		vm_page_unhold(ma[i]);
1238 }
1239 
1240 /*
1241  * Implements the madvise function at the object/page level.
1242  *
1243  * MADV_WILLNEED	(any object)
1244  *
1245  *	Activate the specified pages if they are resident.
1246  *
1247  * MADV_DONTNEED	(any object)
1248  *
1249  *	Deactivate the specified pages if they are resident.
1250  *
1251  * MADV_FREE	(OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only)
1252  *
1253  *	Deactivate and clean the specified pages if they are
1254  *	resident.  This permits the process to reuse the pages
1255  *	without faulting or the kernel to reclaim the pages
1256  *	without I/O.
1257  *
1258  * No requirements.
1259  */
1260 void
1261 vm_object_madvise(vm_object_t object, vm_pindex_t pindex,
1262 		  vm_pindex_t count, int advise)
1263 {
1264 	vm_pindex_t end;
1265 	vm_page_t m;
1266 	int error;
1267 
1268 	if (object == NULL)
1269 		return;
1270 
1271 	end = pindex + count;
1272 
1273 	vm_object_hold(object);
1274 
1275 	/*
1276 	 * Locate and adjust resident pages.  This only applies to the
1277 	 * primary object in the mapping.
1278 	 */
1279 	for (; pindex < end; pindex += 1) {
1280 relookup:
1281 		/*
1282 		 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
1283 		 * and those pages must be OBJ_ONEMAPPING.
1284 		 */
1285 		if (advise == MADV_FREE) {
1286 			if ((object->type != OBJT_DEFAULT &&
1287 			     object->type != OBJT_SWAP) ||
1288 			    (object->flags & OBJ_ONEMAPPING) == 0) {
1289 				continue;
1290 			}
1291 		}
1292 
1293 		m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
1294 
1295 		if (error) {
1296 			vm_page_sleep_busy(m, TRUE, "madvpo");
1297 			goto relookup;
1298 		}
1299 		if (m == NULL) {
1300 			/*
1301 			 * There may be swap even if there is no backing page
1302 			 */
1303 			if (advise == MADV_FREE && object->type == OBJT_SWAP)
1304 				swap_pager_freespace(object, pindex, 1);
1305 			continue;
1306 		}
1307 
1308 		/*
1309 		 * If the page is not in a normal active state, we skip it.
1310 		 * If the page is not managed there are no page queues to
1311 		 * mess with.  Things can break if we mess with pages in
1312 		 * any of the below states.
1313 		 */
1314 		if (m->wire_count ||
1315 		    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED |
1316 				 PG_NEED_COMMIT)) ||
1317 		    m->valid != VM_PAGE_BITS_ALL
1318 		) {
1319 			vm_page_wakeup(m);
1320 			continue;
1321 		}
1322 
1323 		/*
1324 		 * Theoretically once a page is known not to be busy, an
1325 		 * interrupt cannot come along and rip it out from under us.
1326 		 */
1327 		if (advise == MADV_WILLNEED) {
1328 			vm_page_activate(m);
1329 		} else if (advise == MADV_DONTNEED) {
1330 			vm_page_dontneed(m);
1331 		} else if (advise == MADV_FREE) {
1332 			/*
1333 			 * Mark the page clean.  This will allow the page
1334 			 * to be freed up by the system.  However, such pages
1335 			 * are often reused quickly by malloc()/free()
1336 			 * so we do not do anything that would cause
1337 			 * a page fault if we can help it.
1338 			 *
1339 			 * Specifically, we do not try to actually free
1340 			 * the page now nor do we try to put it in the
1341 			 * cache (which would cause a page fault on reuse).
1342 			 *
1343 			 * But we do make the page as freeable as we
1344 			 * can without actually taking the step of unmapping
1345 			 * it.
1346 			 */
1347 			pmap_clear_modify(m);
1348 			m->dirty = 0;
1349 			m->act_count = 0;
1350 			vm_page_dontneed(m);
1351 			if (object->type == OBJT_SWAP)
1352 				swap_pager_freespace(object, pindex, 1);
1353 		}
1354 		vm_page_wakeup(m);
1355 	}
1356 	vm_object_drop(object);
1357 }
1358 
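/*
 * Illustrative sketch (not original code): a hypothetical madvise(2)
 * backend converting a byte range into page indices before calling the
 * function above.  'offset' and 'len' are placeholders.
 */
#if 0
	vm_object_madvise(object, OFF_TO_IDX(offset),
			  OFF_TO_IDX(round_page(len)), MADV_FREE);
#endif
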
1359 /*
1360  * Removes all physical pages in the specified object range from the
1361  * object's list of pages.
1362  *
1363  * No requirements.
1364  */
1365 static int vm_object_page_remove_callback(vm_page_t p, void *data);
1366 
1367 void
1368 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
1369 		      boolean_t clean_only)
1370 {
1371 	struct rb_vm_page_scan_info info;
1372 	int all;
1373 
1374 	/*
1375 	 * Degenerate cases and assertions.
1376 	 *
1377 	 * NOTE: Don't shortcut on resident_page_count for MGTDEVICE objects.
1378 	 *	 These objects do not have to have their pages entered into
1379 	 *	 them and are handled via their vm_map_backing lists.
1380 	 */
1381 	vm_object_hold(object);
1382 	if (object == NULL ||
1383 	    (object->type != OBJT_MGTDEVICE &&
1384 	     object->resident_page_count == 0 && object->swblock_count == 0)) {
1385 		vm_object_drop(object);
1386 		return;
1387 	}
1388 	KASSERT(object->type != OBJT_PHYS,
1389 		("attempt to remove pages from a physical object"));
1390 
1391 	/*
1392 	 * Indicate that paging is occurring on the object
1393 	 */
1394 	vm_object_pip_add(object, 1);
1395 
1396 	/*
1397 	 * Figure out the actual removal range and whether we are removing
1398 	 * the entire contents of the object or not.  If removing the entire
1399 	 * contents, be sure to get all pages, even those that might be
1400 	 * beyond the end of the object.
1401 	 *
1402 	 * NOTE: end is non-inclusive, but info.end_pindex is inclusive.
1403 	 */
1404 	info.object = object;
1405 	info.start_pindex = start;
1406 	if (end == 0 || end == (vm_pindex_t)-1) {
1407 		info.end_pindex = (vm_pindex_t)-1;
1408 		end = object->size;
1409 	} else {
1410 		info.end_pindex = end - 1;
1411 	}
1412 	info.limit = clean_only;
1413 	info.count = 0;
1414 	all = (start == 0 && info.end_pindex >= object->size - 1);
1415 
1416 	/*
1417 	 * Efficiently remove pages from the pmap via a backing scan.
1418 	 *
1419 	 * NOTE: This is the only way pages can be removed and unwired
1420 	 *	 from OBJT_MGTDEVICE devices which typically do not enter
1421 	 *	 their pages into the vm_object's RB tree.  And possibly
1422 	 *	 other OBJT_* types in the future.
1423 	 */
1424 	{
1425 		vm_map_backing_t ba;
1426 		vm_pindex_t sba, eba;
1427 		vm_offset_t sva, eva;
1428 
1429 		lockmgr(&object->backing_lk, LK_EXCLUSIVE);
1430 		TAILQ_FOREACH(ba, &object->backing_list, entry) {
1431 			/*
1432 			 * object offset range within the ba, intersected
1433 			 * with the page range specified for the object
1434 			 */
1435 			sba = OFF_TO_IDX(ba->offset);
1436 			eba = sba + OFF_TO_IDX(ba->end - ba->start);
1437 			if (sba < start)
1438 				sba = start;
1439 			if (eba > end)
1440 				eba = end;
1441 
1442 			/*
1443 			 * If the intersection is valid, remove the related
1444 			 * pages.
1445 			 *
1446 			 * NOTE! This may also remove other incidental pages
1447 			 *	 in the pmap, as the backing area may be
1448 			 *	 overloaded.
1449 			 *
1450 			 * NOTE! pages for MGTDEVICE objects are only removed
1451 			 *	 here, they aren't entered into rb_memq, so
1452 			 *	 we must use pmap_remove() instead of
1453 			 *	 the non-TLB-invalidating pmap_remove_pages().
1454 			 */
1455 			if (sba < eba) {
1456 				sva = ba->start + IDX_TO_OFF(sba) - ba->offset;
1457 				eva = sva + IDX_TO_OFF(eba - sba);
1458 #if 0
1459 				kprintf("VM_OBJECT_PAGE_REMOVE "
1460 					"%p[%016jx] %016jx-%016jx\n",
1461 					ba->pmap, ba->start, sva, eva);
1462 #endif
1463 				pmap_remove(ba->pmap, sva, eva);
1464 			}
1465 		}
1466 		lockmgr(&object->backing_lk, LK_RELEASE);
1467 	}
1468 
1469 	/*
1470 	 * Remove and free pages entered onto the object list.  Note that
1471 	 * for OBJT_MGTDEVICE objects, there are typically no pages entered.
1472 	 *
1473 	 * Loop until we are sure we have gotten them all.
1474 	 */
1475 	do {
1476 		info.error = 0;
1477 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1478 					vm_object_page_remove_callback, &info);
1479 	} while (info.error);
1480 
1481 	/*
1482 	 * Remove any related swap if throwing away pages, or for
1483 	 * non-swap objects (the swap is a clean copy in that case).
1484 	 */
1485 	if (object->type != OBJT_SWAP || clean_only == FALSE) {
1486 		if (all)
1487 			swap_pager_freespace_all(object);
1488 		else
1489 			swap_pager_freespace(object, info.start_pindex,
1490 			     info.end_pindex - info.start_pindex + 1);
1491 	}
1492 
1493 	/*
1494 	 * Cleanup
1495 	 */
1496 	vm_object_pip_wakeup(object);
1497 	vm_object_drop(object);
1498 }
1499 
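/*
 * Illustrative sketch (not original code): truncating a hypothetical
 * object down to 'newpages' pages discards everything from that index to
 * the end of the object (end == 0 selects the entire remainder):
 */
#if 0
	vm_object_page_remove(object, newpages, 0, FALSE);
#endif
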
1500 /*
1501  * The caller must hold the object.
1502  *
1503  * NOTE: User yields are allowed when removing more than one page, but not
1504  *	 allowed if only removing one page (the path for single page removals
1505  *	 might hold a spinlock).
1506  */
1507 static int
1508 vm_object_page_remove_callback(vm_page_t p, void *data)
1509 {
1510 	struct rb_vm_page_scan_info *info = data;
1511 
1512 	if (info->object != p->object ||
1513 	    p->pindex < info->start_pindex ||
1514 	    p->pindex > info->end_pindex) {
1515 		kprintf("vm_object_page_remove_callbackA: obj/pg race %p/%p\n",
1516 			info->object, p);
1517 		return(0);
1518 	}
1519 	if (vm_page_busy_try(p, TRUE)) {
1520 		vm_page_sleep_busy(p, TRUE, "vmopar");
1521 		info->error = 1;
1522 		return(0);
1523 	}
1524 	if (info->object != p->object) {
1525 		/* this should never happen */
1526 		kprintf("vm_object_page_remove_callbackB: obj/pg race %p/%p\n",
1527 			info->object, p);
1528 		vm_page_wakeup(p);
1529 		return(0);
1530 	}
1531 
1532 	/*
1533 	 * Wired pages cannot be destroyed, but they can be invalidated
1534 	 * and we do so if clean_only (limit) is not set.
1535 	 *
1536 	 * WARNING!  The page may be wired due to being part of a buffer
1537 	 *	     cache buffer, and the buffer might be marked B_CACHE.
1538 	 *	     This is fine as part of a truncation but VFSs must be
1539 	 *	     sure to fix the buffer up when re-extending the file.
1540 	 *
1541 	 * NOTE!     PG_NEED_COMMIT is ignored.
1542 	 */
1543 	if (p->wire_count != 0) {
1544 		vm_page_protect(p, VM_PROT_NONE);
1545 		if (info->limit == 0)
1546 			p->valid = 0;
1547 		vm_page_wakeup(p);
1548 		goto done;
1549 	}
1550 
1551 	/*
1552 	 * limit is our clean_only flag.  If set and the page is dirty or
1553 	 * requires a commit, do not free it.  If set and the page is being
1554 	 * held by someone, do not free it.
1555 	 */
1556 	if (info->limit && p->valid) {
1557 		vm_page_test_dirty(p);
1558 		if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
1559 			vm_page_wakeup(p);
1560 			goto done;
1561 		}
1562 	}
1563 
1564 	/*
1565 	 * Destroy the page.  But we have to re-test whether it's dirty after
1566 	 * removing it from its pmaps.
1567 	 */
1568 	vm_page_protect(p, VM_PROT_NONE);
1569 	if (info->limit && p->valid) {
1570 		vm_page_test_dirty(p);
1571 		if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
1572 			vm_page_wakeup(p);
1573 			goto done;
1574 		}
1575 	}
1576 	vm_page_free(p);
1577 
1578 	/*
1579 	 * Must be at end to avoid SMP races, caller holds object token
1580 	 */
1581 done:
1582 	if ((++info->count & 63) == 0)
1583 		lwkt_user_yield();
1584 
1585 	return(0);
1586 }
1587 
1588 /*
1589  * Try to extend prev_object into an adjoining region of virtual
1590  * memory, return TRUE on success.
1591  *
1592  * The caller does not need to hold (prev_object) but must have a stable
1593  * pointer to it (typically by holding the vm_map locked).
1594  *
1595  * This function only works for anonymous memory objects which either
1596  * have (a) one reference or (b) we are extending the object's size.
1597  * Otherwise the related VM pages we want to use for the object might
1598  * be in use by another mapping.
1599  */
1600 boolean_t
1601 vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
1602 		   vm_size_t prev_size, vm_size_t next_size)
1603 {
1604 	vm_pindex_t next_pindex;
1605 
1606 	if (prev_object == NULL)
1607 		return (TRUE);
1608 
1609 	vm_object_hold(prev_object);
1610 
1611 	if (prev_object->type != OBJT_DEFAULT &&
1612 	    prev_object->type != OBJT_SWAP) {
1613 		vm_object_drop(prev_object);
1614 		return (FALSE);
1615 	}
1616 
1617 #if 0
1618 	/* caller now checks this */
1619 	/*
1620 	 * Try to collapse the object first
1621 	 */
1622 	vm_object_collapse(prev_object, NULL);
1623 #endif
1624 
1625 #if 0
1626 	/* caller now checks this */
1627 	/*
1628 	 * We can't coalesce if we shadow another object (figuring out the
1629 	 * relationships becomes too complex).
1630 	 */
1631 	if (prev_object->backing_object != NULL) {
1632 		vm_object_chain_release(prev_object);
1633 		vm_object_drop(prev_object);
1634 		return (FALSE);
1635 	}
1636 #endif
1637 
1638 	prev_size >>= PAGE_SHIFT;
1639 	next_size >>= PAGE_SHIFT;
1640 	next_pindex = prev_pindex + prev_size;
1641 
1642 	/*
1643 	 * We can't coalesce if the object has more than one reference unless
1644 	 * we are extending it into newly minted space.
1645 	 */
1646 	if (prev_object->ref_count > 1 &&
1647 	    prev_object->size != next_pindex) {
1648 		vm_object_drop(prev_object);
1649 		return (FALSE);
1650 	}
1651 
1652 	/*
1653 	 * Remove any pages that may still be in the object from a previous
1654 	 * deallocation.
1655 	 */
1656 	if (next_pindex < prev_object->size) {
1657 		vm_object_page_remove(prev_object,
1658 				      next_pindex,
1659 				      next_pindex + next_size, FALSE);
1660 		if (prev_object->type == OBJT_SWAP)
1661 			swap_pager_freespace(prev_object,
1662 					     next_pindex, next_size);
1663 	}
1664 
1665 	/*
1666 	 * Extend the object if necessary.
1667 	 */
1668 	if (next_pindex + next_size > prev_object->size)
1669 		prev_object->size = next_pindex + next_size;
1670 	vm_object_drop(prev_object);
1671 
1672 	return (TRUE);
1673 }
1674 
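/*
 * Illustrative sketch (not original code): extending an existing anonymous
 * mapping.  If the object can absorb the new range the caller reuses
 * prev_object instead of allocating a second object.  Names are
 * placeholders.
 */
#if 0
	if (vm_object_coalesce(prev_object, prev_pindex,
			       prev_size, next_size)) {
		/* extend the map entry over the new range */
	} else {
		/* allocate a fresh object for the new range */
	}
#endif
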
1675 /*
1676  * Make the object writable and flag it as possibly being dirty.
1677  *
1678  * The object might not be held (or might be held but held shared),
1679  * the related vnode is probably not held either.  Object and vnode are
1680  * stable by virtue of the vm_page busied by the caller preventing
1681  * destruction.
1682  *
1683  * If the related mount is flagged MNTK_THR_SYNC we need to call
1684  * vsetobjdirty().  Filesystems using this option usually shortcut
1685  * synchronization by only scanning the syncer list.
1686  */
1687 void
1688 vm_object_set_writeable_dirty(vm_object_t object)
1689 {
1690 	struct vnode *vp;
1691 
1692 	/*vm_object_assert_held(object);*/
1693 	/*
1694 	 * Avoid contention in vm fault path by checking the state before
1695 	 * issuing an atomic op on it.
1696 	 */
1697 	if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) !=
1698 	    (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) {
1699 		vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
1700 	}
1701 	if (object->type == OBJT_VNODE &&
1702 	    (vp = (struct vnode *)object->handle) != NULL) {
1703 		if ((vp->v_flag & VOBJDIRTY) == 0) {
1704 			if (vp->v_mount &&
1705 			    (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) {
1706 				/*
1707 				 * New style THR_SYNC places vnodes on the
1708 				 * syncer list more deterministically.
1709 				 */
1710 				vsetobjdirty(vp);
1711 			} else {
1712 				/*
1713 				 * Old style scan would not necessarily place
1714 				 * a vnode on the syncer list when possibly
1715 				 * modified via mmap.
1716 				 */
1717 				vsetflags(vp, VOBJDIRTY);
1718 			}
1719 		}
1720 	}
1721 }
1722 
1723 #include "opt_ddb.h"
1724 #ifdef DDB
1725 #include <sys/cons.h>
1726 
1727 #include <ddb/ddb.h>
1728 
1729 static int	_vm_object_in_map (vm_map_t map, vm_object_t object,
1730 				       vm_map_entry_t entry);
1731 static int	vm_object_in_map (vm_object_t object);
1732 
1733 /*
1734  * The caller must hold the object.
1735  */
1736 static int
1737 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
1738 {
1739 	vm_map_backing_t ba;
1740 	vm_map_t tmpm;
1741 	vm_map_entry_t tmpe;
1742 	int entcount;
1743 
1744 	if (map == NULL)
1745 		return 0;
1746 	if (entry == NULL) {
1747 		tmpe = RB_MIN(vm_map_rb_tree, &map->rb_root);
1748 		entcount = map->nentries;
1749 		while (entcount-- && tmpe) {
1750 			if( _vm_object_in_map(map, object, tmpe)) {
1751 				return 1;
1752 			}
1753 			tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1754 		}
1755 		return (0);
1756 	}
1757 	switch(entry->maptype) {
1758 	case VM_MAPTYPE_SUBMAP:
1759 		tmpm = entry->ba.sub_map;
1760 		tmpe = RB_MIN(vm_map_rb_tree, &tmpm->rb_root);
1761 		entcount = tmpm->nentries;
1762 		while (entcount-- && tmpe) {
1763 			if( _vm_object_in_map(tmpm, object, tmpe)) {
1764 				return 1;
1765 			}
1766 			tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1767 		}
1768 		break;
1769 	case VM_MAPTYPE_NORMAL:
1770 		ba = &entry->ba;
1771 		while (ba) {
1772 			if (ba->object == object)
1773 				return TRUE;
1774 			ba = ba->backing_ba;
1775 		}
1776 		break;
1777 	default:
1778 		break;
1779 	}
1780 	return 0;
1781 }
1782 
1783 static int vm_object_in_map_callback(struct proc *p, void *data);
1784 
1785 struct vm_object_in_map_info {
1786 	vm_object_t object;
1787 	int rv;
1788 };
1789 
1790 /*
1791  * Debugging only
1792  */
1793 static int
1794 vm_object_in_map(vm_object_t object)
1795 {
1796 	struct vm_object_in_map_info info;
1797 
1798 	info.rv = 0;
1799 	info.object = object;
1800 
1801 	allproc_scan(vm_object_in_map_callback, &info, 0);
1802 	if (info.rv)
1803 		return 1;
1804 	if( _vm_object_in_map(kernel_map, object, 0))
1805 		return 1;
1806 	if( _vm_object_in_map(pager_map, object, 0))
1807 		return 1;
1808 	if( _vm_object_in_map(buffer_map, object, 0))
1809 		return 1;
1810 	return 0;
1811 }
1812 
1813 /*
1814  * Debugging only
1815  */
1816 static int
1817 vm_object_in_map_callback(struct proc *p, void *data)
1818 {
1819 	struct vm_object_in_map_info *info = data;
1820 
1821 	if (p->p_vmspace) {
1822 		if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) {
1823 			info->rv = 1;
1824 			return -1;
1825 		}
1826 	}
1827 	return (0);
1828 }
1829 
1830 DB_SHOW_COMMAND(vmochk, vm_object_check)
1831 {
1832 	struct vm_object_hash *hash;
1833 	vm_object_t object;
1834 	int n;
1835 
1836 	/*
1837 	 * make sure that internal objs are in a map somewhere
1838 	 * and none have zero ref counts.
1839 	 */
1840 	for (n = 0; n < VMOBJ_HSIZE; ++n) {
1841 		hash = &vm_object_hash[n];
1842 		for (object = TAILQ_FIRST(&hash->list);
1843 				object != NULL;
1844 				object = TAILQ_NEXT(object, object_entry)) {
1845 			if (object->type == OBJT_MARKER)
1846 				continue;
1847 			if (object->handle != NULL ||
1848 			    (object->type != OBJT_DEFAULT &&
1849 			     object->type != OBJT_SWAP)) {
1850 				continue;
1851 			}
1852 			if (object->ref_count == 0) {
1853 				db_printf("vmochk: internal obj has "
1854 					  "zero ref count: %ld\n",
1855 					  (long)object->size);
1856 			}
1857 			if (vm_object_in_map(object))
1858 				continue;
1859 			db_printf("vmochk: internal obj is not in a map: "
1860 				  "ref: %d, size: %lu: 0x%lx\n",
1861 				  object->ref_count, (u_long)object->size,
1862 				  (u_long)object->size);
1863 		}
1864 	}
1865 }
1866 
1867 /*
1868  * Debugging only
1869  */
1870 DB_SHOW_COMMAND(object, vm_object_print_static)
1871 {
1872 	/* XXX convert args. */
1873 	vm_object_t object = (vm_object_t)addr;
1874 	boolean_t full = have_addr;
1875 
1876 	vm_page_t p;
1877 
1878 	/* XXX count is an (unused) arg.  Avoid shadowing it. */
1879 #define	count	was_count
1880 
1881 	int count;
1882 
1883 	if (object == NULL)
1884 		return;
1885 
1886 	db_iprintf(
1887 	    "Object %p: type=%d, size=0x%lx, res=%ld, ref=%d, flags=0x%x\n",
1888 	    object, (int)object->type, (u_long)object->size,
1889 	    object->resident_page_count, object->ref_count, object->flags);
1890 	/*
1891 	 * XXX no %qd in kernel.  Truncate object->backing_object_offset.
1892 	 */
1893 	db_iprintf("\n");
1894 
1895 	if (!full)
1896 		return;
1897 
1898 	db_indent += 2;
1899 	count = 0;
1900 	RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) {
1901 		if (count == 0)
1902 			db_iprintf("memory:=");
1903 		else if (count == 6) {
1904 			db_printf("\n");
1905 			db_iprintf(" ...");
1906 			count = 0;
1907 		} else
1908 			db_printf(",");
1909 		count++;
1910 
1911 		db_printf("(off=0x%lx,page=0x%lx)",
1912 		    (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
1913 	}
1914 	if (count != 0)
1915 		db_printf("\n");
1916 	db_indent -= 2;
1917 }
1918 
1919 /* XXX. */
1920 #undef count
1921 
1922 /*
1923  * XXX need this non-static entry for calling from vm_map_print.
1924  *
1925  * Debugging only
1926  */
1927 void
1928 vm_object_print(/* db_expr_t */ long addr,
1929 		boolean_t have_addr,
1930 		/* db_expr_t */ long count,
1931 		char *modif)
1932 {
1933 	vm_object_print_static(addr, have_addr, count, modif);
1934 }
1935 
1936 /*
1937  * Debugging only
1938  */
1939 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
1940 {
1941 	struct vm_object_hash *hash;
1942 	vm_object_t object;
1943 	int nl = 0;
1944 	int c;
1945 	int n;
1946 
1947 	for (n = 0; n < VMOBJ_HSIZE; ++n) {
1948 		hash = &vm_object_hash[n];
1949 		for (object = TAILQ_FIRST(&hash->list);
1950 				object != NULL;
1951 				object = TAILQ_NEXT(object, object_entry)) {
1952 			vm_pindex_t idx, fidx;
1953 			vm_pindex_t osize;
1954 			vm_paddr_t pa = -1, padiff;
1955 			int rcount;
1956 			vm_page_t m;
1957 
1958 			if (object->type == OBJT_MARKER)
1959 				continue;
1960 			db_printf("new object: %p\n", (void *)object);
1961 			if ( nl > 18) {
1962 				c = cngetc();
1963 				if (c != ' ')
1964 					return;
1965 				nl = 0;
1966 			}
1967 			nl++;
1968 			rcount = 0;
1969 			fidx = 0;
1970 			osize = object->size;
1971 			if (osize > 128)
1972 				osize = 128;
1973 			for (idx = 0; idx < osize; idx++) {
1974 				m = vm_page_lookup(object, idx);
1975 				if (m == NULL) {
1976 					if (rcount) {
1977 						db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
1978 							(long)fidx, rcount, (long)pa);
1979 						if ( nl > 18) {
1980 							c = cngetc();
1981 							if (c != ' ')
1982 								return;
1983 							nl = 0;
1984 						}
1985 						nl++;
1986 						rcount = 0;
1987 					}
1988 					continue;
1989 				}
1990 
1991 				if (rcount &&
1992 					(VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
1993 					++rcount;
1994 					continue;
1995 				}
1996 				if (rcount) {
1997 					padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m);
1998 					padiff >>= PAGE_SHIFT;
1999 					padiff &= PQ_L2_MASK;
2000 					if (padiff == 0) {
2001 						pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE;
2002 						++rcount;
2003 						continue;
2004 					}
2005 					db_printf(" index(%ld)run(%d)pa(0x%lx)",
2006 						(long)fidx, rcount, (long)pa);
2007 					db_printf("pd(%ld)\n", (long)padiff);
2008 					if ( nl > 18) {
2009 						c = cngetc();
2010 						if (c != ' ')
2011 							return;
2012 						nl = 0;
2013 					}
2014 					nl++;
2015 				}
2016 				fidx = idx;
2017 				pa = VM_PAGE_TO_PHYS(m);
2018 				rcount = 1;
2019 			}
2020 			if (rcount) {
2021 				db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
2022 					(long)fidx, rcount, (long)pa);
2023 				if ( nl > 18) {
2024 					c = cngetc();
2025 					if (c != ' ')
2026 						return;
2027 					nl = 0;
2028 				}
2029 				nl++;
2030 			}
2031 		}
2032 	}
2033 }
2034 #endif /* DDB */
2035